From 067bfb884406d05722edec2d4fef82c2b65844ec Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 9 Mar 2023 17:47:14 +0000 Subject: [PATCH 0001/1072] Add new function generateRandomStructure --- .../functions/other-functions.md | 70 ++++ .../sql-reference/table-functions/generate.md | 14 + src/DataTypes/NestedUtils.cpp | 15 +- src/DataTypes/NestedUtils.h | 3 + src/Functions/generateRandomStructure.cpp | 343 ++++++++++++++++++ src/Storages/StorageGenerateRandom.cpp | 4 +- .../TableFunctionGenerateRandom.cpp | 6 +- .../02584_compressor_codecs.reference | 14 + .../0_stateless/02584_compressor_codecs.sh | 34 ++ .../02586_generate_random_structure.reference | 4 + .../02586_generate_random_structure.sql | 10 + 11 files changed, 512 insertions(+), 5 deletions(-) create mode 100644 src/Functions/generateRandomStructure.cpp create mode 100644 tests/queries/0_stateless/02584_compressor_codecs.reference create mode 100755 tests/queries/0_stateless/02584_compressor_codecs.sh create mode 100644 tests/queries/0_stateless/02586_generate_random_structure.reference create mode 100644 tests/queries/0_stateless/02586_generate_random_structure.sql diff --git a/docs/en/sql-reference/functions/other-functions.md b/docs/en/sql-reference/functions/other-functions.md index 7146484361e..2f46df07b0a 100644 --- a/docs/en/sql-reference/functions/other-functions.md +++ b/docs/en/sql-reference/functions/other-functions.md @@ -2609,3 +2609,73 @@ Result: │ 286 │ └──────────────────────────┘ ``` + +## generateRandomStructure + +Generates random table structure in a format `column1_name column1_type, column2_name column2_type, ...`. + +**Syntax** + +``` sql +generateRandomStructure([number_of_columns, seed]) +``` + +**Arguments** + +- `number_of_columns` — The desired number of columns in the result table structure. If set to 0, the number of columns will be random from 1 to 128. Default value - 0. +- `seed` - Random seed to produce stable results. If seed is not specified, it is randomly generated. + +All arguments must be constant. + +**Returned value** + +- Randomly generated table structure. + +Type: [String](../../sql-reference/data-types/string.md). + +**Examples** + +Query: + +``` sql +SELECT generateRandomStructure() +``` + +Result: + +``` text +┌─generateRandomStructure()─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ +│ c1 Decimal32(5), c2 Date, c3 Tuple(LowCardinality(String), Int128, UInt64, UInt16, UInt8, IPv6), c4 Array(UInt128), c5 UInt32, c6 IPv4, c7 Decimal256(64), c8 Decimal128(3), c9 UInt256, c10 UInt64, c11 DateTime │ +└───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ +``` + +Query: + +``` sql +SELECT generateRandomStructure(1) +``` + +Result: + +``` text +┌─generateRandomStructure(1)─┐ +│ c1 Map(UInt256, UInt16) │ +└────────────────────────────┘ +``` + +Query: + +``` sql +SELECT generateRandomStructure(0, 11) +``` + +Result: + +``` text +┌─generateRandomStructure(0, 11)──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ +│ c1 Date32, c2 String, c3 IPv6, c4 DateTime, c5 UInt16, c6 Tuple(e1 UInt32, e2 Date, e3 Date, e4 IPv6, e5 Nested(e1 DateTime, e2 FixedString(110), e3 Int256, e4 Array(Decimal64(4)), e5 Decimal128(18), e6 Enum16('v0' = 0, 'v1' = 1, 'v2' = 2, 'v3' = 3, 'v4' = 4)), e6 DateTime64(4)), c7 DateTime, c8 DateTime64(6), c9 Bool │ +└─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ +``` + +This function can be used together with [generateRandom](../../sql-reference/table-functions/generate.md) to generate completely random tables. + diff --git a/docs/en/sql-reference/table-functions/generate.md b/docs/en/sql-reference/table-functions/generate.md index b53ccdd42b5..7e39786c256 100644 --- a/docs/en/sql-reference/table-functions/generate.md +++ b/docs/en/sql-reference/table-functions/generate.md @@ -53,5 +53,19 @@ SELECT * FROM random; └──────────────────────────────┴──────────────┴────────────────────────────────────────────────────────────────────┘ ``` +In combination with [generateRandomStructure](../../sql-reference/functions/other-functions.md#generateRandomStructure): + +```sql +SELECT * FROM generateRandom(generateRandomStructure(3, 24), 24) LIMIT 3; +``` + +```text +┌─────────────────────────c1─┬─────c2─┬───────────────────c3─┬───────────────────────────────────────c4─┐ +│ 2085-07-05 23:48:43.345759 │ -20656 │ 1632406185424686785 │ -210464718903845545171230673454802.15238 │ +│ 1971-07-17 16:32:36.390777 │ -27071 │ -1553021742787219162 │ 1095158319964381336405161704296125.08074 │ +│ 2024-02-19 13:14:32.902513 │ 24913 │ 7727442383333447640 │ 1090748832613398997057187200834127.07109 │ +└────────────────────────────┴────────┴──────────────────────┴──────────────────────────────────────────┘ +``` + ## Related content - Blog: [Generating random data in ClickHouse](https://clickhouse.com/blog/generating-random-test-distribution-data-for-clickhouse) diff --git a/src/DataTypes/NestedUtils.cpp b/src/DataTypes/NestedUtils.cpp index f029ac6ba27..9ee803c4235 100644 --- a/src/DataTypes/NestedUtils.cpp +++ b/src/DataTypes/NestedUtils.cpp @@ -71,7 +71,7 @@ std::string extractTableName(const std::string & nested_name) } -Block flatten(const Block & block) +static Block flattenImpl(const Block & block, bool flatten_named_tuple) { Block res; @@ -114,7 +114,7 @@ Block flatten(const Block & block) else res.insert(elem); } - else if (const DataTypeTuple * type_tuple = typeid_cast(elem.type.get())) + else if (const DataTypeTuple * type_tuple = typeid_cast(elem.type.get()); type_tuple && flatten_named_tuple) { if (type_tuple->haveExplicitNames()) { @@ -143,6 +143,17 @@ Block flatten(const Block & block) return res; } +Block flatten(const Block & block) +{ + return flattenImpl(block, true); +} + + +Block flattenArrayOfTuples(const Block & block) +{ + return flattenImpl(block, false); +} + namespace { diff --git a/src/DataTypes/NestedUtils.h b/src/DataTypes/NestedUtils.h index 90fdd683493..e009ceb18fe 100644 --- a/src/DataTypes/NestedUtils.h +++ b/src/DataTypes/NestedUtils.h @@ -23,6 +23,9 @@ namespace Nested /// 2) For an Array with named Tuple element column, a Array(Tuple(x ..., y ..., ...)), replace it with multiple Array Columns, a.x ..., a.y ..., ... Block flatten(const Block & block); + /// Same as flatten but only for Array with named Tuple element column. + Block flattenArrayOfTuples(const Block & block); + /// Collect Array columns in a form of `column_name.element_name` to single Array(Tuple(...)) column. NamesAndTypesList collect(const NamesAndTypesList & names_and_types); diff --git a/src/Functions/generateRandomStructure.cpp b/src/Functions/generateRandomStructure.cpp new file mode 100644 index 00000000000..c27c8428cb0 --- /dev/null +++ b/src/Functions/generateRandomStructure.cpp @@ -0,0 +1,343 @@ +#include "config.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int ILLEGAL_TYPE_OF_ARGUMENT; + extern const int ILLEGAL_COLUMN; + extern const int BAD_ARGUMENTS; +} + +class FunctionGenerateRandomStructure : public IFunction +{ +private: + enum class SimpleTypes + { + Int8, + UInt8, + Bool, + Int16, + UInt16, + Int32, + UInt32, + Int64, + UInt64, + Int128, + UInt128, + Int256, + UInt256, + Float32, + Float64, + DateTime64, + Decimal32, + Decimal64, + Decimal128, + Decimal256, + Date, + Date32, + DateTime, + String, + FixedString, + Enum8, + Enum16, + IPv4, + IPv6, + }; + + enum class ComplexTypes + { + Nullable, + LowCardinality, + Array, + Tuple, + Map, + Nested, + }; + + enum class MapKeyTypes + { + Int8, + UInt8, + Bool, + Int16, + UInt16, + Int32, + UInt32, + Int64, + UInt64, + Int128, + UInt128, + Int256, + UInt256, + Date, + Date32, + DateTime, + String, + FixedString, + }; + + static constexpr size_t MAX_NUMBER_OF_COLUMNS = 128; + static constexpr size_t MAX_TUPLE_ELEMENTS = 16; + static constexpr size_t MAX_DATETIME64_PRECISION = 9; + static constexpr size_t MAX_DECIMAL32_PRECISION = 9; + static constexpr size_t MAX_DECIMAL64_PRECISION = 18; + static constexpr size_t MAX_DECIMAL128_PRECISION = 38; + static constexpr size_t MAX_DECIMAL256_PRECISION = 76; + static constexpr size_t MAX_DEPTH = 32; + +public: + static constexpr auto name = "generateRandomStructure"; + + static FunctionPtr create(ContextPtr /*context*/) + { + return std::make_shared(); + } + + String getName() const override { return name; } + + size_t getNumberOfArguments() const override { return 0; } + + bool isVariadic() const override { return true; } + bool isDeterministic() const override { return false; } + bool isDeterministicInScopeOfQuery() const override { return false; } + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; } + ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {0, 1}; } + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + { + if (arguments.size() > 2) + throw Exception( + ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Number of arguments for function {} doesn't match: passed {}, should be 0, 1 or 2.", + getName(), arguments.size()); + + if (arguments.size() > 1 && !isUnsignedInteger(arguments[0])) + { + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type {} of the first argument of function {}, expected unsigned integer", + arguments[0]->getName(), + getName()); + } + + if (arguments.size() > 2 && !isUnsignedInteger(arguments[1])) + { + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type {} of the second argument of function {}, expected unsigned integer", + arguments[1]->getName(), + getName()); + } + + return std::make_shared(); + } + + bool useDefaultImplementationForConstants() const override { return false; } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override + { + size_t seed = randomSeed(); + size_t number_of_columns = 0; + + if (!arguments.empty()) + { + const auto & first_arg = arguments[0]; + + if (!isUnsignedInteger(first_arg.type)) + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type {} of the first argument of function {}, expected unsigned integer", + first_arg.type->getName(), + getName()); + + number_of_columns = first_arg.column->getUInt(0); + if (number_of_columns > MAX_NUMBER_OF_COLUMNS) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Maximum allowed number of columns is {}, got {}", MAX_NUMBER_OF_COLUMNS, number_of_columns); + + if (arguments.size() == 2) + { + const auto & second_arg = arguments[1]; + + if (!isUnsignedInteger(second_arg.type)) + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type {} of the second argument of function {}, expected unsigned integer", + second_arg.type->getName(), + getName()); + + seed = second_arg.column->getUInt(0); + } + } + + pcg64 rng(seed); + if (number_of_columns == 0) + number_of_columns = generateNumberOfColumns(rng); + + auto col_res = ColumnString::create(); + String generated_structure = ""; + for (size_t i = 0; i != number_of_columns; ++i) + { + if (i != 0) + generated_structure += ", "; + auto type = generateRandomType(rng); + generated_structure += "c" + std::to_string(i + 1) + " " + type; + } + col_res->insert(generated_structure); + return ColumnConst::create(std::move(col_res), input_rows_count); + } + +private: + + size_t generateNumberOfColumns(pcg64 & rng) const + { + return rng() % MAX_NUMBER_OF_COLUMNS + 1; + } + + String generateRandomType(pcg64 & rng, bool allow_complex_types = true, size_t depth = 0) const + { + constexpr size_t simple_types_size = magic_enum::enum_count(); + constexpr size_t complex_types_size = magic_enum::enum_count(); + size_t type_index; + if (allow_complex_types) + type_index = rng() % (simple_types_size + complex_types_size); + else + type_index = rng() % simple_types_size; + + if (type_index < simple_types_size) + { + auto type = magic_enum::enum_value(type_index); + switch (type) + { + case SimpleTypes::FixedString: + return "FixedString(" + std::to_string(rng() % MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS + 1) + ")"; + case SimpleTypes::DateTime64: + return "DateTime64(" + std::to_string(rng() % MAX_DATETIME64_PRECISION) + ")"; + case SimpleTypes::Decimal32: + return "Decimal32(" + std::to_string(rng() % MAX_DECIMAL32_PRECISION) + ")"; + case SimpleTypes::Decimal64: + return "Decimal64(" + std::to_string(rng() % MAX_DECIMAL64_PRECISION) + ")"; + case SimpleTypes::Decimal128: + return "Decimal128(" + std::to_string(rng() % MAX_DECIMAL128_PRECISION) + ")"; + case SimpleTypes::Decimal256: + return "Decimal256(" + std::to_string(rng() % MAX_DECIMAL256_PRECISION) + ")"; + case SimpleTypes::Enum8: + return "Enum8(" + generateEnumValues(rng) + ")"; + case SimpleTypes::Enum16: + return "Enum16(" + generateEnumValues(rng) + ")"; + default: + return String(magic_enum::enum_name(type)); + } + } + + auto complex_type = magic_enum::enum_value(type_index - simple_types_size); + switch (complex_type) + { + case ComplexTypes::LowCardinality: + return "LowCardinality(" + generateLowCardinalityNestedType(rng) + ")"; + case ComplexTypes::Nullable: + return "Nullable(" + generateRandomType(rng, false, depth + 1) + ")"; + case ComplexTypes::Array: + return "Array(" + generateRandomType(rng, true, depth + 1) + ")"; + case ComplexTypes::Map: + return "Map(" + generateMapKeyType(rng) + ", " + generateRandomType(rng, true, depth + 1) + ")"; + case ComplexTypes::Tuple: + { + size_t elements = rng() % MAX_TUPLE_ELEMENTS + 1; + bool named_tuple = rng() % 2; + String tuple_type = "Tuple("; + for (size_t i = 0; i != elements; ++i) + { + if (i != 0) + tuple_type += ", "; + if (named_tuple) + tuple_type += "e" + std::to_string(i + 1) + " "; + tuple_type += generateRandomType(rng, true, depth + 1); + } + return tuple_type + ")"; + } + case ComplexTypes::Nested: + { + size_t elements = rng() % MAX_TUPLE_ELEMENTS + 1; + String nested_type = "Nested("; + for (size_t i = 0; i != elements; ++i) + { + if (i != 0) + nested_type += ", "; + nested_type += "e" + std::to_string(i + 1) + " " + generateRandomType(rng, true, depth + 1); + } + return nested_type + ")"; + } + } + } + + String generateMapKeyType(pcg64 & rng) const + { + constexpr size_t map_keys_types_size = magic_enum::enum_count(); + auto type = magic_enum::enum_value(rng() % map_keys_types_size); + if (type == MapKeyTypes::FixedString) + return "FixedString(" + std::to_string(rng() % MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS + 1) + ")"; + return String(magic_enum::enum_name(type)); + } + + String generateLowCardinalityNestedType(pcg64 & rng) const + { + /// Support only String and FixedString. + String nested_type; + if (rng() % 2) + nested_type = "String"; + else + nested_type = "FixedString(" + std::to_string(rng() % MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS + 1) + ")"; + return rng() % 2 ? nested_type : "Nullable(" + nested_type + ")"; + } + + String generateEnumValues(pcg64 & rng) const + { + /// Don't generate big enums, because it will lead to really big strings + /// and slowness of this function, and it can lead to `Max query size exceeded` + /// while using this function with generateRandom. + ssize_t num_values = rng() % 16 + 1; + String result; + for (ssize_t i = 0; i != num_values; ++i) + { + if (i != 0) + result += ", "; + result += "'v" + std::to_string(i) + "' = " + std::to_string(i); + } + return result; + } +}; + + +REGISTER_FUNCTION(GenerateRandomStructure) +{ + factory.registerFunction( + { + R"( +Generates a random table structure. +This function takes an optional constant argument, the number of column in the result structure. +If argument is now specified, the number of columns is random. The maximum number of columns is 1024. +The function returns a value of type String. +)", + Documentation::Examples{ + {"random", "SELECT generateRandomStructure()"}, + {"with specified number of arguments", "SELECT generateRandomStructure(10)"}}, + Documentation::Categories{"Random"} + }, + FunctionFactory::CaseSensitive); +} + +} diff --git a/src/Storages/StorageGenerateRandom.cpp b/src/Storages/StorageGenerateRandom.cpp index 601306bd1bf..f616313a595 100644 --- a/src/Storages/StorageGenerateRandom.cpp +++ b/src/Storages/StorageGenerateRandom.cpp @@ -422,7 +422,7 @@ class GenerateSource : public ISource { public: GenerateSource(UInt64 block_size_, UInt64 max_array_length_, UInt64 max_string_length_, UInt64 random_seed_, Block block_header_, ContextPtr context_) - : ISource(Nested::flatten(prepareBlockToFill(block_header_))) + : ISource(Nested::flattenArrayOfTuples(prepareBlockToFill(block_header_))) , block_size(block_size_), max_array_length(max_array_length_), max_string_length(max_string_length_) , block_to_fill(std::move(block_header_)), rng(random_seed_), context(context_) {} @@ -437,7 +437,7 @@ protected: for (const auto & elem : block_to_fill) columns.emplace_back(fillColumnWithRandomData(elem.type, block_size, max_array_length, max_string_length, rng, context)); - columns = Nested::flatten(block_to_fill.cloneWithColumns(columns)).getColumns(); + columns = Nested::flattenArrayOfTuples(block_to_fill.cloneWithColumns(columns)).getColumns(); return {std::move(columns), block_size}; } diff --git a/src/TableFunctions/TableFunctionGenerateRandom.cpp b/src/TableFunctions/TableFunctionGenerateRandom.cpp index 5f1a13d8857..12cbda334a3 100644 --- a/src/TableFunctions/TableFunctionGenerateRandom.cpp +++ b/src/TableFunctions/TableFunctionGenerateRandom.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #include "registerTableFunctions.h" @@ -28,7 +29,7 @@ namespace ErrorCodes extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; } -void TableFunctionGenerateRandom::parseArguments(const ASTPtr & ast_function, ContextPtr /*context*/) +void TableFunctionGenerateRandom::parseArguments(const ASTPtr & ast_function, ContextPtr context) { ASTs & args_func = ast_function->children; @@ -45,6 +46,9 @@ void TableFunctionGenerateRandom::parseArguments(const ASTPtr & ast_function, Co "Table function '{}' requires at most four arguments: " " structure, [random_seed, max_string_length, max_array_length].", getName()); + /// Allow constant expression for structure argument, it can be generated using generateRandomStructure function. + args[0] = evaluateConstantExpressionAsLiteral(args[0], context); + // All the arguments must be literals. for (const auto & arg : args) { diff --git a/tests/queries/0_stateless/02584_compressor_codecs.reference b/tests/queries/0_stateless/02584_compressor_codecs.reference new file mode 100644 index 00000000000..23751ef6c1f --- /dev/null +++ b/tests/queries/0_stateless/02584_compressor_codecs.reference @@ -0,0 +1,14 @@ +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 diff --git a/tests/queries/0_stateless/02584_compressor_codecs.sh b/tests/queries/0_stateless/02584_compressor_codecs.sh new file mode 100755 index 00000000000..930d101466b --- /dev/null +++ b/tests/queries/0_stateless/02584_compressor_codecs.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +echo "Hello, World!" > 02584_test_data + +$CLICKHOUSE_COMPRESSOR --codec 'Delta' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out' 2>&1 | grep -c "BAD_ARGUMENTS"; +$CLICKHOUSE_COMPRESSOR --codec 'Delta(5)' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out' 2>&1 | grep -c "ILLEGAL_CODEC_PARAMETER"; +$CLICKHOUSE_COMPRESSOR --codec 'Delta([1,2])' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out' 2>&1 | grep -c "ILLEGAL_CODEC_PARAMETER"; +$CLICKHOUSE_COMPRESSOR --codec 'Delta(4)' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out'; + +$CLICKHOUSE_COMPRESSOR --codec 'DoubleDelta' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out' 2>&1 | grep -c "BAD_ARGUMENTS"; +$CLICKHOUSE_COMPRESSOR --codec 'DoubleDelta(5)' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out' 2>&1 | grep -c "ILLEGAL_CODEC_PARAMETER"; +$CLICKHOUSE_COMPRESSOR --codec 'DoubleDelta([1,2])' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out' 2>&1 | grep -c "ILLEGAL_CODEC_PARAMETER"; +$CLICKHOUSE_COMPRESSOR --codec 'DoubleDelta(4)' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out'; + +$CLICKHOUSE_COMPRESSOR --codec 'Gorilla' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out' 2>&1 | grep -c "BAD_ARGUMENTS"; +$CLICKHOUSE_COMPRESSOR --codec 'Gorilla(5)' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out' 2>&1 | grep -c "ILLEGAL_CODEC_PARAMETER"; +$CLICKHOUSE_COMPRESSOR --codec 'Gorilla([1,2])' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out' 2>&1 | grep -c "ILLEGAL_CODEC_PARAMETER"; +$CLICKHOUSE_COMPRESSOR --codec 'Gorilla(4)' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out'; + +$CLICKHOUSE_COMPRESSOR --codec 'FPC' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out' 2>&1 | grep -c "BAD_ARGUMENTS"; +$CLICKHOUSE_COMPRESSOR --codec 'FPC(5)' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out' 2>&1 | grep -c "BAD_ARGUMENTS"; +$CLICKHOUSE_COMPRESSOR --codec 'FPC(5, 1)' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out' 2>&1 | grep -c "ILLEGAL_CODEC_PARAMETER"; +$CLICKHOUSE_COMPRESSOR --codec 'FPC([1,2,3])' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out' 2>&1 | grep -c "ILLEGAL_CODEC_PARAMETER"; +$CLICKHOUSE_COMPRESSOR --codec 'FPC(5, 4)' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out'; + + +$CLICKHOUSE_COMPRESSOR --codec 'T64' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out' 2>&1 | grep -c "ILLEGAL_SYNTAX_FOR_CODEC_TYPE"; + +rm 02584_test_data 02584_test_out + diff --git a/tests/queries/0_stateless/02586_generate_random_structure.reference b/tests/queries/0_stateless/02586_generate_random_structure.reference new file mode 100644 index 00000000000..335c5807c35 --- /dev/null +++ b/tests/queries/0_stateless/02586_generate_random_structure.reference @@ -0,0 +1,4 @@ +c1 Int256, c2 Bool, c3 Int16, c4 Map(Int64, Array(Bool)), c5 Enum16(\'v0\' = 0, \'v1\' = 1, \'v2\' = 2, \'v3\' = 3, \'v4\' = 4, \'v5\' = 5, \'v6\' = 6, \'v7\' = 7, \'v8\' = 8, \'v9\' = 9, \'v10\' = 10) +String +Const(String) +2085-07-05 23:48:43.345759 10105 1535011673144902513 diff --git a/tests/queries/0_stateless/02586_generate_random_structure.sql b/tests/queries/0_stateless/02586_generate_random_structure.sql new file mode 100644 index 00000000000..c67196569af --- /dev/null +++ b/tests/queries/0_stateless/02586_generate_random_structure.sql @@ -0,0 +1,10 @@ +select generateRandomStructure(5, 42); +select toTypeName(generateRandomStructure(5, 42)); +select toColumnTypeName(generateRandomStructure(5, 42)); +SELECT * FROM generateRandom(generateRandomStructure(3, 24), 24) LIMIT 1; + +select generateRandomStructure(5, 42, 42); -- {serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH} +select generateRandomStructure('5'); -- {serverError ILLEGAL_TYPE_OF_ARGUMENT} +select generateRandomStructure(5, '42'); -- {serverError ILLEGAL_TYPE_OF_ARGUMENT} +select generateRandomStructure(materialize(5), 42); -- {serverError ILLEGAL_COLUMN} +select generateRandomStructure(5, materialize(42)); -- {serverError ILLEGAL_COLUMN} From e5f6ced3d7d87a11e38f1a1499544fc7c2ad2e05 Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 9 Mar 2023 18:01:37 +0000 Subject: [PATCH 0002/1072] Fix style --- src/Functions/generateRandomStructure.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/Functions/generateRandomStructure.cpp b/src/Functions/generateRandomStructure.cpp index c27c8428cb0..11b6f7877e1 100644 --- a/src/Functions/generateRandomStructure.cpp +++ b/src/Functions/generateRandomStructure.cpp @@ -18,7 +18,6 @@ namespace ErrorCodes { extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; extern const int ILLEGAL_TYPE_OF_ARGUMENT; - extern const int ILLEGAL_COLUMN; extern const int BAD_ARGUMENTS; } @@ -242,7 +241,7 @@ private: return String(magic_enum::enum_name(type)); } } - + auto complex_type = magic_enum::enum_value(type_index - simple_types_size); switch (complex_type) { From 66eb06d8399144b40ac94a4b0944bb15b37ce2c9 Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 9 Mar 2023 20:15:32 +0000 Subject: [PATCH 0003/1072] Better --- .../functions/other-functions.md | 23 +- src/Functions/generateRandomStructure.cpp | 340 ++++++++++++------ .../02586_generate_random_structure.reference | 7 +- .../02586_generate_random_structure.sql | 13 +- 4 files changed, 267 insertions(+), 116 deletions(-) diff --git a/docs/en/sql-reference/functions/other-functions.md b/docs/en/sql-reference/functions/other-functions.md index 2f46df07b0a..5f6b6e5687d 100644 --- a/docs/en/sql-reference/functions/other-functions.md +++ b/docs/en/sql-reference/functions/other-functions.md @@ -2617,13 +2617,15 @@ Generates random table structure in a format `column1_name column1_type, column2 **Syntax** ``` sql -generateRandomStructure([number_of_columns, seed]) +generateRandomStructure([number_of_columns, seed, allow_big_numbers, allow_enums]) ``` **Arguments** -- `number_of_columns` — The desired number of columns in the result table structure. If set to 0, the number of columns will be random from 1 to 128. Default value - 0. -- `seed` - Random seed to produce stable results. If seed is not specified, it is randomly generated. +- `number_of_columns` — The desired number of columns in the result table structure. If set to 0 or `Null`, the number of columns will be random from 1 to 128. Default value: `Null`. +- `seed` - Random seed to produce stable results. If seed is not specified or set to `Null`, it is randomly generated. +- `allow_big_numbers` - Indicates if big number types (`Int128/UInt128/Int256/UInt256/Decimal128/Decinal256`) can be generated. Default value: true. +- `allow_enums` - Indicates if enum types can be generated. Default - true. All arguments must be constant. @@ -2666,7 +2668,7 @@ Result: Query: ``` sql -SELECT generateRandomStructure(0, 11) +SELECT generateRandomStructure(Null, 11) ``` Result: @@ -2677,5 +2679,18 @@ Result: └─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ ``` +``` sql +SELECT generateRandomStructure(6, Null, false, false) +``` + +Result: + +``` text +┌─generateRandomStructure(6, NULL, false, false)───────────────────────────────────────────────────────┐ +│ c1 Float32, c2 Tuple(DateTime), c3 UInt8, c4 UInt16, c5 Int64, c6 Array(Map(FixedString(108), Date)) │ +└──────────────────────────────────────────────────────────────────────────────────────────────────────┘ +``` + + This function can be used together with [generateRandom](../../sql-reference/table-functions/generate.md) to generate completely random tables. diff --git a/src/Functions/generateRandomStructure.cpp b/src/Functions/generateRandomStructure.cpp index 11b6f7877e1..f95b4a279de 100644 --- a/src/Functions/generateRandomStructure.cpp +++ b/src/Functions/generateRandomStructure.cpp @@ -24,7 +24,7 @@ namespace ErrorCodes class FunctionGenerateRandomStructure : public IFunction { private: - enum class SimpleTypes + enum class Type { Int8, UInt8, @@ -35,30 +35,26 @@ private: UInt32, Int64, UInt64, - Int128, - UInt128, - Int256, - UInt256, Float32, Float64, DateTime64, Decimal32, Decimal64, - Decimal128, - Decimal256, Date, Date32, DateTime, String, FixedString, - Enum8, - Enum16, IPv4, IPv6, - }; - - enum class ComplexTypes - { + Int128, + UInt128, + Int256, + UInt256, + Decimal128, + Decimal256, + Enum8, + Enum16, Nullable, LowCardinality, Array, @@ -66,27 +62,74 @@ private: Map, Nested, }; - - enum class MapKeyTypes + + static constexpr std::array simple_types { - Int8, - UInt8, - Bool, - Int16, - UInt16, - Int32, - UInt32, - Int64, - UInt64, - Int128, - UInt128, - Int256, - UInt256, - Date, - Date32, - DateTime, - String, - FixedString, + Type::Int8, + Type::UInt8, + Type::Bool, + Type::Int16, + Type::UInt16, + Type::Int32, + Type::UInt32, + Type::Int64, + Type::UInt64, + Type::Float32, + Type::Float64, + Type::DateTime64, + Type::Decimal32, + Type::Decimal64, + Type::Date, + Type::Date32, + Type::DateTime, + Type::String, + Type::FixedString, + Type::IPv4, + Type::IPv6, + }; + + static constexpr std::array big_number_types + { + Type::Int128, + Type::UInt128, + Type::Int256, + Type::UInt256, + Type::Decimal128, + Type::Decimal256, + }; + + static constexpr std::array enum_types + { + Type::Enum8, + Type::Enum16, + }; + + static constexpr std::array complex_types + { + Type::Nullable, + Type::LowCardinality, + Type::Array, + Type::Tuple, + Type::Map, + Type::Nested, + }; + + static constexpr std::array map_key_types + { + Type::Int8, + Type::UInt8, + Type::Bool, + Type::Int16, + Type::UInt16, + Type::Int32, + Type::UInt32, + Type::Int64, + Type::UInt64, + Type::Date, + Type::Date32, + Type::DateTime, + Type::String, + Type::FixedString, }; static constexpr size_t MAX_NUMBER_OF_COLUMNS = 128; @@ -114,48 +157,66 @@ public: bool isDeterministic() const override { return false; } bool isDeterministicInScopeOfQuery() const override { return false; } bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; } - ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {0, 1}; } + ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {0, 1, 2, 3}; } + bool useDefaultImplementationForConstants() const override { return false; } + bool useDefaultImplementationForNulls() const override { return false; } + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override { - if (arguments.size() > 2) + if (arguments.size() > 4) throw Exception( ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, - "Number of arguments for function {} doesn't match: passed {}, should be 0, 1 or 2.", + "Number of arguments for function {} doesn't match: passed {}, expected from 0 to 4", getName(), arguments.size()); - if (arguments.size() > 1 && !isUnsignedInteger(arguments[0])) + if (!arguments.empty() && !isUnsignedInteger(arguments[0]) && !arguments[0]->onlyNull()) { throw Exception( ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Illegal type {} of the first argument of function {}, expected unsigned integer", + "Illegal type {} of the first argument of function {}, expected unsigned integer or Null", arguments[0]->getName(), getName()); } - if (arguments.size() > 2 && !isUnsignedInteger(arguments[1])) + if (arguments.size() > 1 && !isUnsignedInteger(arguments[1]) && !arguments[1]->onlyNull()) { throw Exception( ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Illegal type {} of the second argument of function {}, expected unsigned integer", + "Illegal type {} of the second argument of function {}, expected unsigned integer or Null", arguments[1]->getName(), getName()); } + if (arguments.size() > 2 && !isUInt8(arguments[2])) + { + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type {} of the third argument of function {}, expected UInt8", + arguments[2]->getName(), + getName()); + } + + if (arguments.size() > 3 && !isUInt8(arguments[3])) + { + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type {} of the fourth argument of function {}, expected UInt8", + arguments[3]->getName(), + getName()); + } + return std::make_shared(); } - bool useDefaultImplementationForConstants() const override { return false; } - ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override { size_t seed = randomSeed(); size_t number_of_columns = 0; - if (!arguments.empty()) + if (!arguments.empty() && !arguments[0].column->onlyNull()) { const auto & first_arg = arguments[0]; - if (!isUnsignedInteger(first_arg.type)) throw Exception( ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, @@ -165,21 +226,55 @@ public: number_of_columns = first_arg.column->getUInt(0); if (number_of_columns > MAX_NUMBER_OF_COLUMNS) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Maximum allowed number of columns is {}, got {}", MAX_NUMBER_OF_COLUMNS, number_of_columns); + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "Maximum allowed number of columns is {}, got {}", + MAX_NUMBER_OF_COLUMNS, + number_of_columns); + } - if (arguments.size() == 2) - { - const auto & second_arg = arguments[1]; + if (arguments.size() > 1 && !arguments[1].column->onlyNull()) + { + const auto & second_arg = arguments[1]; - if (!isUnsignedInteger(second_arg.type)) - throw Exception( - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Illegal type {} of the second argument of function {}, expected unsigned integer", - second_arg.type->getName(), - getName()); + if (!isUnsignedInteger(second_arg.type)) + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type {} of the second argument of function {}, expected unsigned integer", + second_arg.type->getName(), + getName()); - seed = second_arg.column->getUInt(0); - } + seed = second_arg.column->getUInt(0); + } + + bool allow_big_numbers = true; + if (arguments.size() > 2) + { + const auto & third_arg = arguments[2]; + + if (!isUInt8(third_arg.type)) + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type {} of the second argument of function {}, expected UInt8", + third_arg.type->getName(), + getName()); + + allow_big_numbers = third_arg.column->getBool(0); + } + + bool allow_enums = true; + if (arguments.size() > 3) + { + const auto & fourth_arg = arguments[3]; + + if (!isUInt8(fourth_arg.type)) + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type {} of the fourth argument of function {}, expected UInt8", + fourth_arg.type->getName(), + getName()); + + allow_enums = fourth_arg.column->getBool(0); } pcg64 rng(seed); @@ -192,7 +287,7 @@ public: { if (i != 0) generated_structure += ", "; - auto type = generateRandomType(rng); + auto type = generateRandomType(rng, allow_big_numbers, allow_enums); generated_structure += "c" + std::to_string(i + 1) + " " + type; } col_res->insert(generated_structure); @@ -205,55 +300,55 @@ private: { return rng() % MAX_NUMBER_OF_COLUMNS + 1; } - - String generateRandomType(pcg64 & rng, bool allow_complex_types = true, size_t depth = 0) const + + String generateRandomType(pcg64 & rng, bool allow_big_numbers, bool allow_enums) const { - constexpr size_t simple_types_size = magic_enum::enum_count(); - constexpr size_t complex_types_size = magic_enum::enum_count(); - size_t type_index; - if (allow_complex_types) - type_index = rng() % (simple_types_size + complex_types_size); - else - type_index = rng() % simple_types_size; - - if (type_index < simple_types_size) + if (allow_big_numbers) { - auto type = magic_enum::enum_value(type_index); - switch (type) - { - case SimpleTypes::FixedString: - return "FixedString(" + std::to_string(rng() % MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS + 1) + ")"; - case SimpleTypes::DateTime64: - return "DateTime64(" + std::to_string(rng() % MAX_DATETIME64_PRECISION) + ")"; - case SimpleTypes::Decimal32: - return "Decimal32(" + std::to_string(rng() % MAX_DECIMAL32_PRECISION) + ")"; - case SimpleTypes::Decimal64: - return "Decimal64(" + std::to_string(rng() % MAX_DECIMAL64_PRECISION) + ")"; - case SimpleTypes::Decimal128: - return "Decimal128(" + std::to_string(rng() % MAX_DECIMAL128_PRECISION) + ")"; - case SimpleTypes::Decimal256: - return "Decimal256(" + std::to_string(rng() % MAX_DECIMAL256_PRECISION) + ")"; - case SimpleTypes::Enum8: - return "Enum8(" + generateEnumValues(rng) + ")"; - case SimpleTypes::Enum16: - return "Enum16(" + generateEnumValues(rng) + ")"; - default: - return String(magic_enum::enum_name(type)); - } + if (allow_enums) + return generateRandomTypeImpl(rng); + return generateRandomTypeImpl(rng); } - auto complex_type = magic_enum::enum_value(type_index - simple_types_size); - switch (complex_type) + if (allow_enums) + return generateRandomTypeImpl(rng); + return generateRandomTypeImpl(rng); + } + + + template + String generateRandomTypeImpl(pcg64 & rng, size_t depth = 0) const + { + constexpr auto all_types = getAllTypes(); + auto type = all_types[rng() % all_types.size()]; + + switch (type) { - case ComplexTypes::LowCardinality: + case Type::FixedString: + return "FixedString(" + std::to_string(rng() % MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS + 1) + ")"; + case Type::DateTime64: + return "DateTime64(" + std::to_string(rng() % MAX_DATETIME64_PRECISION) + ")"; + case Type::Decimal32: + return "Decimal32(" + std::to_string(rng() % MAX_DECIMAL32_PRECISION) + ")"; + case Type::Decimal64: + return "Decimal64(" + std::to_string(rng() % MAX_DECIMAL64_PRECISION) + ")"; + case Type::Decimal128: + return "Decimal128(" + std::to_string(rng() % MAX_DECIMAL128_PRECISION) + ")"; + case Type::Decimal256: + return "Decimal256(" + std::to_string(rng() % MAX_DECIMAL256_PRECISION) + ")"; + case Type::Enum8: + return "Enum8(" + generateEnumValues(rng) + ")"; + case Type::Enum16: + return "Enum16(" + generateEnumValues(rng) + ")"; + case Type::LowCardinality: return "LowCardinality(" + generateLowCardinalityNestedType(rng) + ")"; - case ComplexTypes::Nullable: - return "Nullable(" + generateRandomType(rng, false, depth + 1) + ")"; - case ComplexTypes::Array: - return "Array(" + generateRandomType(rng, true, depth + 1) + ")"; - case ComplexTypes::Map: - return "Map(" + generateMapKeyType(rng) + ", " + generateRandomType(rng, true, depth + 1) + ")"; - case ComplexTypes::Tuple: + case Type::Nullable: + return "Nullable(" + generateRandomTypeImpl(rng, depth + 1) + ")"; + case Type::Array: + return "Array(" + generateRandomTypeImpl(rng, depth + 1) + ")"; + case Type::Map: + return "Map(" + generateMapKeyType(rng) + ", " + generateRandomTypeImpl(rng, depth + 1) + ")"; + case Type::Tuple: { size_t elements = rng() % MAX_TUPLE_ELEMENTS + 1; bool named_tuple = rng() % 2; @@ -264,11 +359,11 @@ private: tuple_type += ", "; if (named_tuple) tuple_type += "e" + std::to_string(i + 1) + " "; - tuple_type += generateRandomType(rng, true, depth + 1); + tuple_type += generateRandomTypeImpl(rng, depth + 1); } return tuple_type + ")"; } - case ComplexTypes::Nested: + case Type::Nested: { size_t elements = rng() % MAX_TUPLE_ELEMENTS + 1; String nested_type = "Nested("; @@ -276,20 +371,21 @@ private: { if (i != 0) nested_type += ", "; - nested_type += "e" + std::to_string(i + 1) + " " + generateRandomType(rng, true, depth + 1); + nested_type += "e" + std::to_string(i + 1) + " " + generateRandomTypeImpl(rng, depth + 1); } return nested_type + ")"; } + default: + return String(magic_enum::enum_name(type)); } } String generateMapKeyType(pcg64 & rng) const { - constexpr size_t map_keys_types_size = magic_enum::enum_count(); - auto type = magic_enum::enum_value(rng() % map_keys_types_size); - if (type == MapKeyTypes::FixedString) + auto type = map_key_types[rng() % map_key_types.size()]; + if (type == Type::FixedString) return "FixedString(" + std::to_string(rng() % MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS + 1) + ")"; - return String(magic_enum::enum_name(type)); + return String(magic_enum::enum_name(type)); } String generateLowCardinalityNestedType(pcg64 & rng) const @@ -318,6 +414,36 @@ private: } return result; } + + template + static constexpr auto getAllTypes() + { + constexpr size_t result_size = simple_types.size() + big_number_types.size() * allow_big_numbers + enum_types.size() * allow_enums + complex_types.size() * allow_complex_types; + std::array result; + size_t index = 0; + for (size_t i = 0; i != simple_types.size(); ++i, ++index) + result[index] = simple_types[i]; + + if constexpr (allow_big_numbers) + { + for (size_t i = 0; i != big_number_types.size(); ++i, ++index) + result[index] = big_number_types[i]; + } + + if constexpr (allow_enums) + { + for (size_t i = 0; i != enum_types.size(); ++i, ++index) + result[index] = enum_types[i]; + } + + if constexpr (allow_complex_types) + { + for (size_t i = 0; i != complex_types.size(); ++i, ++index) + result[index] = complex_types[i]; + } + + return result; + } }; diff --git a/tests/queries/0_stateless/02586_generate_random_structure.reference b/tests/queries/0_stateless/02586_generate_random_structure.reference index 335c5807c35..181b7f0ec6f 100644 --- a/tests/queries/0_stateless/02586_generate_random_structure.reference +++ b/tests/queries/0_stateless/02586_generate_random_structure.reference @@ -1,4 +1,7 @@ -c1 Int256, c2 Bool, c3 Int16, c4 Map(Int64, Array(Bool)), c5 Enum16(\'v0\' = 0, \'v1\' = 1, \'v2\' = 2, \'v3\' = 3, \'v4\' = 4, \'v5\' = 5, \'v6\' = 6, \'v7\' = 7, \'v8\' = 8, \'v9\' = 9, \'v10\' = 10) +c1 DateTime64(1), c2 Int16, c3 Map(Int64, Array(Bool)), c4 Decimal256(30), c5 Int128 +c1 Date, c2 Float64, c3 DateTime, c4 Map(Int64, DateTime64(4)), c5 Nested(e1 LowCardinality(String), e2 UInt32, e3 Enum16(\'v0\' = 0, \'v1\' = 1, \'v2\' = 2, \'v3\' = 3, \'v4\' = 4, \'v5\' = 5, \'v6\' = 6, \'v7\' = 7), e4 Float32, e5 Date32, e6 Int64, e7 Decimal64(0), e8 UInt16, e9 Date32, e10 Int64, e11 DateTime64(0)) +c1 Nested(e1 Int64, e2 Int16, e3 Map(Int16, LowCardinality(Nullable(String))), e4 UInt8, e5 Nested(e1 Array(Nullable(Decimal64(12))), e2 DateTime64(1), e3 UInt64, e4 FixedString(61), e5 Decimal64(13), e6 UInt8), e6 Int8), c2 DateTime64(5), c3 IPv4, c4 String, c5 String +c1 DateTime64(1), c2 IPv4, c3 Nullable(Decimal128(37)), c4 UInt128, c5 Date String Const(String) -2085-07-05 23:48:43.345759 10105 1535011673144902513 +2106-02-02 121 17265 diff --git a/tests/queries/0_stateless/02586_generate_random_structure.sql b/tests/queries/0_stateless/02586_generate_random_structure.sql index c67196569af..b524f6a5ff1 100644 --- a/tests/queries/0_stateless/02586_generate_random_structure.sql +++ b/tests/queries/0_stateless/02586_generate_random_structure.sql @@ -1,10 +1,17 @@ select generateRandomStructure(5, 42); +select generateRandomStructure(5, 42, false); +select generateRandomStructure(5, 42, false, false); +select generateRandomStructure(5, 42, true, false); select toTypeName(generateRandomStructure(5, 42)); select toColumnTypeName(generateRandomStructure(5, 42)); SELECT * FROM generateRandom(generateRandomStructure(3, 24), 24) LIMIT 1; -select generateRandomStructure(5, 42, 42); -- {serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH} +select generateRandomStructure(5, 42, false, false, 42); -- {serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH} select generateRandomStructure('5'); -- {serverError ILLEGAL_TYPE_OF_ARGUMENT} select generateRandomStructure(5, '42'); -- {serverError ILLEGAL_TYPE_OF_ARGUMENT} -select generateRandomStructure(materialize(5), 42); -- {serverError ILLEGAL_COLUMN} -select generateRandomStructure(5, materialize(42)); -- {serverError ILLEGAL_COLUMN} +select generateRandomStructure(5, 42, 'false'); -- {serverError ILLEGAL_TYPE_OF_ARGUMENT} +select generateRandomStructure(5, 42, false, 'false'); -- {serverError ILLEGAL_TYPE_OF_ARGUMENT} +select generateRandomStructure(materialize(5), 42, false, false); -- {serverError ILLEGAL_COLUMN} +select generateRandomStructure(5, materialize(42), false, false); -- {serverError ILLEGAL_COLUMN} +select generateRandomStructure(5, 42, materialize(false), false); -- {serverError ILLEGAL_COLUMN} +select generateRandomStructure(5, 42, false, materialize(false)); -- {serverError ILLEGAL_COLUMN} From 0d430de54fef53e61f821d8b4bae684c816d4f2a Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 9 Mar 2023 20:18:17 +0000 Subject: [PATCH 0004/1072] Better --- src/Functions/generateRandomStructure.cpp | 31 ----------------------- 1 file changed, 31 deletions(-) diff --git a/src/Functions/generateRandomStructure.cpp b/src/Functions/generateRandomStructure.cpp index f95b4a279de..3b42fd99fb4 100644 --- a/src/Functions/generateRandomStructure.cpp +++ b/src/Functions/generateRandomStructure.cpp @@ -217,13 +217,6 @@ public: if (!arguments.empty() && !arguments[0].column->onlyNull()) { const auto & first_arg = arguments[0]; - if (!isUnsignedInteger(first_arg.type)) - throw Exception( - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Illegal type {} of the first argument of function {}, expected unsigned integer", - first_arg.type->getName(), - getName()); - number_of_columns = first_arg.column->getUInt(0); if (number_of_columns > MAX_NUMBER_OF_COLUMNS) throw Exception( @@ -236,14 +229,6 @@ public: if (arguments.size() > 1 && !arguments[1].column->onlyNull()) { const auto & second_arg = arguments[1]; - - if (!isUnsignedInteger(second_arg.type)) - throw Exception( - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Illegal type {} of the second argument of function {}, expected unsigned integer", - second_arg.type->getName(), - getName()); - seed = second_arg.column->getUInt(0); } @@ -251,14 +236,6 @@ public: if (arguments.size() > 2) { const auto & third_arg = arguments[2]; - - if (!isUInt8(third_arg.type)) - throw Exception( - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Illegal type {} of the second argument of function {}, expected UInt8", - third_arg.type->getName(), - getName()); - allow_big_numbers = third_arg.column->getBool(0); } @@ -266,14 +243,6 @@ public: if (arguments.size() > 3) { const auto & fourth_arg = arguments[3]; - - if (!isUInt8(fourth_arg.type)) - throw Exception( - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Illegal type {} of the fourth argument of function {}, expected UInt8", - fourth_arg.type->getName(), - getName()); - allow_enums = fourth_arg.column->getBool(0); } From 746d12e7ccf4780d6887aa3f961605b7f313b77b Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 9 Mar 2023 20:19:28 +0000 Subject: [PATCH 0005/1072] Remove wrong tests --- .../02584_compressor_codecs.reference | 14 -------- .../0_stateless/02584_compressor_codecs.sh | 34 ------------------- 2 files changed, 48 deletions(-) delete mode 100644 tests/queries/0_stateless/02584_compressor_codecs.reference delete mode 100755 tests/queries/0_stateless/02584_compressor_codecs.sh diff --git a/tests/queries/0_stateless/02584_compressor_codecs.reference b/tests/queries/0_stateless/02584_compressor_codecs.reference deleted file mode 100644 index 23751ef6c1f..00000000000 --- a/tests/queries/0_stateless/02584_compressor_codecs.reference +++ /dev/null @@ -1,14 +0,0 @@ -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 diff --git a/tests/queries/0_stateless/02584_compressor_codecs.sh b/tests/queries/0_stateless/02584_compressor_codecs.sh deleted file mode 100755 index 930d101466b..00000000000 --- a/tests/queries/0_stateless/02584_compressor_codecs.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/env bash - -CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) -# shellcheck source=../shell_config.sh -. "$CURDIR"/../shell_config.sh - -echo "Hello, World!" > 02584_test_data - -$CLICKHOUSE_COMPRESSOR --codec 'Delta' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out' 2>&1 | grep -c "BAD_ARGUMENTS"; -$CLICKHOUSE_COMPRESSOR --codec 'Delta(5)' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out' 2>&1 | grep -c "ILLEGAL_CODEC_PARAMETER"; -$CLICKHOUSE_COMPRESSOR --codec 'Delta([1,2])' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out' 2>&1 | grep -c "ILLEGAL_CODEC_PARAMETER"; -$CLICKHOUSE_COMPRESSOR --codec 'Delta(4)' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out'; - -$CLICKHOUSE_COMPRESSOR --codec 'DoubleDelta' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out' 2>&1 | grep -c "BAD_ARGUMENTS"; -$CLICKHOUSE_COMPRESSOR --codec 'DoubleDelta(5)' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out' 2>&1 | grep -c "ILLEGAL_CODEC_PARAMETER"; -$CLICKHOUSE_COMPRESSOR --codec 'DoubleDelta([1,2])' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out' 2>&1 | grep -c "ILLEGAL_CODEC_PARAMETER"; -$CLICKHOUSE_COMPRESSOR --codec 'DoubleDelta(4)' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out'; - -$CLICKHOUSE_COMPRESSOR --codec 'Gorilla' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out' 2>&1 | grep -c "BAD_ARGUMENTS"; -$CLICKHOUSE_COMPRESSOR --codec 'Gorilla(5)' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out' 2>&1 | grep -c "ILLEGAL_CODEC_PARAMETER"; -$CLICKHOUSE_COMPRESSOR --codec 'Gorilla([1,2])' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out' 2>&1 | grep -c "ILLEGAL_CODEC_PARAMETER"; -$CLICKHOUSE_COMPRESSOR --codec 'Gorilla(4)' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out'; - -$CLICKHOUSE_COMPRESSOR --codec 'FPC' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out' 2>&1 | grep -c "BAD_ARGUMENTS"; -$CLICKHOUSE_COMPRESSOR --codec 'FPC(5)' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out' 2>&1 | grep -c "BAD_ARGUMENTS"; -$CLICKHOUSE_COMPRESSOR --codec 'FPC(5, 1)' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out' 2>&1 | grep -c "ILLEGAL_CODEC_PARAMETER"; -$CLICKHOUSE_COMPRESSOR --codec 'FPC([1,2,3])' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out' 2>&1 | grep -c "ILLEGAL_CODEC_PARAMETER"; -$CLICKHOUSE_COMPRESSOR --codec 'FPC(5, 4)' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out'; - - -$CLICKHOUSE_COMPRESSOR --codec 'T64' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out' 2>&1 | grep -c "ILLEGAL_SYNTAX_FOR_CODEC_TYPE"; - -rm 02584_test_data 02584_test_out - From 48d701f0e70a10807d64eb041dcdc17562cf6731 Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 9 Mar 2023 20:27:14 +0000 Subject: [PATCH 0006/1072] Better docs --- src/Functions/generateRandomStructure.cpp | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/Functions/generateRandomStructure.cpp b/src/Functions/generateRandomStructure.cpp index 3b42fd99fb4..0fb9ef0f2f3 100644 --- a/src/Functions/generateRandomStructure.cpp +++ b/src/Functions/generateRandomStructure.cpp @@ -422,13 +422,19 @@ REGISTER_FUNCTION(GenerateRandomStructure) { R"( Generates a random table structure. -This function takes an optional constant argument, the number of column in the result structure. -If argument is now specified, the number of columns is random. The maximum number of columns is 1024. +This function takes 4 optional constant arguments: the number of column in the result structure (random by default), +random seed (random by default), flag that indicates if big number types can be used (true by default), +flag that indicates if enum types can be used (true by default). +The maximum number of columns is 128. The function returns a value of type String. )", Documentation::Examples{ {"random", "SELECT generateRandomStructure()"}, - {"with specified number of arguments", "SELECT generateRandomStructure(10)"}}, + {"with specified number of arguments", "SELECT generateRandomStructure(10)"}, + {"with specified seed", "SELECT generateRandomStructure(10, 42)"}, + {"without big number types", "SELECT generateRandomStructure(10, NULL, false)"}, + {"without enum types", "SELECT generateRandomStructure(10, NULL, false, false)"}, + }, Documentation::Categories{"Random"} }, FunctionFactory::CaseSensitive); From f9d9b1ee2379b608a3f13bb6e0087e2e4b292ba4 Mon Sep 17 00:00:00 2001 From: avogar Date: Fri, 10 Mar 2023 16:16:28 +0000 Subject: [PATCH 0007/1072] Add more options --- .../functions/other-functions.md | 19 +- src/Functions/generateRandomStructure.cpp | 262 +++++++++++------- .../02586_generate_random_structure.reference | 14 +- .../02586_generate_random_structure.sql | 16 +- 4 files changed, 201 insertions(+), 110 deletions(-) diff --git a/docs/en/sql-reference/functions/other-functions.md b/docs/en/sql-reference/functions/other-functions.md index 5f6b6e5687d..b4664f75e67 100644 --- a/docs/en/sql-reference/functions/other-functions.md +++ b/docs/en/sql-reference/functions/other-functions.md @@ -2617,7 +2617,7 @@ Generates random table structure in a format `column1_name column1_type, column2 **Syntax** ``` sql -generateRandomStructure([number_of_columns, seed, allow_big_numbers, allow_enums]) +generateRandomStructure([number_of_columns, seed, allow_big_numbers, allow_enums, allow_decimals, allow_ip, allow_only_string_map_keys]) ``` **Arguments** @@ -2625,7 +2625,10 @@ generateRandomStructure([number_of_columns, seed, allow_big_numbers, allow_enums - `number_of_columns` — The desired number of columns in the result table structure. If set to 0 or `Null`, the number of columns will be random from 1 to 128. Default value: `Null`. - `seed` - Random seed to produce stable results. If seed is not specified or set to `Null`, it is randomly generated. - `allow_big_numbers` - Indicates if big number types (`Int128/UInt128/Int256/UInt256/Decimal128/Decinal256`) can be generated. Default value: true. -- `allow_enums` - Indicates if enum types can be generated. Default - true. +- `allow_enums` - Indicates if enum types (`Enum8/Enum16`) can be generated. Default - true. +- `allow_decimals` - Indicates if decimal types (`Decimal(P, S)`) can be generated. Default - true. +- `allow_ip` - Indicates if ip types (`IPv4/IPv6`) can be generated. Default - true. +- `allow_only_string_map_keys` - Indicates if Map key type can be only `String/FixedString`. Default - false. All arguments must be constant. @@ -2691,6 +2694,18 @@ Result: └──────────────────────────────────────────────────────────────────────────────────────────────────────┘ ``` +``` sql +SELECT generateRandomStructure(6, Null, false, false, false, false, true) +``` + +Result: + +``` text +┌─generateRandomStructure(6, NULL, false, false, false, false, true)─────────────────────────────────────────────────┐ +│ c1 String, c2 UInt32, c3 Int32, c4 Int32, c5 Tuple(LowCardinality(Nullable(FixedString(101))), UInt8), c6 DateTime │ +└────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ +``` + This function can be used together with [generateRandom](../../sql-reference/table-functions/generate.md) to generate completely random tables. diff --git a/src/Functions/generateRandomStructure.cpp b/src/Functions/generateRandomStructure.cpp index 0fb9ef0f2f3..e6766e731b2 100644 --- a/src/Functions/generateRandomStructure.cpp +++ b/src/Functions/generateRandomStructure.cpp @@ -62,8 +62,8 @@ private: Map, Nested, }; - - static constexpr std::array simple_types + + static constexpr std::array simple_types { Type::Int8, Type::UInt8, @@ -76,24 +76,30 @@ private: Type::UInt64, Type::Float32, Type::Float64, - Type::DateTime64, - Type::Decimal32, - Type::Decimal64, Type::Date, Type::Date32, Type::DateTime, Type::String, Type::FixedString, - Type::IPv4, - Type::IPv6, }; - static constexpr std::array big_number_types + static constexpr std::array big_integer_types { Type::Int128, Type::UInt128, Type::Int256, Type::UInt256, + }; + + static constexpr std::array decimal_types + { + Type::DateTime64, + Type::Decimal32, + Type::Decimal64, + }; + + static constexpr std::array big_decimal_types + { Type::Decimal128, Type::Decimal256, }; @@ -104,6 +110,12 @@ private: Type::Enum16, }; + static constexpr std::array ip_types + { + Type::IPv4, + Type::IPv6, + }; + static constexpr std::array complex_types { Type::Nullable, @@ -132,6 +144,12 @@ private: Type::FixedString, }; + static constexpr std::array map_key_string_types + { + Type::String, + Type::FixedString + }; + static constexpr size_t MAX_NUMBER_OF_COLUMNS = 128; static constexpr size_t MAX_TUPLE_ELEMENTS = 16; static constexpr size_t MAX_DATETIME64_PRECISION = 9; @@ -157,53 +175,48 @@ public: bool isDeterministic() const override { return false; } bool isDeterministicInScopeOfQuery() const override { return false; } bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; } - ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {0, 1, 2, 3}; } + ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {0, 1, 2, 3, 4, 5, 6}; } bool useDefaultImplementationForConstants() const override { return false; } bool useDefaultImplementationForNulls() const override { return false; } - DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override { - if (arguments.size() > 4) + if (arguments.size() > 7) throw Exception( ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, - "Number of arguments for function {} doesn't match: passed {}, expected from 0 to 4", + "Number of arguments for function {} doesn't match: passed {}, expected from 0 to 7", getName(), arguments.size()); - if (!arguments.empty() && !isUnsignedInteger(arguments[0]) && !arguments[0]->onlyNull()) + for (size_t i = 0; i != 2; ++i) { - throw Exception( - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Illegal type {} of the first argument of function {}, expected unsigned integer or Null", - arguments[0]->getName(), - getName()); + if (arguments.size() == i) + break; + + if (!isUnsignedInteger(arguments[i]) && !arguments[i]->onlyNull()) + { + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type {} of the {} argument of function {}, expected unsigned integer or Null", + i + 1, + arguments[i]->getName(), + getName()); + } } - if (arguments.size() > 1 && !isUnsignedInteger(arguments[1]) && !arguments[1]->onlyNull()) + for (size_t i = 2; i != 7; ++i) { - throw Exception( - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Illegal type {} of the second argument of function {}, expected unsigned integer or Null", - arguments[1]->getName(), - getName()); - } + if (arguments.size() <= i) + break; - if (arguments.size() > 2 && !isUInt8(arguments[2])) - { - throw Exception( - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Illegal type {} of the third argument of function {}, expected UInt8", - arguments[2]->getName(), - getName()); - } - - if (arguments.size() > 3 && !isUInt8(arguments[3])) - { - throw Exception( - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Illegal type {} of the fourth argument of function {}, expected UInt8", - arguments[3]->getName(), - getName()); + if (!isUInt8(arguments[i])) + { + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type {} of the {} argument of function {}, expected UInt8", + i + 1, + arguments[i]->getName(), + getName()); + } } return std::make_shared(); @@ -216,8 +229,7 @@ public: if (!arguments.empty() && !arguments[0].column->onlyNull()) { - const auto & first_arg = arguments[0]; - number_of_columns = first_arg.column->getUInt(0); + number_of_columns = arguments[0].column->getUInt(0); if (number_of_columns > MAX_NUMBER_OF_COLUMNS) throw Exception( ErrorCodes::BAD_ARGUMENTS, @@ -227,36 +239,39 @@ public: } if (arguments.size() > 1 && !arguments[1].column->onlyNull()) - { - const auto & second_arg = arguments[1]; - seed = second_arg.column->getUInt(0); - } + seed = arguments[1].column->getUInt(0); bool allow_big_numbers = true; if (arguments.size() > 2) - { - const auto & third_arg = arguments[2]; - allow_big_numbers = third_arg.column->getBool(0); - } + allow_big_numbers = arguments[2].column->getBool(0); bool allow_enums = true; if (arguments.size() > 3) - { - const auto & fourth_arg = arguments[3]; - allow_enums = fourth_arg.column->getBool(0); - } + allow_enums = arguments[3].column->getBool(0); + + bool allow_decimals = true; + if (arguments.size() > 4) + allow_decimals = arguments[4].column->getBool(0); + + bool allow_ip = true; + if (arguments.size() > 5) + allow_ip = arguments[5].column->getBool(0); + + bool only_string_map_key = false; + if (arguments.size() > 6) + only_string_map_key = arguments[6].column->getBool(0); pcg64 rng(seed); if (number_of_columns == 0) number_of_columns = generateNumberOfColumns(rng); auto col_res = ColumnString::create(); - String generated_structure = ""; + String generated_structure; for (size_t i = 0; i != number_of_columns; ++i) { if (i != 0) generated_structure += ", "; - auto type = generateRandomType(rng, allow_big_numbers, allow_enums); + auto type = generateRandomType(rng, allow_big_numbers, allow_enums, allow_decimals, allow_ip, only_string_map_key); generated_structure += "c" + std::to_string(i + 1) + " " + type; } col_res->insert(generated_structure); @@ -269,26 +284,37 @@ private: { return rng() % MAX_NUMBER_OF_COLUMNS + 1; } - - String generateRandomType(pcg64 & rng, bool allow_big_numbers, bool allow_enums) const + + /// Helper struct to call generateRandomTypeImpl with lots of bool template arguments without writing big if/else over all bool variables. + template + struct Dispatcher { - if (allow_big_numbers) + static auto call(const FunctionGenerateRandomStructure * f, pcg64 & rng) { - if (allow_enums) - return generateRandomTypeImpl(rng); - return generateRandomTypeImpl(rng); + return f->generateRandomTypeImpl(rng); } - if (allow_enums) - return generateRandomTypeImpl(rng); - return generateRandomTypeImpl(rng); - } - + template + static auto call(const FunctionGenerateRandomStructure * f, pcg64 & rng, bool b, Args1... ar1) + { + if (b) + return Dispatcher::call(f, rng, ar1...); + else + return Dispatcher::call(f, rng, ar1...); + } - template + friend FunctionGenerateRandomStructure; + }; + + String generateRandomType(pcg64 & rng, bool allow_big_numbers, bool allow_enums, bool allow_decimals, bool allow_ip, bool allow_only_string_map_keys) const + { + return Dispatcher<>::call(this, rng, allow_big_numbers, allow_enums, allow_decimals, allow_ip, allow_only_string_map_keys, true); + } + + template String generateRandomTypeImpl(pcg64 & rng, size_t depth = 0) const { - constexpr auto all_types = getAllTypes(); + constexpr auto all_types = getAllTypes(); auto type = all_types[rng() % all_types.size()]; switch (type) @@ -312,11 +338,21 @@ private: case Type::LowCardinality: return "LowCardinality(" + generateLowCardinalityNestedType(rng) + ")"; case Type::Nullable: - return "Nullable(" + generateRandomTypeImpl(rng, depth + 1) + ")"; + { + auto nested_type = generateRandomTypeImpl(rng, depth + 1); + return "Nullable(" + nested_type + ")"; + } case Type::Array: - return "Array(" + generateRandomTypeImpl(rng, depth + 1) + ")"; + { + auto nested_type = generateRandomTypeImpl(rng, depth + 1); + return "Array(" + nested_type + ")"; + } case Type::Map: - return "Map(" + generateMapKeyType(rng) + ", " + generateRandomTypeImpl(rng, depth + 1) + ")"; + { + auto key_type = generateMapKeyType(rng); + auto value_type = generateRandomTypeImpl(rng, depth + 1); + return "Map(" + key_type + ", " + value_type + ")"; + } case Type::Tuple: { size_t elements = rng() % MAX_TUPLE_ELEMENTS + 1; @@ -328,7 +364,7 @@ private: tuple_type += ", "; if (named_tuple) tuple_type += "e" + std::to_string(i + 1) + " "; - tuple_type += generateRandomTypeImpl(rng, depth + 1); + tuple_type += generateRandomTypeImpl(rng, depth + 1); } return tuple_type + ")"; } @@ -340,7 +376,8 @@ private: { if (i != 0) nested_type += ", "; - nested_type += "e" + std::to_string(i + 1) + " " + generateRandomTypeImpl(rng, depth + 1); + auto element_type = generateRandomTypeImpl(rng, depth + 1); + nested_type += "e" + std::to_string(i + 1) + " " + element_type; } return nested_type + ")"; } @@ -349,9 +386,15 @@ private: } } + template String generateMapKeyType(pcg64 & rng) const { - auto type = map_key_types[rng() % map_key_types.size()]; + Type type; + if constexpr (allow_only_string_map_keys) + type = map_key_string_types[rng() % map_key_string_types.size()]; + else + type = map_key_types[rng() % map_key_types.size()]; + if (type == Type::FixedString) return "FixedString(" + std::to_string(rng() % MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS + 1) + ")"; return String(magic_enum::enum_name(type)); @@ -384,33 +427,42 @@ private: return result; } - template + template static constexpr auto getAllTypes() { - constexpr size_t result_size = simple_types.size() + big_number_types.size() * allow_big_numbers + enum_types.size() * allow_enums + complex_types.size() * allow_complex_types; + constexpr size_t big_integer_types_size = big_integer_types.size() * allow_big_numbers; + constexpr size_t enum_types_size = enum_types.size() * allow_enums; + constexpr size_t decimal_types_size = decimal_types.size() * allow_decimals; + constexpr size_t big_decimal_types_size = big_decimal_types.size() * allow_big_numbers * allow_decimals; + constexpr size_t ip_types_size = ip_types.size() * allow_ip; + constexpr size_t complex_types_size = complex_types.size() * allow_complex_types; + + constexpr size_t result_size = simple_types.size() + big_integer_types_size + enum_types_size + decimal_types_size + + big_decimal_types_size + ip_types_size + complex_types_size; std::array result; size_t index = 0; + for (size_t i = 0; i != simple_types.size(); ++i, ++index) result[index] = simple_types[i]; - - if constexpr (allow_big_numbers) - { - for (size_t i = 0; i != big_number_types.size(); ++i, ++index) - result[index] = big_number_types[i]; - } - - if constexpr (allow_enums) - { - for (size_t i = 0; i != enum_types.size(); ++i, ++index) - result[index] = enum_types[i]; - } - - if constexpr (allow_complex_types) - { - for (size_t i = 0; i != complex_types.size(); ++i, ++index) - result[index] = complex_types[i]; - } - + + for (size_t i = 0; i != big_integer_types_size; ++i, ++index) + result[index] = big_integer_types[i]; + + for (size_t i = 0; i != enum_types_size; ++i, ++index) + result[index] = enum_types[i]; + + for (size_t i = 0; i != decimal_types_size; ++i, ++index) + result[index] = decimal_types[i]; + + for (size_t i = 0; i != big_decimal_types_size; ++i, ++index) + result[index] = big_decimal_types[i]; + + for (size_t i = 0; i != ip_types_size; ++i, ++index) + result[index] = ip_types[i]; + + for (size_t i = 0; i != complex_types_size; ++i, ++index) + result[index] = complex_types[i]; + return result; } }; @@ -422,9 +474,14 @@ REGISTER_FUNCTION(GenerateRandomStructure) { R"( Generates a random table structure. -This function takes 4 optional constant arguments: the number of column in the result structure (random by default), -random seed (random by default), flag that indicates if big number types can be used (true by default), -flag that indicates if enum types can be used (true by default). +This function takes 4 optional constant arguments: +1) the number of column in the result structure (random by default) +2) random seed (random by default) +3) flag that indicates if big number types can be used (true by default) +4) flag that indicates if enum types can be used (true by default) +5) flag that indicates if decimal types can be used (true by default) +6) flag that indicates if ip types (IPv4, IPv6) can be used (true by default) +7) flag that indicates if map keys should be only String or FixedString (false by default) The maximum number of columns is 128. The function returns a value of type String. )", @@ -433,7 +490,10 @@ The function returns a value of type String. {"with specified number of arguments", "SELECT generateRandomStructure(10)"}, {"with specified seed", "SELECT generateRandomStructure(10, 42)"}, {"without big number types", "SELECT generateRandomStructure(10, NULL, false)"}, - {"without enum types", "SELECT generateRandomStructure(10, NULL, false, false)"}, + {"without enum types", "SELECT generateRandomStructure(10, NULL, true, false)"}, + {"without decimal types", "SELECT generateRandomStructure(10, NULL, true, true, false)"}, + {"without ip types", "SELECT generateRandomStructure(10, NULL, true, true, true, false)"}, + {"with only string mak key types", "SELECT generateRandomStructure(10, NULL, true, true, true, true, true)"}, }, Documentation::Categories{"Random"} }, diff --git a/tests/queries/0_stateless/02586_generate_random_structure.reference b/tests/queries/0_stateless/02586_generate_random_structure.reference index 181b7f0ec6f..76d89828071 100644 --- a/tests/queries/0_stateless/02586_generate_random_structure.reference +++ b/tests/queries/0_stateless/02586_generate_random_structure.reference @@ -1,7 +1,11 @@ -c1 DateTime64(1), c2 Int16, c3 Map(Int64, Array(Bool)), c4 Decimal256(30), c5 Int128 -c1 Date, c2 Float64, c3 DateTime, c4 Map(Int64, DateTime64(4)), c5 Nested(e1 LowCardinality(String), e2 UInt32, e3 Enum16(\'v0\' = 0, \'v1\' = 1, \'v2\' = 2, \'v3\' = 3, \'v4\' = 4, \'v5\' = 5, \'v6\' = 6, \'v7\' = 7), e4 Float32, e5 Date32, e6 Int64, e7 Decimal64(0), e8 UInt16, e9 Date32, e10 Int64, e11 DateTime64(0)) -c1 Nested(e1 Int64, e2 Int16, e3 Map(Int16, LowCardinality(Nullable(String))), e4 UInt8, e5 Nested(e1 Array(Nullable(Decimal64(12))), e2 DateTime64(1), e3 UInt64, e4 FixedString(61), e5 Decimal64(13), e6 UInt8), e6 Int8), c2 DateTime64(5), c3 IPv4, c4 String, c5 String -c1 DateTime64(1), c2 IPv4, c3 Nullable(Decimal128(37)), c4 UInt128, c5 Date +c1 Date, c2 Bool, c3 Int16, c4 Map(Int64, Array(Bool)), c5 Decimal256(30) +c1 String, c2 Float64, c3 Enum8(\'v0\' = 0, \'v1\' = 1, \'v2\' = 2, \'v3\' = 3, \'v4\' = 4), c4 UInt64, c5 Date +c1 Nested(e1 Int64, e2 Int16, e3 Map(Int16, LowCardinality(Nullable(String))), e4 UInt8, e5 Nested(e1 Array(Nullable(DateTime)), e2 Nullable(Bool), e3 UInt8, e4 UInt64, e5 Decimal64(6), e6 DateTime), e6 LowCardinality(Nullable(String))), c2 Date, c3 Int32, c4 IPv4, c5 Decimal32(8) +c1 Date, c2 UInt16, c3 UInt256, c4 Nullable(IPv4), c5 Nullable(Decimal64(17)) +c1 Array(Int64), c2 Map(String, LowCardinality(String)), c3 Date, c4 Map(Int64, UInt128), c5 UInt8 +c1 Date, c2 UInt16, c3 UInt256, c4 Nullable(Decimal128(37)), c5 DateTime64(8) +c1 Date, c2 Bool, c3 Int16, c4 Map(FixedString(120), Bool), c5 Decimal256(30) +c1 String, c2 Float64, c3 Enum8(\'v0\' = 0, \'v1\' = 1, \'v2\' = 2, \'v3\' = 3, \'v4\' = 4), c4 UInt64, c5 Date String Const(String) -2106-02-02 121 17265 +1977-07-28 true 5389 diff --git a/tests/queries/0_stateless/02586_generate_random_structure.sql b/tests/queries/0_stateless/02586_generate_random_structure.sql index b524f6a5ff1..061fbc24219 100644 --- a/tests/queries/0_stateless/02586_generate_random_structure.sql +++ b/tests/queries/0_stateless/02586_generate_random_structure.sql @@ -2,16 +2,28 @@ select generateRandomStructure(5, 42); select generateRandomStructure(5, 42, false); select generateRandomStructure(5, 42, false, false); select generateRandomStructure(5, 42, true, false); +select generateRandomStructure(5, 42, true, true, false); +select generateRandomStructure(5, 42, true, true, true, false); +select generateRandomStructure(5, 42, true, true, true, true, true); +select generateRandomStructure(5, 42, false, true, true); select toTypeName(generateRandomStructure(5, 42)); select toColumnTypeName(generateRandomStructure(5, 42)); -SELECT * FROM generateRandom(generateRandomStructure(3, 24), 24) LIMIT 1; +SELECT * FROM generateRandom(generateRandomStructure(3, 42), 42) LIMIT 1; -select generateRandomStructure(5, 42, false, false, 42); -- {serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH} +select generateRandomStructure(5, 42, false, false, false, false, true, 42); -- {serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH} select generateRandomStructure('5'); -- {serverError ILLEGAL_TYPE_OF_ARGUMENT} select generateRandomStructure(5, '42'); -- {serverError ILLEGAL_TYPE_OF_ARGUMENT} select generateRandomStructure(5, 42, 'false'); -- {serverError ILLEGAL_TYPE_OF_ARGUMENT} select generateRandomStructure(5, 42, false, 'false'); -- {serverError ILLEGAL_TYPE_OF_ARGUMENT} +select generateRandomStructure(5, 42, false, false, 'false'); -- {serverError ILLEGAL_TYPE_OF_ARGUMENT} +select generateRandomStructure(5, 42, false, false, false, 'false'); -- {serverError ILLEGAL_TYPE_OF_ARGUMENT} +select generateRandomStructure(5, 42, false, false, false, false, 'false'); -- {serverError ILLEGAL_TYPE_OF_ARGUMENT} select generateRandomStructure(materialize(5), 42, false, false); -- {serverError ILLEGAL_COLUMN} select generateRandomStructure(5, materialize(42), false, false); -- {serverError ILLEGAL_COLUMN} select generateRandomStructure(5, 42, materialize(false), false); -- {serverError ILLEGAL_COLUMN} select generateRandomStructure(5, 42, false, materialize(false)); -- {serverError ILLEGAL_COLUMN} +select generateRandomStructure(5, 42, false, false, materialize(false)); -- {serverError ILLEGAL_COLUMN} +select generateRandomStructure(5, 42, false, false, false, materialize(false)); -- {serverError ILLEGAL_COLUMN} +select generateRandomStructure(5, 42, false, false, false, materialize(false)); -- {serverError ILLEGAL_COLUMN} +select generateRandomStructure(5, 42, false, false, false, false, materialize(false)); -- {serverError ILLEGAL_COLUMN} + From c93202cca4cec2e83c51cb6b3cb56dc820965caa Mon Sep 17 00:00:00 2001 From: pufit Date: Thu, 9 Mar 2023 21:23:57 -0500 Subject: [PATCH 0008/1072] Keeper Client MVP --- programs/CMakeLists.txt | 21 +++ programs/config_tools.h.in | 1 + programs/keeper-client/CMakeLists.txt | 9 + programs/keeper-client/KeeperClient.cpp | 224 ++++++++++++++++++++++++ programs/keeper-client/KeeperClient.h | 44 +++++ programs/main.cpp | 6 + src/Client/ClientBase.cpp | 8 - src/Client/ClientBase.h | 8 + 8 files changed, 313 insertions(+), 8 deletions(-) create mode 100644 programs/keeper-client/CMakeLists.txt create mode 100644 programs/keeper-client/KeeperClient.cpp create mode 100644 programs/keeper-client/KeeperClient.h diff --git a/programs/CMakeLists.txt b/programs/CMakeLists.txt index 47017a94cb5..c00d1f5349f 100644 --- a/programs/CMakeLists.txt +++ b/programs/CMakeLists.txt @@ -55,6 +55,8 @@ option (ENABLE_CLICKHOUSE_KEEPER "ClickHouse alternative to ZooKeeper" ${ENABLE_ option (ENABLE_CLICKHOUSE_KEEPER_CONVERTER "Util allows to convert ZooKeeper logs and snapshots into clickhouse-keeper snapshot" ${ENABLE_CLICKHOUSE_ALL}) +option (ENABLE_CLICKHOUSE_KEEPER_CLIENT "ClickHouse Keeper Client" ${ENABLE_CLICKHOUSE_ALL}) + option (ENABLE_CLICKHOUSE_SU "A tool similar to 'su'" ${ENABLE_CLICKHOUSE_ALL}) option (ENABLE_CLICKHOUSE_DISKS "A tool to manage disks" ${ENABLE_CLICKHOUSE_ALL}) @@ -169,6 +171,13 @@ else() message(STATUS "ClickHouse keeper-converter mode: OFF") endif() +if (ENABLE_CLICKHOUSE_KEEPER_CLIENT) + message(STATUS "ClickHouse keeper-client mode: ON") +else() + message(STATUS "ClickHouse keeper-client mode: OFF") +endif() + + if (ENABLE_CLICKHOUSE_DISKS) message(STATUS "Clickhouse disks mode: ON") else() @@ -237,6 +246,10 @@ if (ENABLE_CLICKHOUSE_KEEPER_CONVERTER) add_subdirectory (keeper-converter) endif() +if (ENABLE_CLICKHOUSE_KEEPER_CLIENT) + add_subdirectory (keeper-client) +endif() + if (ENABLE_CLICKHOUSE_ODBC_BRIDGE) add_subdirectory (odbc-bridge) endif () @@ -301,6 +314,9 @@ endif() if (ENABLE_CLICKHOUSE_KEEPER_CONVERTER) clickhouse_target_link_split_lib(clickhouse keeper-converter) endif() +if (ENABLE_CLICKHOUSE_KEEPER_CLIENT) + clickhouse_target_link_split_lib(clickhouse keeper-client) +endif() if (ENABLE_CLICKHOUSE_INSTALL) clickhouse_target_link_split_lib(clickhouse install) endif () @@ -392,6 +408,11 @@ if (ENABLE_CLICKHOUSE_KEEPER_CONVERTER) install (FILES "${CMAKE_CURRENT_BINARY_DIR}/clickhouse-keeper-converter" DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse) list(APPEND CLICKHOUSE_BUNDLE clickhouse-keeper-converter) endif () +if (ENABLE_CLICKHOUSE_KEEPER_CLIENT) + add_custom_target (clickhouse-keeper-client ALL COMMAND ${CMAKE_COMMAND} -E create_symlink clickhouse clickhouse-keeper-client DEPENDS clickhouse) + install (FILES "${CMAKE_CURRENT_BINARY_DIR}/clickhouse-keeper-client" DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse) + list(APPEND CLICKHOUSE_BUNDLE clickhouse-keeper-client) +endif () if (ENABLE_CLICKHOUSE_DISKS) add_custom_target (clickhouse-disks ALL COMMAND ${CMAKE_COMMAND} -E create_symlink clickhouse clickhouse-disks DEPENDS clickhouse) install (FILES "${CMAKE_CURRENT_BINARY_DIR}/clickhouse-disks" DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse) diff --git a/programs/config_tools.h.in b/programs/config_tools.h.in index 30444e8c84e..65ef3ca762b 100644 --- a/programs/config_tools.h.in +++ b/programs/config_tools.h.in @@ -17,6 +17,7 @@ #cmakedefine01 ENABLE_CLICKHOUSE_ODBC_BRIDGE #cmakedefine01 ENABLE_CLICKHOUSE_LIBRARY_BRIDGE #cmakedefine01 ENABLE_CLICKHOUSE_KEEPER +#cmakedefine01 ENABLE_CLICKHOUSE_KEEPER_CLIENT #cmakedefine01 ENABLE_CLICKHOUSE_KEEPER_CONVERTER #cmakedefine01 ENABLE_CLICKHOUSE_STATIC_FILES_DISK_UPLOADER #cmakedefine01 ENABLE_CLICKHOUSE_SU diff --git a/programs/keeper-client/CMakeLists.txt b/programs/keeper-client/CMakeLists.txt new file mode 100644 index 00000000000..06055d6d820 --- /dev/null +++ b/programs/keeper-client/CMakeLists.txt @@ -0,0 +1,9 @@ +set (CLICKHOUSE_KEEPER_CLIENT_SOURCES KeeperClient.cpp) + +set (CLICKHOUSE_KEEPER_CLIENT_LINK + PRIVATE + boost::program_options + dbms +) + +clickhouse_program_add(keeper-client) diff --git a/programs/keeper-client/KeeperClient.cpp b/programs/keeper-client/KeeperClient.cpp new file mode 100644 index 00000000000..05011f064fb --- /dev/null +++ b/programs/keeper-client/KeeperClient.cpp @@ -0,0 +1,224 @@ +#include "KeeperClient.h" +#include +#include +#include +#include +#include +#include + + +namespace po = boost::program_options; +namespace fs = std::filesystem; + +namespace DB +{ + +String KeeperClient::getAbsolutePath(const String & relative) +{ + String result; + if (relative.starts_with('/')) + result = fs::weakly_canonical(relative); + else + result = fs::weakly_canonical(cwd / relative); + + if (result.ends_with('/') && result.size() > 1) + result.pop_back(); + + return result; +} + +void KeeperClient::loadCommands(std::vector> && new_commands) +{ + for (auto & [name, args_count, callback] : new_commands) { + commands.insert({{name, args_count}, callback}); + suggest.addWords({name}); + } +} + +void KeeperClient::defineOptions(Poco::Util::OptionSet & options) +{ + Poco::Util::Application::defineOptions(options); + + options.addOption( + Poco::Util::Option("help", "h", "show help and exit") + .binding("help")); + + options.addOption( + Poco::Util::Option("connection-timeout", "", "set connection timeout in seconds. default 10s.") + .argument("connection-timeout") + .binding("connection-timeout")); + + options.addOption( + Poco::Util::Option("session-timeout", "", "set session timeout in seconds. default 10s.") + .argument("session-timeout") + .binding("session-timeout")); + + options.addOption( + Poco::Util::Option("operation-timeout", "", "set operation timeout in seconds. default 10s.") + .argument("operation-timeout") + .binding("operation-timeout")); + + options.addOption( + Poco::Util::Option("history-file", "", "set path of history file. default `~/.keeper-client-history`") + .argument("history-file") + .binding("history-file")); +} + +void KeeperClient::initialize(Poco::Util::Application & /* self */) +{ + loadCommands({ + {"set", 2, [](KeeperClient * client, const std::vector & args) { + client->zookeeper->set(client->getAbsolutePath(args[1]), args[2]); + }}, + + {"create", 2, [](KeeperClient * client, const std::vector & args) { + client->zookeeper->create(client->getAbsolutePath(args[1]), args[2], zkutil::CreateMode::Persistent); + }}, + + {"get", 1, [](KeeperClient * client, const std::vector & args) { + std::cout << client->zookeeper->get(client->getAbsolutePath(args[1])) << "\n"; + }}, + + {"ls", 0, [](KeeperClient * client, const std::vector & /* args */) { + auto children = client->zookeeper->getChildren(client->cwd); + for (auto & child : children) + std::cout << child << " "; + std::cout << "\n"; + }}, + + {"ls", 1, [](KeeperClient * client, const std::vector & args) { + auto children = client->zookeeper->getChildren(client->getAbsolutePath(args[1])); + for (auto & child : children) + std::cout << child << " "; + std::cout << "\n"; + }}, + + {"cd", 0, [](KeeperClient * /* client */, const std::vector & /* args */) { + }}, + + {"cd", 1, [](KeeperClient * client, const std::vector & args) { + auto new_path = client->getAbsolutePath(args[1]); + if (!client->zookeeper->exists(new_path)) + std::cerr << "Path " << new_path << " does not exists\n"; + else + client->cwd = new_path; + }}, + + {"rm", 1, [](KeeperClient * client, const std::vector & args) { + client->zookeeper->remove(client->getAbsolutePath(args[1])); + }}, + + {"rmr", 1, [](KeeperClient * client, const std::vector & args) { + client->zookeeper->removeRecursive(client->getAbsolutePath(args[1])); + }}, + }); + + String home_path; + const char * home_path_cstr = getenv("HOME"); // NOLINT(concurrency-mt-unsafe) + if (home_path_cstr) + home_path = home_path_cstr; + + if (config().has("history-file")) + history_file = config().getString("history-file"); + else + history_file = home_path + "/.keeper-client-history"; + + if (!history_file.empty() && !fs::exists(history_file)) + { + try + { + FS::createFile(history_file); + } + catch (const ErrnoException & e) + { + if (e.getErrno() != EEXIST) + throw; + } + } + + EventNotifier::init(); +} + +bool KeeperClient::processQueryText(const String & text) +{ + if (exit_strings.find(text) != exit_strings.end()) + return false; + + std::vector tokens; + boost::algorithm::split(tokens, text, boost::is_any_of(" ")); + + try + { + auto callback = commands.find({tokens[0], tokens.size() - 1}); + if (callback == commands.end()) + std::cerr << "No command found with name " << tokens[0] << " and args count " << tokens.size() - 1 << "\n"; + else + callback->second(this, tokens); + } + catch (Coordination::Exception & err) + { + std::cerr << err.message() << "\n"; + } + return true; +} + +void KeeperClient::runInteractive() +{ + + LineReader::Patterns query_extenders = {"\\"}; + LineReader::Patterns query_delimiters = {}; + + ReplxxLineReader lr(suggest, history_file, false, query_extenders, query_delimiters, {}); + lr.enableBracketedPaste(); + + while (true) + { + auto input = lr.readLine( cwd.string() + " :) ", ":-] "); + if (input.empty()) + break; + + if (!processQueryText(input)) + break; + } +} + +int KeeperClient::main(const std::vector & args) +{ + zkutil::ZooKeeperArgs zk_args(args[0]); + zk_args.connection_timeout_ms = config().getInt("connection-timeout", 10) * 1000; + zk_args.session_timeout_ms = config().getInt("session-timeout", 10) * 1000; + zk_args.operation_timeout_ms = config().getInt("operation-timeout", 10) * 1000; + zookeeper = std::make_unique(zk_args); + + runInteractive(); + + return 0; +} + +} + + +int mainEntryClickHouseKeeperClient(int argc, char ** argv) +{ + try + { + DB::KeeperClient client; + client.init(argc, argv); + return client.run(); + } + catch (const DB::Exception & e) + { + std::cerr << DB::getExceptionMessage(e, false) << std::endl; + return 1; + } + catch (const boost::program_options::error & e) + { + std::cerr << "Bad arguments: " << e.what() << std::endl; + return DB::ErrorCodes::BAD_ARGUMENTS; + } + catch (...) + { + std::cerr << DB::getCurrentExceptionMessage(true) << std::endl; + return 1; + } +} diff --git a/programs/keeper-client/KeeperClient.h b/programs/keeper-client/KeeperClient.h new file mode 100644 index 00000000000..8d96ade7659 --- /dev/null +++ b/programs/keeper-client/KeeperClient.h @@ -0,0 +1,44 @@ +#pragma once + + +#include +#include +#include +#include + + +namespace DB +{ + +class KeeperClient; + +class KeeperClient: public Poco::Util::Application +{ +public: + using Callback = std::function &)>; + + KeeperClient() = default; + + void initialize(Poco::Util::Application & self) override; + + int main(const std::vector & args) override; + + void defineOptions(Poco::Util::OptionSet & options) override; + +protected: + void runInteractive(); + void loadCommands(std::vector> &&); + bool processQueryText(const String & text); + + String getAbsolutePath(const String & relative); + + std::map, Callback> commands; + + String history_file; + LineReader::Suggest suggest; + + zkutil::ZooKeeperPtr zookeeper; + std::filesystem::path cwd = "/"; +}; + +} diff --git a/programs/main.cpp b/programs/main.cpp index 83e64b8c932..9a3ad47a86e 100644 --- a/programs/main.cpp +++ b/programs/main.cpp @@ -62,6 +62,9 @@ int mainEntryClickHouseKeeper(int argc, char ** argv); #if ENABLE_CLICKHOUSE_KEEPER_CONVERTER int mainEntryClickHouseKeeperConverter(int argc, char ** argv); #endif +#if ENABLE_CLICKHOUSE_KEEPER_CLIENT +int mainEntryClickHouseKeeperClient(int argc, char ** argv); +#endif #if ENABLE_CLICKHOUSE_STATIC_FILES_DISK_UPLOADER int mainEntryClickHouseStaticFilesDiskUploader(int argc, char ** argv); #endif @@ -133,6 +136,9 @@ std::pair clickhouse_applications[] = #if ENABLE_CLICKHOUSE_KEEPER_CONVERTER {"keeper-converter", mainEntryClickHouseKeeperConverter}, #endif +#if ENABLE_CLICKHOUSE_KEEPER_CLIENT + {"keeper-client", mainEntryClickHouseKeeperClient}, +#endif #if ENABLE_CLICKHOUSE_INSTALL {"install", mainEntryClickHouseInstall}, {"start", mainEntryClickHouseStart}, diff --git a/src/Client/ClientBase.cpp b/src/Client/ClientBase.cpp index 120d273aa62..c931fb426ec 100644 --- a/src/Client/ClientBase.cpp +++ b/src/Client/ClientBase.cpp @@ -86,14 +86,6 @@ namespace CurrentMetrics namespace DB { -static const NameSet exit_strings -{ - "exit", "quit", "logout", "учше", "йгше", "дщпщге", - "exit;", "quit;", "logout;", "учшеж", "йгшеж", "дщпщгеж", - "q", "й", "\\q", "\\Q", "\\й", "\\Й", ":q", "Жй" -}; - - namespace ErrorCodes { extern const int BAD_ARGUMENTS; diff --git a/src/Client/ClientBase.h b/src/Client/ClientBase.h index 5926f73f51a..566c5aefa04 100644 --- a/src/Client/ClientBase.h +++ b/src/Client/ClientBase.h @@ -24,6 +24,14 @@ namespace po = boost::program_options; namespace DB { + +static const NameSet exit_strings +{ + "exit", "quit", "logout", "учше", "йгше", "дщпщге", + "exit;", "quit;", "logout;", "учшеж", "йгшеж", "дщпщгеж", + "q", "й", "\\q", "\\Q", "\\й", "\\Й", ":q", "Жй" +}; + namespace ErrorCodes { extern const int NOT_IMPLEMENTED; From 9051ddb1743730f4504abdf2209c33a0b03de9b5 Mon Sep 17 00:00:00 2001 From: pufit Date: Thu, 9 Mar 2023 21:45:58 -0500 Subject: [PATCH 0009/1072] Style fix --- programs/keeper-client/KeeperClient.cpp | 37 +++++++++++++++++-------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/programs/keeper-client/KeeperClient.cpp b/programs/keeper-client/KeeperClient.cpp index 05011f064fb..0a4cdd4286b 100644 --- a/programs/keeper-client/KeeperClient.cpp +++ b/programs/keeper-client/KeeperClient.cpp @@ -13,6 +13,11 @@ namespace fs = std::filesystem; namespace DB { +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; +} + String KeeperClient::getAbsolutePath(const String & relative) { String result; @@ -29,7 +34,8 @@ String KeeperClient::getAbsolutePath(const String & relative) void KeeperClient::loadCommands(std::vector> && new_commands) { - for (auto & [name, args_count, callback] : new_commands) { + for (auto & [name, args_count, callback] : new_commands) + { commands.insert({{name, args_count}, callback}); suggest.addWords({name}); } @@ -67,36 +73,43 @@ void KeeperClient::defineOptions(Poco::Util::OptionSet & options) void KeeperClient::initialize(Poco::Util::Application & /* self */) { loadCommands({ - {"set", 2, [](KeeperClient * client, const std::vector & args) { + {"set", 2, [](KeeperClient * client, const std::vector & args) + { client->zookeeper->set(client->getAbsolutePath(args[1]), args[2]); }}, - {"create", 2, [](KeeperClient * client, const std::vector & args) { + {"create", 2, [](KeeperClient * client, const std::vector & args) + { client->zookeeper->create(client->getAbsolutePath(args[1]), args[2], zkutil::CreateMode::Persistent); }}, - {"get", 1, [](KeeperClient * client, const std::vector & args) { + {"get", 1, [](KeeperClient * client, const std::vector & args) + { std::cout << client->zookeeper->get(client->getAbsolutePath(args[1])) << "\n"; }}, - {"ls", 0, [](KeeperClient * client, const std::vector & /* args */) { + {"ls", 0, [](KeeperClient * client, const std::vector & /* args */) + { auto children = client->zookeeper->getChildren(client->cwd); for (auto & child : children) std::cout << child << " "; std::cout << "\n"; }}, - {"ls", 1, [](KeeperClient * client, const std::vector & args) { + {"ls", 1, [](KeeperClient * client, const std::vector & args) + { auto children = client->zookeeper->getChildren(client->getAbsolutePath(args[1])); for (auto & child : children) std::cout << child << " "; std::cout << "\n"; }}, - {"cd", 0, [](KeeperClient * /* client */, const std::vector & /* args */) { + {"cd", 0, [](KeeperClient * /* client */, const std::vector & /* args */) + { }}, - {"cd", 1, [](KeeperClient * client, const std::vector & args) { + {"cd", 1, [](KeeperClient * client, const std::vector & args) + { auto new_path = client->getAbsolutePath(args[1]); if (!client->zookeeper->exists(new_path)) std::cerr << "Path " << new_path << " does not exists\n"; @@ -104,11 +117,13 @@ void KeeperClient::initialize(Poco::Util::Application & /* self */) client->cwd = new_path; }}, - {"rm", 1, [](KeeperClient * client, const std::vector & args) { + {"rm", 1, [](KeeperClient * client, const std::vector & args) + { client->zookeeper->remove(client->getAbsolutePath(args[1])); }}, - {"rmr", 1, [](KeeperClient * client, const std::vector & args) { + {"rmr", 1, [](KeeperClient * client, const std::vector & args) + { client->zookeeper->removeRecursive(client->getAbsolutePath(args[1])); }}, }); @@ -173,7 +188,7 @@ void KeeperClient::runInteractive() while (true) { - auto input = lr.readLine( cwd.string() + " :) ", ":-] "); + auto input = lr.readLine(cwd.string() + " :) ", ":-] "); if (input.empty()) break; From dfea87d24888eb3d223ab5c020ff5a3cdd029409 Mon Sep 17 00:00:00 2001 From: pufit Date: Fri, 10 Mar 2023 23:12:16 -0500 Subject: [PATCH 0010/1072] Added confirmation for rmr operation. Implemented support for four-letter-word commands. --- programs/keeper-client/KeeperClient.cpp | 73 ++++++++++++++++++++++--- programs/keeper-client/KeeperClient.h | 13 ++++- 2 files changed, 78 insertions(+), 8 deletions(-) diff --git a/programs/keeper-client/KeeperClient.cpp b/programs/keeper-client/KeeperClient.cpp index 0a4cdd4286b..752c44bd8f4 100644 --- a/programs/keeper-client/KeeperClient.cpp +++ b/programs/keeper-client/KeeperClient.cpp @@ -13,11 +13,47 @@ namespace fs = std::filesystem; namespace DB { +static const NameSet four_letter_word_commands +{ + "ruok", "mntr", "srvr", "stat", "srst", "conf", + "cons", "crst", "envi", "dirs", "isro", "wchs", + "wchc", "wchp", "dump", "csnp", "lgif", "rqld", +}; + namespace ErrorCodes { extern const int BAD_ARGUMENTS; } +String KeeperClient::executeFourLetterCommand(const String & command) +{ + // We need create new socket every time because ZooKeeper forcefully shut down connection after four-letter-word command. + Poco::Net::StreamSocket socket; + socket.connect(Poco::Net::SocketAddress{zk_args.hosts[0]}, zk_args.connection_timeout_ms * 1000); + + socket.setReceiveTimeout(zk_args.operation_timeout_ms * 1000); + socket.setSendTimeout(zk_args.operation_timeout_ms * 1000); + socket.setNoDelay(true); + + ReadBufferFromPocoSocket in(socket); + WriteBufferFromPocoSocket out(socket); + + out.write(command.data(), command.size()); + out.next(); + + String result; + readStringUntilEOF(result, in); + in.next(); + return result; +} + +void KeeperClient::askConfirmation(const String & prompt, std::function && callback) +{ + std::cout << prompt << " Continue?\n"; + need_confirmation = true; + confirmation_callback = callback; +} + String KeeperClient::getAbsolutePath(const String & relative) { String result; @@ -124,7 +160,9 @@ void KeeperClient::initialize(Poco::Util::Application & /* self */) {"rmr", 1, [](KeeperClient * client, const std::vector & args) { - client->zookeeper->removeRecursive(client->getAbsolutePath(args[1])); + String path = client->getAbsolutePath(args[1]); + client->askConfirmation("You are going to recursively delete path " + path, + [client, path]{ client->zookeeper->removeRecursive(path); }); }}, }); @@ -164,11 +202,26 @@ bool KeeperClient::processQueryText(const String & text) try { - auto callback = commands.find({tokens[0], tokens.size() - 1}); - if (callback == commands.end()) - std::cerr << "No command found with name " << tokens[0] << " and args count " << tokens.size() - 1 << "\n"; + if (need_confirmation) + { + if (tokens.size() == 1 && (tokens[0] == "y" || tokens[0] == "Y")) + { + need_confirmation = false; + confirmation_callback(); + } + + need_confirmation = false; + } + else if (tokens.size() == 1 && tokens[0].size() == 4 && four_letter_word_commands.find(tokens[0]) != four_letter_word_commands.end()) + std::cout << executeFourLetterCommand(tokens[0]) << "\n"; else - callback->second(this, tokens); + { + auto callback = commands.find({tokens[0], tokens.size() - 1}); + if (callback == commands.end()) + std::cerr << "No command found with name " << tokens[0] << " and args count " << tokens.size() - 1 << "\n"; + else + callback->second(this, tokens); + } } catch (Coordination::Exception & err) { @@ -188,7 +241,13 @@ void KeeperClient::runInteractive() while (true) { - auto input = lr.readLine(cwd.string() + " :) ", ":-] "); + String prompt; + if (need_confirmation) + prompt = "[y/n] "; + else + prompt = cwd.string() + " :) "; + + auto input = lr.readLine(prompt, ":-] "); if (input.empty()) break; @@ -199,7 +258,7 @@ void KeeperClient::runInteractive() int KeeperClient::main(const std::vector & args) { - zkutil::ZooKeeperArgs zk_args(args[0]); + zk_args.hosts = {args[0]}; zk_args.connection_timeout_ms = config().getInt("connection-timeout", 10) * 1000; zk_args.session_timeout_ms = config().getInt("session-timeout", 10) * 1000; zk_args.operation_timeout_ms = config().getInt("operation-timeout", 10) * 1000; diff --git a/programs/keeper-client/KeeperClient.h b/programs/keeper-client/KeeperClient.h index 8d96ade7659..0634d3e4b37 100644 --- a/programs/keeper-client/KeeperClient.h +++ b/programs/keeper-client/KeeperClient.h @@ -3,6 +3,9 @@ #include #include +#include +#include +#include #include #include @@ -27,10 +30,13 @@ public: protected: void runInteractive(); - void loadCommands(std::vector> &&); + void loadCommands(std::vector> && callback); bool processQueryText(const String & text); + String executeFourLetterCommand(const String & command); + String getAbsolutePath(const String & relative); + void askConfirmation(const String & prompt, std::function && callback); std::map, Callback> commands; @@ -38,7 +44,12 @@ protected: LineReader::Suggest suggest; zkutil::ZooKeeperPtr zookeeper; + zkutil::ZooKeeperArgs zk_args; + std::filesystem::path cwd = "/"; + + bool need_confirmation = false; + std::function confirmation_callback; }; } From 72769d468ea55f419286caf417ed985af9e4069b Mon Sep 17 00:00:00 2001 From: pufit Date: Fri, 10 Mar 2023 23:15:15 -0500 Subject: [PATCH 0011/1072] comment fix --- programs/keeper-client/KeeperClient.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/programs/keeper-client/KeeperClient.cpp b/programs/keeper-client/KeeperClient.cpp index 752c44bd8f4..43a9527a3ca 100644 --- a/programs/keeper-client/KeeperClient.cpp +++ b/programs/keeper-client/KeeperClient.cpp @@ -27,7 +27,7 @@ namespace ErrorCodes String KeeperClient::executeFourLetterCommand(const String & command) { - // We need create new socket every time because ZooKeeper forcefully shut down connection after four-letter-word command. + /// We need to create a new socket every time because ZooKeeper forcefully shuts down the connection after a four-letter-word command. Poco::Net::StreamSocket socket; socket.connect(Poco::Net::SocketAddress{zk_args.hosts[0]}, zk_args.connection_timeout_ms * 1000); From 18fada7028c7997f25814201ce7c89c05efb9e3d Mon Sep 17 00:00:00 2001 From: pufit Date: Sun, 12 Mar 2023 12:54:42 -0400 Subject: [PATCH 0012/1072] fix duplication --- programs/keeper-client/KeeperClient.cpp | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/programs/keeper-client/KeeperClient.cpp b/programs/keeper-client/KeeperClient.cpp index 43a9527a3ca..74eae042b4c 100644 --- a/programs/keeper-client/KeeperClient.cpp +++ b/programs/keeper-client/KeeperClient.cpp @@ -204,13 +204,9 @@ bool KeeperClient::processQueryText(const String & text) { if (need_confirmation) { - if (tokens.size() == 1 && (tokens[0] == "y" || tokens[0] == "Y")) - { - need_confirmation = false; - confirmation_callback(); - } - need_confirmation = false; + if (tokens.size() == 1 && (tokens[0] == "y" || tokens[0] == "Y")) + confirmation_callback(); } else if (tokens.size() == 1 && tokens[0].size() == 4 && four_letter_word_commands.find(tokens[0]) != four_letter_word_commands.end()) std::cout << executeFourLetterCommand(tokens[0]) << "\n"; From 752eab501ce9da992f7f4a1dc3600521c7c65723 Mon Sep 17 00:00:00 2001 From: pufit Date: Tue, 14 Mar 2023 14:24:36 -0400 Subject: [PATCH 0013/1072] default host to connect --- programs/keeper-client/KeeperClient.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/programs/keeper-client/KeeperClient.cpp b/programs/keeper-client/KeeperClient.cpp index 74eae042b4c..1f68cbd05bf 100644 --- a/programs/keeper-client/KeeperClient.cpp +++ b/programs/keeper-client/KeeperClient.cpp @@ -254,7 +254,11 @@ void KeeperClient::runInteractive() int KeeperClient::main(const std::vector & args) { - zk_args.hosts = {args[0]}; + if (args.empty()) + zk_args.hosts = {"localhost:2181"}; + else + zk_args.hosts = {args[0]}; + zk_args.connection_timeout_ms = config().getInt("connection-timeout", 10) * 1000; zk_args.session_timeout_ms = config().getInt("session-timeout", 10) * 1000; zk_args.operation_timeout_ms = config().getInt("operation-timeout", 10) * 1000; From 7adc442fedba0617f322cdecdaee7b21904bf51a Mon Sep 17 00:00:00 2001 From: pufit Date: Tue, 14 Mar 2023 15:32:48 -0400 Subject: [PATCH 0014/1072] support run in non-interactive mode --- programs/keeper-client/KeeperClient.cpp | 35 ++++++++++++++++++++++--- programs/keeper-client/KeeperClient.h | 1 + 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/programs/keeper-client/KeeperClient.cpp b/programs/keeper-client/KeeperClient.cpp index 1f68cbd05bf..9ed60dd0d4b 100644 --- a/programs/keeper-client/KeeperClient.cpp +++ b/programs/keeper-client/KeeperClient.cpp @@ -70,11 +70,15 @@ String KeeperClient::getAbsolutePath(const String & relative) void KeeperClient::loadCommands(std::vector> && new_commands) { - for (auto & [name, args_count, callback] : new_commands) + for (const auto & [name, args_count, callback] : new_commands) { commands.insert({{name, args_count}, callback}); suggest.addWords({name}); } + + for (const auto & command : four_letter_word_commands) { + suggest.addWords({command}); + } } void KeeperClient::defineOptions(Poco::Util::OptionSet & options) @@ -85,6 +89,11 @@ void KeeperClient::defineOptions(Poco::Util::OptionSet & options) Poco::Util::Option("help", "h", "show help and exit") .binding("help")); + options.addOption( + Poco::Util::Option("query", "q", "will execute given query, then exit.") + .argument("query") + .binding("query")); + options.addOption( Poco::Util::Option("connection-timeout", "", "set connection timeout in seconds. default 10s.") .argument("connection-timeout") @@ -192,6 +201,18 @@ void KeeperClient::initialize(Poco::Util::Application & /* self */) EventNotifier::init(); } +void KeeperClient::executeQuery(const String & query) +{ + std::vector queries; + boost::algorithm::split(queries, query, boost::is_any_of(";")); + + for (const auto & query_text : queries) + { + if (!query_text.empty()) + processQueryText(query_text); + } +} + bool KeeperClient::processQueryText(const String & text) { if (exit_strings.find(text) != exit_strings.end()) @@ -214,7 +235,12 @@ bool KeeperClient::processQueryText(const String & text) { auto callback = commands.find({tokens[0], tokens.size() - 1}); if (callback == commands.end()) - std::cerr << "No command found with name " << tokens[0] << " and args count " << tokens.size() - 1 << "\n"; + { + if (tokens[0].size() == 4 && tokens.size() == 1) /// Treat it like unrecognized four-letter command + std::cout << executeFourLetterCommand(tokens[0]) << "\n"; + else + std::cerr << "No command found with name " << tokens[0] << " and args count " << tokens.size() - 1 << "\n"; + } else callback->second(this, tokens); } @@ -264,7 +290,10 @@ int KeeperClient::main(const std::vector & args) zk_args.operation_timeout_ms = config().getInt("operation-timeout", 10) * 1000; zookeeper = std::make_unique(zk_args); - runInteractive(); + if (config().has("query")) + executeQuery(config().getString("query")); + else + runInteractive(); return 0; } diff --git a/programs/keeper-client/KeeperClient.h b/programs/keeper-client/KeeperClient.h index 0634d3e4b37..10099b06021 100644 --- a/programs/keeper-client/KeeperClient.h +++ b/programs/keeper-client/KeeperClient.h @@ -32,6 +32,7 @@ protected: void runInteractive(); void loadCommands(std::vector> && callback); bool processQueryText(const String & text); + void executeQuery(const String & query); String executeFourLetterCommand(const String & command); From bfdc2b58b421718550b586fab6806e24a18cc960 Mon Sep 17 00:00:00 2001 From: pufit Date: Tue, 14 Mar 2023 15:34:08 -0400 Subject: [PATCH 0015/1072] style fix --- programs/keeper-client/KeeperClient.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/programs/keeper-client/KeeperClient.cpp b/programs/keeper-client/KeeperClient.cpp index 9ed60dd0d4b..c427c6fc8ef 100644 --- a/programs/keeper-client/KeeperClient.cpp +++ b/programs/keeper-client/KeeperClient.cpp @@ -203,7 +203,7 @@ void KeeperClient::initialize(Poco::Util::Application & /* self */) void KeeperClient::executeQuery(const String & query) { - std::vector queries; + std::vector queries; boost::algorithm::split(queries, query, boost::is_any_of(";")); for (const auto & query_text : queries) @@ -218,7 +218,7 @@ bool KeeperClient::processQueryText(const String & text) if (exit_strings.find(text) != exit_strings.end()) return false; - std::vector tokens; + std::vector tokens; boost::algorithm::split(tokens, text, boost::is_any_of(" ")); try From 7dc6ff02c33b3d84fdaf4ff2f16a74c8b2edbd3e Mon Sep 17 00:00:00 2001 From: pufit Date: Tue, 14 Mar 2023 17:50:09 -0400 Subject: [PATCH 0016/1072] use keeper-client in integration tests --- programs/keeper-client/KeeperClient.cpp | 7 +++++++ tests/integration/helpers/keeper_utils.py | 22 ++++------------------ 2 files changed, 11 insertions(+), 18 deletions(-) diff --git a/programs/keeper-client/KeeperClient.cpp b/programs/keeper-client/KeeperClient.cpp index c427c6fc8ef..10aa4b1dedd 100644 --- a/programs/keeper-client/KeeperClient.cpp +++ b/programs/keeper-client/KeeperClient.cpp @@ -113,6 +113,11 @@ void KeeperClient::defineOptions(Poco::Util::OptionSet & options) Poco::Util::Option("history-file", "", "set path of history file. default `~/.keeper-client-history`") .argument("history-file") .binding("history-file")); + + options.addOption( + Poco::Util::Option("log-level", "", "set log level") + .argument("log-level") + .binding("log-level")); } void KeeperClient::initialize(Poco::Util::Application & /* self */) @@ -198,6 +203,8 @@ void KeeperClient::initialize(Poco::Util::Application & /* self */) } } + Poco::Logger::root().setLevel(config().getString("log-level", "error")); + EventNotifier::init(); } diff --git a/tests/integration/helpers/keeper_utils.py b/tests/integration/helpers/keeper_utils.py index 3b909194b63..c6cd9dfa18a 100644 --- a/tests/integration/helpers/keeper_utils.py +++ b/tests/integration/helpers/keeper_utils.py @@ -1,26 +1,12 @@ import socket import time - - -def get_keeper_socket(cluster, node, port=9181): - hosts = cluster.get_instance_ip(node.name) - client = socket.socket() - client.settimeout(10) - client.connect((hosts, port)) - return client +from helper.client import CommandRequest def send_4lw_cmd(cluster, node, cmd="ruok", port=9181): - client = None - try: - client = get_keeper_socket(cluster, node, port) - client.send(cmd.encode()) - data = client.recv(100_000) - data = data.decode() - return data - finally: - if client is not None: - client.close() + return CommandRequest( + ["cluster.server_bin_path", "keeper-client", f"{cluster.get_instance_ip(node.name)}:{port}", "-q", cmd] + ).get_answer() NOT_SERVING_REQUESTS_ERROR_MSG = "This instance is not currently serving requests" From e6d01c617b16117d314dbeea43660ece5c4b54d2 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Tue, 14 Mar 2023 22:14:43 +0000 Subject: [PATCH 0017/1072] Automatic style fix --- tests/integration/helpers/keeper_utils.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/integration/helpers/keeper_utils.py b/tests/integration/helpers/keeper_utils.py index c6cd9dfa18a..f9cc2fb29fb 100644 --- a/tests/integration/helpers/keeper_utils.py +++ b/tests/integration/helpers/keeper_utils.py @@ -5,7 +5,13 @@ from helper.client import CommandRequest def send_4lw_cmd(cluster, node, cmd="ruok", port=9181): return CommandRequest( - ["cluster.server_bin_path", "keeper-client", f"{cluster.get_instance_ip(node.name)}:{port}", "-q", cmd] + [ + "cluster.server_bin_path", + "keeper-client", + f"{cluster.get_instance_ip(node.name)}:{port}", + "-q", + cmd, + ] ).get_answer() From ef9f66e36fed106f59bafaf971a02799fbecb75f Mon Sep 17 00:00:00 2001 From: pufit Date: Tue, 14 Mar 2023 18:30:23 -0400 Subject: [PATCH 0018/1072] style fix --- programs/keeper-client/KeeperClient.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/programs/keeper-client/KeeperClient.cpp b/programs/keeper-client/KeeperClient.cpp index 10aa4b1dedd..54b5cf1d6cf 100644 --- a/programs/keeper-client/KeeperClient.cpp +++ b/programs/keeper-client/KeeperClient.cpp @@ -76,9 +76,8 @@ void KeeperClient::loadCommands(std::vector suggest.addWords({name}); } - for (const auto & command : four_letter_word_commands) { + for (const auto & command : four_letter_word_commands) suggest.addWords({command}); - } } void KeeperClient::defineOptions(Poco::Util::OptionSet & options) From 65f2516079f3a2b53af8224078119fc4062b6ef7 Mon Sep 17 00:00:00 2001 From: pufit Date: Thu, 16 Mar 2023 15:37:06 -0400 Subject: [PATCH 0019/1072] separate integration tests for keeper-client --- tests/integration/helpers/keeper_utils.py | 28 +++++---- .../test_keeper_client/__init__.py | 0 .../configs/keeper_config.xml | 3 + tests/integration/test_keeper_client/test.py | 57 +++++++++++++++++++ 4 files changed, 78 insertions(+), 10 deletions(-) create mode 100644 tests/integration/test_keeper_client/__init__.py create mode 100644 tests/integration/test_keeper_client/configs/keeper_config.xml create mode 100644 tests/integration/test_keeper_client/test.py diff --git a/tests/integration/helpers/keeper_utils.py b/tests/integration/helpers/keeper_utils.py index f9cc2fb29fb..3b909194b63 100644 --- a/tests/integration/helpers/keeper_utils.py +++ b/tests/integration/helpers/keeper_utils.py @@ -1,18 +1,26 @@ import socket import time -from helper.client import CommandRequest + + +def get_keeper_socket(cluster, node, port=9181): + hosts = cluster.get_instance_ip(node.name) + client = socket.socket() + client.settimeout(10) + client.connect((hosts, port)) + return client def send_4lw_cmd(cluster, node, cmd="ruok", port=9181): - return CommandRequest( - [ - "cluster.server_bin_path", - "keeper-client", - f"{cluster.get_instance_ip(node.name)}:{port}", - "-q", - cmd, - ] - ).get_answer() + client = None + try: + client = get_keeper_socket(cluster, node, port) + client.send(cmd.encode()) + data = client.recv(100_000) + data = data.decode() + return data + finally: + if client is not None: + client.close() NOT_SERVING_REQUESTS_ERROR_MSG = "This instance is not currently serving requests" diff --git a/tests/integration/test_keeper_client/__init__.py b/tests/integration/test_keeper_client/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_keeper_client/configs/keeper_config.xml b/tests/integration/test_keeper_client/configs/keeper_config.xml new file mode 100644 index 00000000000..7e912283ac0 --- /dev/null +++ b/tests/integration/test_keeper_client/configs/keeper_config.xml @@ -0,0 +1,3 @@ + + + diff --git a/tests/integration/test_keeper_client/test.py b/tests/integration/test_keeper_client/test.py new file mode 100644 index 00000000000..64ef62b6243 --- /dev/null +++ b/tests/integration/test_keeper_client/test.py @@ -0,0 +1,57 @@ +import pytest +from helpers.client import CommandRequest +from helpers.cluster import ClickHouseCluster + + +cluster = ClickHouseCluster(__file__) + +node = cluster.add_instance( + "node", + main_configs=["configs/keeper_config.xml"], + with_zookeeper=True, + stay_alive=True, +) + + +@pytest.fixture(scope="module") +def started_cluster(): + try: + cluster.start() + yield cluster + + finally: + cluster.shutdown() + + +def test_base_commands(started_cluster): + _ = started_cluster + + command = CommandRequest( + [ + started_cluster.server_bin_path, + "keeper-client", + f"{cluster.get_instance_ip('zoo1')}:{cluster.zookeeper_port}", + "-q", + "create test_create_zk_node1 testvalue1;create test_create_zk_node_2 testvalue2;get test_create_zk_node1;", + ], + stdin="", + ) + + assert command.get_answer() == "testvalue1\n" + + +def test_four_letter_word_commands(started_cluster): + _ = started_cluster + + command = CommandRequest( + [ + started_cluster.server_bin_path, + "keeper-client", + f"{cluster.get_instance_ip('zoo1')}:{cluster.zookeeper_port}", + "-q", + "ruok", + ], + stdin="", + ) + + assert command.get_answer() == "imok\n" From 3380e467d91ca8797135b860342944b46505efd3 Mon Sep 17 00:00:00 2001 From: pufit Date: Mon, 27 Mar 2023 11:12:46 -0400 Subject: [PATCH 0020/1072] fix typo --- programs/keeper-client/KeeperClient.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/programs/keeper-client/KeeperClient.h b/programs/keeper-client/KeeperClient.h index 10099b06021..50a8b35f6c5 100644 --- a/programs/keeper-client/KeeperClient.h +++ b/programs/keeper-client/KeeperClient.h @@ -30,7 +30,7 @@ public: protected: void runInteractive(); - void loadCommands(std::vector> && callback); + void loadCommands(std::vector> && new_commands); bool processQueryText(const String & text); void executeQuery(const String & query); From 892e436046f3f7bb135c5df8b18a1951833dd29f Mon Sep 17 00:00:00 2001 From: pufit Date: Sun, 2 Apr 2023 16:51:10 -0400 Subject: [PATCH 0021/1072] Move host and port to options --- programs/keeper-client/KeeperClient.cpp | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/programs/keeper-client/KeeperClient.cpp b/programs/keeper-client/KeeperClient.cpp index 54b5cf1d6cf..52a31a388cc 100644 --- a/programs/keeper-client/KeeperClient.cpp +++ b/programs/keeper-client/KeeperClient.cpp @@ -85,9 +85,19 @@ void KeeperClient::defineOptions(Poco::Util::OptionSet & options) Poco::Util::Application::defineOptions(options); options.addOption( - Poco::Util::Option("help", "h", "show help and exit") + Poco::Util::Option("help", "", "show help and exit") .binding("help")); + options.addOption( + Poco::Util::Option("host", "h", "server hostname. default `localhost`") + .argument("host") + .binding("host")); + + options.addOption( + Poco::Util::Option("port", "p", "server port. default `2181`") + .argument("port") + .binding("port")); + options.addOption( Poco::Util::Option("query", "q", "will execute given query, then exit.") .argument("query") @@ -284,13 +294,12 @@ void KeeperClient::runInteractive() } } -int KeeperClient::main(const std::vector & args) +int KeeperClient::main(const std::vector & /* args */) { - if (args.empty()) - zk_args.hosts = {"localhost:2181"}; - else - zk_args.hosts = {args[0]}; + auto host = config().getString("host", "localhost"); + auto port = config().getString("port", "2181"); + zk_args.hosts = {host + ":" + port}; zk_args.connection_timeout_ms = config().getInt("connection-timeout", 10) * 1000; zk_args.session_timeout_ms = config().getInt("session-timeout", 10) * 1000; zk_args.operation_timeout_ms = config().getInt("operation-timeout", 10) * 1000; From 267bbcab007d02748af2b2b18c63de73c4fa327b Mon Sep 17 00:00:00 2001 From: Aleksei Golub Date: Fri, 14 Apr 2023 00:09:57 +0300 Subject: [PATCH 0022/1072] Added ability to implicitly use file table function in clickhouse-local --- programs/local/LocalServer.cpp | 3 +- src/Databases/DatabaseFactory.cpp | 21 +- src/Databases/DatabaseFileSystem.cpp | 132 +++++++++ src/Databases/DatabaseFileSystem.h | 51 ++++ src/Databases/DatabasesOverlay.cpp | 267 ++++++++++++++++++ src/Databases/DatabasesOverlay.h | 68 +++++ ...cal_implicit_file_table_function.reference | 9 + ...ouse_local_implicit_file_table_function.sh | 43 +++ 8 files changed, 591 insertions(+), 3 deletions(-) create mode 100644 src/Databases/DatabaseFileSystem.cpp create mode 100644 src/Databases/DatabaseFileSystem.h create mode 100644 src/Databases/DatabasesOverlay.cpp create mode 100644 src/Databases/DatabasesOverlay.h create mode 100644 tests/queries/0_stateless/02707_clickhouse_local_implicit_file_table_function.reference create mode 100755 tests/queries/0_stateless/02707_clickhouse_local_implicit_file_table_function.sh diff --git a/programs/local/LocalServer.cpp b/programs/local/LocalServer.cpp index 5768e744f94..566d11791ca 100644 --- a/programs/local/LocalServer.cpp +++ b/programs/local/LocalServer.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -647,7 +648,7 @@ void LocalServer::processConfig() * if such tables will not be dropped, clickhouse-server will not be able to load them due to security reasons. */ std::string default_database = config().getString("default_database", "_local"); - DatabaseCatalog::instance().attachDatabase(default_database, std::make_shared(default_database, global_context)); + DatabaseCatalog::instance().attachDatabase(default_database, CreateClickHouseLocalDatabaseOverlay(default_database, global_context)); global_context->setCurrentDatabase(default_database); applyCmdOptions(global_context); diff --git a/src/Databases/DatabaseFactory.cpp b/src/Databases/DatabaseFactory.cpp index 89a799349bf..b023bb06ad1 100644 --- a/src/Databases/DatabaseFactory.cpp +++ b/src/Databases/DatabaseFactory.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -132,13 +133,13 @@ DatabasePtr DatabaseFactory::getImpl(const ASTCreateQuery & create, const String static const std::unordered_set database_engines{"Ordinary", "Atomic", "Memory", "Dictionary", "Lazy", "Replicated", "MySQL", "MaterializeMySQL", "MaterializedMySQL", - "PostgreSQL", "MaterializedPostgreSQL", "SQLite"}; + "PostgreSQL", "MaterializedPostgreSQL", "SQLite", "FileSystem"}; if (!database_engines.contains(engine_name)) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Database engine name `{}` does not exist", engine_name); static const std::unordered_set engines_with_arguments{"MySQL", "MaterializeMySQL", "MaterializedMySQL", - "Lazy", "Replicated", "PostgreSQL", "MaterializedPostgreSQL", "SQLite"}; + "Lazy", "Replicated", "PostgreSQL", "MaterializedPostgreSQL", "SQLite", "FileSystem"}; static const std::unordered_set engines_with_table_overrides{"MaterializeMySQL", "MaterializedMySQL", "MaterializedPostgreSQL"}; bool engine_may_have_arguments = engines_with_arguments.contains(engine_name); @@ -432,6 +433,22 @@ DatabasePtr DatabaseFactory::getImpl(const ASTCreateQuery & create, const String return std::make_shared(context, engine_define, create.attach, database_path); } #endif + else if (engine_name == "FileSystem") { + const ASTFunction * engine = engine_define->engine; + + // If init_path is empty, then the current path from Poco will be used + std::string init_path; + + if (engine->arguments && engine->arguments->children.size() > 0) { + if (engine->arguments->children.size() != 1) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "FileSystem database requires at most 1 argument: file_system_path"); + + const auto & arguments = engine->arguments->children; + init_path = safeGetLiteralValue(arguments[0], engine_name); + } + + return std::make_shared(database_name, init_path, context); + } throw Exception(ErrorCodes::UNKNOWN_DATABASE_ENGINE, "Unknown database engine: {}", engine_name); } diff --git a/src/Databases/DatabaseFileSystem.cpp b/src/Databases/DatabaseFileSystem.cpp new file mode 100644 index 00000000000..9e2273970c3 --- /dev/null +++ b/src/Databases/DatabaseFileSystem.cpp @@ -0,0 +1,132 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +DatabaseFileSystem::DatabaseFileSystem(const String & name_, const String & path_, ContextPtr context_) + : IDatabase(name_), WithContext(context_->getGlobalContext()), path(path_), log(&Poco::Logger::get("DatabaseFileSystem(" + name_ + ")")) +{ + if (path.empty()) + path = Poco::Path::current(); +} + +std::string DatabaseFileSystem::getTablePath(const std::string& table_name) const +{ + return Poco::Path(path, table_name).toString(); +} + +void DatabaseFileSystem::addTable(const std::string& table_name, StoragePtr table_storage) const +{ + std::lock_guard lock(mutex); + loaded_tables.emplace(table_name, table_storage); +} + +bool DatabaseFileSystem::isTableExist(const String & name, ContextPtr) const +{ + { + std::lock_guard lock(mutex); + if (loaded_tables.find(name) != loaded_tables.end()) + return true; + } + + Poco::File table_file(getTablePath(name)); + return table_file.exists() && table_file.isFile(); +} + +StoragePtr DatabaseFileSystem::tryGetTable(const String & name, ContextPtr context_) const +{ + // Check if the table exists in the loaded tables map + { + std::lock_guard lock(mutex); + auto it = loaded_tables.find(name); + if (it != loaded_tables.end()) + return it->second; + } + + auto table_path = getTablePath(name); + + // If the table doesn't exist in the tables map, check if the corresponding file exists + Poco::File table_file(table_path); + if (!table_file.exists()) + return nullptr; + + // If the file exists, create a new table using TableFunctionFile and return it. + auto args = makeASTFunction("file", std::make_shared(table_path)); + + auto table_function = TableFunctionFactory::instance().get(args, context_); + if (!table_function) + return nullptr; + + auto table_storage = table_function->execute(args, context_, name); + if (table_storage) + addTable(name, table_storage); + + return table_storage; +} + +ASTPtr DatabaseFileSystem::getCreateDatabaseQuery() const +{ + auto settings = getContext()->getSettingsRef(); + ParserCreateQuery parser; + + String query = "CREATE DATABASE " + backQuoteIfNeed(getDatabaseName()) + " ENGINE = FileSystem(" + backQuoteIfNeed(path) + ")"; + ASTPtr ast = parseQuery(parser, query.data(), query.data() + query.size(), "", 0, settings.max_parser_depth); + + if (const auto database_comment = getDatabaseComment(); !database_comment.empty()) + { + auto & ast_create_query = ast->as(); + ast_create_query.set(ast_create_query.comment, std::make_shared(database_comment)); + } + + return ast; +} + +void DatabaseFileSystem::shutdown() +{ + Tables tables_snapshot; + { + std::lock_guard lock(mutex); + tables_snapshot = loaded_tables; + } + + for (const auto & kv : tables_snapshot) + { + auto table_id = kv.second->getStorageID(); + kv.second->flushAndShutdown(); + } + + std::lock_guard lock(mutex); + loaded_tables.clear(); +} + +/** + * Returns an empty vector because the database is read-only and no tables can be backed up. + */ +std::vector> DatabaseFileSystem::getTablesForBackup(const FilterByNameFunction&, const ContextPtr&) const { + return {}; +} + +/** + * + * Returns an empty iterator because the database does not have its own tables + * But only caches them for quick access. + */ +DatabaseTablesIteratorPtr DatabaseFileSystem::getTablesIterator(ContextPtr, const FilterByNameFunction&) const { + return std::make_unique(Tables{}, getDatabaseName()); +} + +} // DB diff --git a/src/Databases/DatabaseFileSystem.h b/src/Databases/DatabaseFileSystem.h new file mode 100644 index 00000000000..474a7e78335 --- /dev/null +++ b/src/Databases/DatabaseFileSystem.h @@ -0,0 +1,51 @@ +#pragma once + +#include +#include +#include +#include +#include + +namespace DB +{ + +class Context; + +/** + * DatabaseFileSystem allows to interact with files stored on the file system + * Uses TableFunctionFile to implicitly load file when a user requests the table, and provides read-only access to the data in the file + * Tables are cached inside the database for quick access + * + * Used in clickhouse-local to access local files + */ +class DatabaseFileSystem : public IDatabase, protected WithContext +{ +public: + DatabaseFileSystem(const String & name, const String & path, ContextPtr context); + + String getEngineName() const override { return "FileSystem"; } + + bool isTableExist(const String & name, ContextPtr context) const override; + + StoragePtr tryGetTable(const String & name, ContextPtr context) const override; + + bool empty() const override { return true; } + + ASTPtr getCreateDatabaseQuery() const override; + + void shutdown() override; + + std::vector> getTablesForBackup(const FilterByNameFunction &, const ContextPtr &) const override; + DatabaseTablesIteratorPtr getTablesIterator(ContextPtr, const FilterByNameFunction &) const override; + +protected: + std::string getTablePath(const std::string & table_name) const; + void addTable(const std::string & table_name, StoragePtr table_storage) const; + +private: + String path; + mutable Tables loaded_tables TSA_GUARDED_BY(mutex); + Poco::Logger * log; +}; + +} // DB diff --git a/src/Databases/DatabasesOverlay.cpp b/src/Databases/DatabasesOverlay.cpp new file mode 100644 index 00000000000..9c3d802df73 --- /dev/null +++ b/src/Databases/DatabasesOverlay.cpp @@ -0,0 +1,267 @@ +#include + +#include +#include +#include + +#include +#include + +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NOT_IMPLEMENTED; + extern const int LOGICAL_ERROR; + extern const int CANNOT_GET_CREATE_TABLE_QUERY; +} + +DatabasesOverlay::DatabasesOverlay(const String & name_, ContextPtr context_) + : IDatabase(name_), WithContext(context_->getGlobalContext()), log(&Poco::Logger::get("DatabaseOverlay(" + name_ + ")")) +{ +} + +DatabasesOverlay & DatabasesOverlay::registerNextDatabase(DatabasePtr database) +{ + databases.push_back(std::move(database)); + return *this; +} + +bool DatabasesOverlay::isTableExist(const String & table_name, ContextPtr context_) const +{ + for (const auto & db : databases) + { + if (db->isTableExist(table_name, context_)) + return true; + } + return false; +} + +StoragePtr DatabasesOverlay::tryGetTable(const String & table_name, ContextPtr context_) const +{ + StoragePtr result = nullptr; + for (const auto & db : databases) + { + result = db->tryGetTable(table_name, context_); + if (result) + break; + } + return result; +} + +void DatabasesOverlay::createTable(ContextPtr context_, const String & table_name, const StoragePtr & table, const ASTPtr & query) +{ + for (auto & db : databases) + { + try + { + db->createTable(context_, table_name, table, query); + return; + } + catch (...) + { + continue; + } + } + throw Exception(ErrorCodes::LOGICAL_ERROR, "There is no databases for CREATE TABLE {} query in Database{}", table_name, getEngineName()); +} + +void DatabasesOverlay::dropTable(ContextPtr context_, const String & table_name, bool sync) +{ + for (auto & db : databases) + { + try + { + db->dropTable(context_, table_name, sync); + return; + } + catch (...) + { + continue; + } + } + throw Exception(ErrorCodes::LOGICAL_ERROR, "There is no databases for DROP TABLE {} query in Database{}", table_name, getEngineName()); +} + +void DatabasesOverlay::attachTable( + ContextPtr context_, const String & table_name, const StoragePtr & table, const String & relative_table_path) +{ + for (auto & db : databases) + { + try + { + db->attachTable(context_, table_name, table, relative_table_path); + return; + } + catch (...) + { + continue; + } + } + throw Exception(ErrorCodes::LOGICAL_ERROR, "There is no databases for ATTACH TABLE query in Database{}", getEngineName()); +} + +StoragePtr DatabasesOverlay::detachTable(ContextPtr context_, const String & table_name) +{ + StoragePtr result = nullptr; + for (auto & db : databases) + { + try + { + result = db->detachTable(context_, table_name); + if (result) + return result; + } + catch (...) + { + continue; + } + } + throw Exception(ErrorCodes::LOGICAL_ERROR, "There is no databases for DETACH TABLE {} query in Database{}", table_name, getEngineName()); +} + +ASTPtr DatabasesOverlay::getCreateTableQueryImpl(const String & name, ContextPtr context_, bool throw_on_error) const +{ + ASTPtr result = nullptr; + for (const auto & db : databases) + { + result = db->tryGetCreateTableQuery(name, context_); + if (result) + break; + } + if (!result && throw_on_error) + throw Exception(ErrorCodes::CANNOT_GET_CREATE_TABLE_QUERY, "There is no metadata of table {} in Database{}", name, getEngineName()); + return result; +} + +/* + * DatabaseOverlay cannot be constructed by "CREATE DATABASE" query, as it is not a traditional ClickHouse database + * To use DatabaseOverlay, it must be constructed programmatically in code + */ +ASTPtr DatabasesOverlay::getCreateDatabaseQuery() const +{ + return std::make_shared(); +} + +String DatabasesOverlay::getTableDataPath(const String & table_name) const +{ + String result; + for (const auto & db : databases) + { + result = db->getTableDataPath(table_name); + if (!result.empty()) + break; + } + return result; +} + +String DatabasesOverlay::getTableDataPath(const ASTCreateQuery & query) const +{ + String result; + for (const auto & db : databases) + { + result = db->getTableDataPath(query); + if (!result.empty()) + break; + } + return result; +} + +UUID DatabasesOverlay::tryGetTableUUID(const String & table_name) const +{ + UUID result = UUIDHelpers::Nil; + for (const auto & db : databases) + { + result = db->tryGetTableUUID(table_name); + if (result != UUIDHelpers::Nil) + break; + } + return result; +} + +void DatabasesOverlay::drop(ContextPtr context_) +{ + for (auto & db : databases) + db->drop(context_); +} + +void DatabasesOverlay::alterTable(ContextPtr local_context, const StorageID & table_id, const StorageInMemoryMetadata & metadata) +{ + for (auto & db : databases) + { + try + { + db->alterTable(local_context, table_id, metadata); + return; + } + catch (...) + { + continue; + } + } + throw Exception(ErrorCodes::LOGICAL_ERROR, "There is no databases for alterTable in Database{}", getEngineName()); +} + +std::vector> +DatabasesOverlay::getTablesForBackup(const FilterByNameFunction & filter, const ContextPtr & local_context) const +{ + std::vector> result; + for (const auto & db : databases) + { + auto dbBackup = db->getTablesForBackup(filter, local_context); + result.insert(result.end(), std::make_move_iterator(dbBackup.begin()), std::make_move_iterator(dbBackup.end())); + } + return result; +} + +void DatabasesOverlay::createTableRestoredFromBackup( + const ASTPtr & create_table_query, + ContextMutablePtr local_context, + std::shared_ptr /*restore_coordination*/, + UInt64 /*timeout_ms*/) +{ + /// Creates a tables by executing a "CREATE TABLE" query. + InterpreterCreateQuery interpreter{create_table_query, local_context}; + interpreter.setInternal(true); + interpreter.execute(); +} + +bool DatabasesOverlay::empty() const +{ + for (const auto & db : databases) + { + if (!db->empty()) + return false; + } + return true; +} + +void DatabasesOverlay::shutdown() +{ + for (auto & db : databases) + db->shutdown(); +} + +DatabaseTablesIteratorPtr DatabasesOverlay::getTablesIterator(ContextPtr context_, const FilterByNameFunction & filter_by_table_name) const +{ + Tables tables; + for (const auto & db : databases) + { + for (auto table_it = db->getTablesIterator(context_, filter_by_table_name); table_it->isValid(); table_it->next()) + tables.insert({table_it->name(), table_it->table()}); + } + return std::make_unique(std::move(tables), getDatabaseName()); +} + +DatabasePtr CreateClickHouseLocalDatabaseOverlay(const String & name_, ContextPtr context_) +{ + auto databaseCombiner = std::make_shared(name_, context_); + databaseCombiner->registerNextDatabase(std::make_shared(name_, "", context_)); + databaseCombiner->registerNextDatabase(std::make_shared(name_, context_)); + return databaseCombiner; +} + +} diff --git a/src/Databases/DatabasesOverlay.h b/src/Databases/DatabasesOverlay.h new file mode 100644 index 00000000000..77f0085161b --- /dev/null +++ b/src/Databases/DatabasesOverlay.h @@ -0,0 +1,68 @@ +#pragma once + +#include +#include + +namespace DB +{ + +/** + * Implements the IDatabase interface and combines multiple other databases + * Searches for tables in each database in order until found, and delegates operations to the appropriate database + * Useful for combining databases + * + * Used in clickhouse-local to combine DatabaseFileSystem and DatabaseMemory + */ +class DatabasesOverlay : public IDatabase, protected WithContext +{ +public: + DatabasesOverlay(const String & name_, ContextPtr context_); + + /// Not thread-safe. Use only as factory to initialize database + DatabasesOverlay & registerNextDatabase(DatabasePtr database); + + String getEngineName() const override { return "Overlay"; } + +public: + bool isTableExist(const String & table_name, ContextPtr context) const override; + + StoragePtr tryGetTable(const String & table_name, ContextPtr context) const override; + + void createTable(ContextPtr context, const String & table_name, const StoragePtr & table, const ASTPtr & query) override; + + void dropTable(ContextPtr context, const String & table_name, bool sync) override; + + void attachTable(ContextPtr context, const String & table_name, const StoragePtr & table, const String & relative_table_path) override; + + StoragePtr detachTable(ContextPtr context, const String & table_name) override; + + ASTPtr getCreateTableQueryImpl(const String & name, ContextPtr context, bool throw_on_error) const override; + ASTPtr getCreateDatabaseQuery() const override; + + String getTableDataPath(const String & table_name) const override; + String getTableDataPath(const ASTCreateQuery & query) const override; + + UUID tryGetTableUUID(const String & table_name) const override; + + void drop(ContextPtr context) override; + + void alterTable(ContextPtr local_context, const StorageID & table_id, const StorageInMemoryMetadata & metadata) override; + + std::vector> getTablesForBackup(const FilterByNameFunction & filter, const ContextPtr & local_context) const override; + + void createTableRestoredFromBackup(const ASTPtr & create_table_query, ContextMutablePtr local_context, std::shared_ptr restore_coordination, UInt64 timeout_ms) override; + + DatabaseTablesIteratorPtr getTablesIterator(ContextPtr context, const FilterByNameFunction & filter_by_table_name) const override; + + bool empty() const override; + + void shutdown() override; + +protected: + std::vector databases; + Poco::Logger * log; +}; + +DatabasePtr CreateClickHouseLocalDatabaseOverlay(const String & name_, ContextPtr context_); + +} diff --git a/tests/queries/0_stateless/02707_clickhouse_local_implicit_file_table_function.reference b/tests/queries/0_stateless/02707_clickhouse_local_implicit_file_table_function.reference new file mode 100644 index 00000000000..0fcd843e737 --- /dev/null +++ b/tests/queries/0_stateless/02707_clickhouse_local_implicit_file_table_function.reference @@ -0,0 +1,9 @@ +Test 1: check explicit and implicit call of the file table function +explicit: +4 +implicit: +4 +Test 2: check FileSystem database +4 +Test 3: check show database with FileSystem +test02707 diff --git a/tests/queries/0_stateless/02707_clickhouse_local_implicit_file_table_function.sh b/tests/queries/0_stateless/02707_clickhouse_local_implicit_file_table_function.sh new file mode 100755 index 00000000000..4d8d7b1395a --- /dev/null +++ b/tests/queries/0_stateless/02707_clickhouse_local_implicit_file_table_function.sh @@ -0,0 +1,43 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +dir=02707_clickhouse_local_tmp +[[ -d $dir ]] && rm -r $dir +mkdir $dir +# Create temporary csv file for tests +echo '"id","str","int","text"' > $dir/tmp.csv +echo '1,"abc",123,"abacaba"' >> $dir/tmp.csv +echo '2,"def",456,"bacabaa"' >> $dir/tmp.csv +echo '3,"story",78912,"acabaab"' >> $dir/tmp.csv +echo '4,"history",21321321,"cabaaba"' >> $dir/tmp.csv + +################# +echo "Test 1: check explicit and implicit call of the file table function" + +echo "explicit:" +$CLICKHOUSE_LOCAL -q 'SELECT COUNT(*) FROM file("02707_clickhouse_local_tmp/tmp.csv")' +echo "implicit:" +$CLICKHOUSE_LOCAL -q 'SELECT COUNT(*) FROM "02707_clickhouse_local_tmp/tmp.csv"' + +################# +echo "Test 2: check FileSystem database" +$CLICKHOUSE_LOCAL --multiline --multiquery -q """ +DROP DATABASE IF EXISTS test; +CREATE DATABASE test ENGINE = FileSystem('02707_clickhouse_local_tmp'); +SELECT COUNT(*) FROM test.\`tmp.csv\`; +DROP DATABASE test; +""" + +################# +echo "Test 3: check show database with FileSystem" +$CLICKHOUSE_LOCAL --multiline --multiquery -q """ +DROP DATABASE IF EXISTS test02707; +CREATE DATABASE test02707 ENGINE = FileSystem('02707_clickhouse_local_tmp'); +SHOW DATABASES; +DROP DATABASE test02707; +""" | grep "test02707" + +rm -r $dir \ No newline at end of file From 96553bc3d8e70d06e03191f4b848ed07c91e5c6a Mon Sep 17 00:00:00 2001 From: Aleksei Golub Date: Sun, 16 Apr 2023 23:25:57 +0300 Subject: [PATCH 0023/1072] Fix style and tests --- src/Databases/DatabaseFactory.cpp | 6 ++- src/Databases/DatabaseFileSystem.cpp | 45 +++++++++++-------- src/Databases/DatabasesOverlay.cpp | 1 - ...ouse_local_implicit_file_table_function.sh | 14 +++--- 4 files changed, 38 insertions(+), 28 deletions(-) diff --git a/src/Databases/DatabaseFactory.cpp b/src/Databases/DatabaseFactory.cpp index b023bb06ad1..9c13881fc7b 100644 --- a/src/Databases/DatabaseFactory.cpp +++ b/src/Databases/DatabaseFactory.cpp @@ -433,13 +433,15 @@ DatabasePtr DatabaseFactory::getImpl(const ASTCreateQuery & create, const String return std::make_shared(context, engine_define, create.attach, database_path); } #endif - else if (engine_name == "FileSystem") { + else if (engine_name == "FileSystem") + { const ASTFunction * engine = engine_define->engine; // If init_path is empty, then the current path from Poco will be used std::string init_path; - if (engine->arguments && engine->arguments->children.size() > 0) { + if (engine->arguments && !engine->arguments->children.empty()) + { if (engine->arguments->children.size() != 1) throw Exception(ErrorCodes::BAD_ARGUMENTS, "FileSystem database requires at most 1 argument: file_system_path"); diff --git a/src/Databases/DatabaseFileSystem.cpp b/src/Databases/DatabaseFileSystem.cpp index 9e2273970c3..8b92ad8080a 100644 --- a/src/Databases/DatabaseFileSystem.cpp +++ b/src/Databases/DatabaseFileSystem.cpp @@ -59,23 +59,30 @@ StoragePtr DatabaseFileSystem::tryGetTable(const String & name, ContextPtr conte auto table_path = getTablePath(name); - // If the table doesn't exist in the tables map, check if the corresponding file exists - Poco::File table_file(table_path); - if (!table_file.exists()) + try + { + // If the table doesn't exist in the tables map, check if the corresponding file exists + Poco::File table_file(table_path); + if (!table_file.exists()) + return nullptr; + + // If the file exists, create a new table using TableFunctionFile and return it. + auto args = makeASTFunction("file", std::make_shared(table_path)); + + auto table_function = TableFunctionFactory::instance().get(args, context_); + if (!table_function) + return nullptr; + + auto table_storage = table_function->execute(args, context_, name); + if (table_storage) + addTable(name, table_storage); + + return table_storage; + } + catch (...) + { return nullptr; - - // If the file exists, create a new table using TableFunctionFile and return it. - auto args = makeASTFunction("file", std::make_shared(table_path)); - - auto table_function = TableFunctionFactory::instance().get(args, context_); - if (!table_function) - return nullptr; - - auto table_storage = table_function->execute(args, context_, name); - if (table_storage) - addTable(name, table_storage); - - return table_storage; + } } ASTPtr DatabaseFileSystem::getCreateDatabaseQuery() const @@ -116,7 +123,8 @@ void DatabaseFileSystem::shutdown() /** * Returns an empty vector because the database is read-only and no tables can be backed up. */ -std::vector> DatabaseFileSystem::getTablesForBackup(const FilterByNameFunction&, const ContextPtr&) const { +std::vector> DatabaseFileSystem::getTablesForBackup(const FilterByNameFunction&, const ContextPtr&) const +{ return {}; } @@ -125,7 +133,8 @@ std::vector> DatabaseFileSystem::getTablesForBacku * Returns an empty iterator because the database does not have its own tables * But only caches them for quick access. */ -DatabaseTablesIteratorPtr DatabaseFileSystem::getTablesIterator(ContextPtr, const FilterByNameFunction&) const { +DatabaseTablesIteratorPtr DatabaseFileSystem::getTablesIterator(ContextPtr, const FilterByNameFunction&) const +{ return std::make_unique(Tables{}, getDatabaseName()); } diff --git a/src/Databases/DatabasesOverlay.cpp b/src/Databases/DatabasesOverlay.cpp index 9c3d802df73..da26f9282a0 100644 --- a/src/Databases/DatabasesOverlay.cpp +++ b/src/Databases/DatabasesOverlay.cpp @@ -14,7 +14,6 @@ namespace DB namespace ErrorCodes { - extern const int NOT_IMPLEMENTED; extern const int LOGICAL_ERROR; extern const int CANNOT_GET_CREATE_TABLE_QUERY; } diff --git a/tests/queries/0_stateless/02707_clickhouse_local_implicit_file_table_function.sh b/tests/queries/0_stateless/02707_clickhouse_local_implicit_file_table_function.sh index 4d8d7b1395a..eea1e47ba7f 100755 --- a/tests/queries/0_stateless/02707_clickhouse_local_implicit_file_table_function.sh +++ b/tests/queries/0_stateless/02707_clickhouse_local_implicit_file_table_function.sh @@ -4,8 +4,8 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . "$CURDIR"/../shell_config.sh -dir=02707_clickhouse_local_tmp -[[ -d $dir ]] && rm -r $dir +dir=${CLICKHOUSE_TEST_UNIQUE_NAME} +[[ -d $dir ]] && rm -rd $dir mkdir $dir # Create temporary csv file for tests echo '"id","str","int","text"' > $dir/tmp.csv @@ -18,15 +18,15 @@ echo '4,"history",21321321,"cabaaba"' >> $dir/tmp.csv echo "Test 1: check explicit and implicit call of the file table function" echo "explicit:" -$CLICKHOUSE_LOCAL -q 'SELECT COUNT(*) FROM file("02707_clickhouse_local_tmp/tmp.csv")' +$CLICKHOUSE_LOCAL -q "SELECT COUNT(*) FROM file('${dir}/tmp.csv')" echo "implicit:" -$CLICKHOUSE_LOCAL -q 'SELECT COUNT(*) FROM "02707_clickhouse_local_tmp/tmp.csv"' +$CLICKHOUSE_LOCAL -q "SELECT COUNT(*) FROM \"${dir}/tmp.csv\"" ################# echo "Test 2: check FileSystem database" $CLICKHOUSE_LOCAL --multiline --multiquery -q """ DROP DATABASE IF EXISTS test; -CREATE DATABASE test ENGINE = FileSystem('02707_clickhouse_local_tmp'); +CREATE DATABASE test ENGINE = FileSystem('${dir}'); SELECT COUNT(*) FROM test.\`tmp.csv\`; DROP DATABASE test; """ @@ -35,9 +35,9 @@ DROP DATABASE test; echo "Test 3: check show database with FileSystem" $CLICKHOUSE_LOCAL --multiline --multiquery -q """ DROP DATABASE IF EXISTS test02707; -CREATE DATABASE test02707 ENGINE = FileSystem('02707_clickhouse_local_tmp'); +CREATE DATABASE test02707 ENGINE = FileSystem('${dir}'); SHOW DATABASES; DROP DATABASE test02707; """ | grep "test02707" -rm -r $dir \ No newline at end of file +rm -rd $dir From 21d5846cabd0717184f44d98b8480fefc683e807 Mon Sep 17 00:00:00 2001 From: Aleksei Golub Date: Tue, 18 Apr 2023 18:12:11 +0300 Subject: [PATCH 0024/1072] Fix test --- .../02707_clickhouse_local_implicit_file_table_function.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/queries/0_stateless/02707_clickhouse_local_implicit_file_table_function.sh b/tests/queries/0_stateless/02707_clickhouse_local_implicit_file_table_function.sh index eea1e47ba7f..24de0ad579c 100755 --- a/tests/queries/0_stateless/02707_clickhouse_local_implicit_file_table_function.sh +++ b/tests/queries/0_stateless/02707_clickhouse_local_implicit_file_table_function.sh @@ -7,6 +7,7 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) dir=${CLICKHOUSE_TEST_UNIQUE_NAME} [[ -d $dir ]] && rm -rd $dir mkdir $dir + # Create temporary csv file for tests echo '"id","str","int","text"' > $dir/tmp.csv echo '1,"abc",123,"abacaba"' >> $dir/tmp.csv @@ -40,4 +41,5 @@ SHOW DATABASES; DROP DATABASE test02707; """ | grep "test02707" +# Remove temporary dir with files rm -rd $dir From 8bef8fc1de5acf9910f83b978c8b91768da7f670 Mon Sep 17 00:00:00 2001 From: pufit Date: Wed, 26 Apr 2023 00:54:28 -0400 Subject: [PATCH 0025/1072] Parsing Keeper commands via ClickHouse Parser --- programs/keeper-client/CMakeLists.txt | 2 +- programs/keeper-client/Commands.cpp | 184 ++++++++++++++++++++++++ programs/keeper-client/Commands.h | 131 +++++++++++++++++ programs/keeper-client/KeeperClient.cpp | 122 +++++----------- programs/keeper-client/KeeperClient.h | 39 ++--- programs/keeper-client/Parser.cpp | 94 ++++++++++++ programs/keeper-client/Parser.h | 36 +++++ src/Parsers/TokenIterator.cpp | 4 +- src/Parsers/TokenIterator.h | 2 +- src/Parsers/parseQuery.cpp | 5 +- src/Parsers/parseQuery.h | 3 +- 11 files changed, 514 insertions(+), 108 deletions(-) create mode 100644 programs/keeper-client/Commands.cpp create mode 100644 programs/keeper-client/Commands.h create mode 100644 programs/keeper-client/Parser.cpp create mode 100644 programs/keeper-client/Parser.h diff --git a/programs/keeper-client/CMakeLists.txt b/programs/keeper-client/CMakeLists.txt index 06055d6d820..f54892fe559 100644 --- a/programs/keeper-client/CMakeLists.txt +++ b/programs/keeper-client/CMakeLists.txt @@ -1,4 +1,4 @@ -set (CLICKHOUSE_KEEPER_CLIENT_SOURCES KeeperClient.cpp) +set (CLICKHOUSE_KEEPER_CLIENT_SOURCES KeeperClient.cpp Parser.cpp Commands.cpp) set (CLICKHOUSE_KEEPER_CLIENT_LINK PRIVATE diff --git a/programs/keeper-client/Commands.cpp b/programs/keeper-client/Commands.cpp new file mode 100644 index 00000000000..a21550e969d --- /dev/null +++ b/programs/keeper-client/Commands.cpp @@ -0,0 +1,184 @@ + +#include "Commands.h" +#include "KeeperClient.h" + + +namespace DB +{ + +bool LSCommand::parse(IParser::Pos & pos, std::shared_ptr & node, Expected & expected) const +{ + String arg; + if (!parseKeeperPath(pos, expected, arg)) + return true; + + node->args.push_back(std::move(arg)); + return true; +} + +void LSCommand::execute(const ASTKeeperQuery * query, KeeperClient * client) const +{ + String path; + if (!query->args.empty()) + path = client->getAbsolutePath(query->args[0].safeGet()); + else + path = client->cwd; + + const auto children = client->zookeeper->getChildren(path); + for (const auto & child : children) + std::cout << child << " "; + std::cout << "\n"; +} + +bool CDCommand::parse(IParser::Pos & pos, std::shared_ptr & node, Expected & expected) const +{ + String arg; + if (!parseKeeperPath(pos, expected, arg)) + return true; + + node->args.push_back(std::move(arg)); + return true; +} + +void CDCommand::execute(const ASTKeeperQuery * query, KeeperClient * client) const +{ + if (!query->args.empty()) + return; + + auto new_path = client->getAbsolutePath(query->args[0].safeGet()); + if (!client->zookeeper->exists(new_path)) + std::cerr << "Path " << new_path << " does not exists\n"; + else + client->cwd = new_path; +} + +bool SetCommand::parse(IParser::Pos & pos, std::shared_ptr & node, Expected & expected) const +{ + String arg; + if (!parseKeeperPath(pos, expected, arg)) + return false; + node->args.push_back(std::move(arg)); + + if (!parseKeeperArg(pos, expected, arg)) + return false; + node->args.push_back(std::move(arg)); + + ASTPtr version; + if (ParserNumber{}.parse(pos, version, expected)) + node->args.push_back(version->as().value); + + return true; +} + +void SetCommand::execute(const ASTKeeperQuery * query, KeeperClient * client) const +{ + if (query->args.size() == 2) + client->zookeeper->set(client->getAbsolutePath(query->args[0].safeGet()), query->args[1].safeGet()); + else + client->zookeeper->set( + client->getAbsolutePath(query->args[0].safeGet()), + query->args[1].safeGet(), + static_cast(query->args[2].safeGet())); +} + +bool CreateCommand::parse(IParser::Pos & pos, std::shared_ptr & node, Expected & expected) const +{ + String arg; + if (!parseKeeperPath(pos, expected, arg)) + return false; + node->args.push_back(std::move(arg)); + + if (!parseKeeperArg(pos, expected, arg)) + return false; + node->args.push_back(std::move(arg)); + + return true; +} + +void CreateCommand::execute(const ASTKeeperQuery * query, KeeperClient * client) const +{ + client->zookeeper->create( + client->getAbsolutePath(query->args[0].safeGet()), + query->args[1].safeGet(), + zkutil::CreateMode::Persistent); +} + +bool GetCommand::parse(IParser::Pos & pos, std::shared_ptr & node, Expected & expected) const +{ + String arg; + if (!parseKeeperPath(pos, expected, arg)) + return false; + node->args.push_back(std::move(arg)); + + return true; +} + +void GetCommand::execute(const ASTKeeperQuery * query, KeeperClient * client) const +{ + std::cout << client->zookeeper->get(client->getAbsolutePath(query->args[0].safeGet())) << "\n"; +} + +bool RMCommand::parse(IParser::Pos & pos, std::shared_ptr & node, Expected & expected) const +{ + String arg; + if (!parseKeeperPath(pos, expected, arg)) + return false; + node->args.push_back(std::move(arg)); + + return true; +} + +void RMCommand::execute(const ASTKeeperQuery * query, KeeperClient * client) const +{ + client->zookeeper->remove(client->getAbsolutePath(query->args[0].safeGet())); +} + +bool RMRCommand::parse(IParser::Pos & pos, std::shared_ptr & node, Expected & expected) const +{ + String arg; + if (!parseKeeperPath(pos, expected, arg)) + return false; + node->args.push_back(std::move(arg)); + + return true; +} + +void RMRCommand::execute(const ASTKeeperQuery * query, KeeperClient * client) const +{ + String path = client->getAbsolutePath(query->args[0].safeGet()); + client->askConfirmation("You are going to recursively delete path " + path, + [client, path]{ client->zookeeper->removeRecursive(path); }); +} + +bool HelpCommand::parse(IParser::Pos & /* pos */, std::shared_ptr & /* node */, Expected & /* expected */) const +{ + return true; +} + +void HelpCommand::execute(const ASTKeeperQuery * /* query */, KeeperClient * /* client */) const +{ + for (const auto & pair : KeeperClient::commands) + std::cout << pair.second->getHelpMessage() << '\n'; +} + +bool FourLetterWordCommand::parse(IParser::Pos & pos, std::shared_ptr & node, Expected & expected) const +{ + expected.add(pos, "four-letter-word command"); + if (pos->type != TokenType::BareWord) + return false; + + String cmd(pos->begin, pos->end); + if (cmd.size() != 4) + return false; + + ++pos; + node->args.push_back(std::move(cmd)); + return true; +} + +void FourLetterWordCommand::execute(const ASTKeeperQuery * query, KeeperClient * client) const +{ + std::cout << client->executeFourLetterCommand(query->args[0].safeGet()) << "\n"; +} + +} diff --git a/programs/keeper-client/Commands.h b/programs/keeper-client/Commands.h new file mode 100644 index 00000000000..e4debd53e42 --- /dev/null +++ b/programs/keeper-client/Commands.h @@ -0,0 +1,131 @@ +#pragma once + +#include "Parser.h" + +namespace DB +{ + +class KeeperClient; + +class IKeeperClientCommand +{ +public: + static const String name; + + virtual bool parse(IParser::Pos & pos, std::shared_ptr & node, Expected & expected) const = 0; + + virtual void execute(const ASTKeeperQuery * query, KeeperClient * client) const = 0; + + virtual String getHelpMessage() const = 0; + + virtual String getName() const = 0; + + virtual ~IKeeperClientCommand() = default; +}; + +using Command = std::shared_ptr; + + +class LSCommand : public IKeeperClientCommand +{ + String getName() const override { return "ls"; } + + bool parse(IParser::Pos & pos, std::shared_ptr & node, Expected & expected) const override; + + void execute(const ASTKeeperQuery * query, KeeperClient * client) const override; + + String getHelpMessage() const override { return "ls [path] -- Lists the nodes for the given path (default: cwd)"; } +}; + +class CDCommand : public IKeeperClientCommand +{ + String getName() const override { return "cd"; } + + bool parse(IParser::Pos & pos, std::shared_ptr & node, Expected & expected) const override; + + void execute(const ASTKeeperQuery * query, KeeperClient * client) const override; + + String getHelpMessage() const override { return "cd [path] -- Change the working path (default `.`)"; } +}; + +class SetCommand : public IKeeperClientCommand +{ + String getName() const override { return "set"; } + + bool parse(IParser::Pos & pos, std::shared_ptr & node, Expected & expected) const override; + + void execute(const ASTKeeperQuery * query, KeeperClient * client) const override; + + String getHelpMessage() const override + { + return "set [version] -- Updates the node's value. Only update if version matches (default: -1)"; + } +}; + +class CreateCommand : public IKeeperClientCommand +{ + String getName() const override { return "create"; } + + bool parse(IParser::Pos & pos, std::shared_ptr & node, Expected & expected) const override; + + void execute(const ASTKeeperQuery * query, KeeperClient * client) const override; + + String getHelpMessage() const override { return "create -- Creates new node"; } +}; + +class GetCommand : public IKeeperClientCommand +{ + String getName() const override { return "get"; } + + bool parse(IParser::Pos & pos, std::shared_ptr & node, Expected & expected) const override; + + void execute(const ASTKeeperQuery * query, KeeperClient * client) const override; + + String getHelpMessage() const override { return "get -- Returns the node's value"; } +}; + +class RMCommand : public IKeeperClientCommand +{ + String getName() const override { return "rm"; } + + bool parse(IParser::Pos & pos, std::shared_ptr & node, Expected & expected) const override; + + void execute(const ASTKeeperQuery * query, KeeperClient * client) const override; + + String getHelpMessage() const override { return "remove -- Remove the node"; } +}; + +class RMRCommand : public IKeeperClientCommand +{ + String getName() const override { return "rmr"; } + + bool parse(IParser::Pos & pos, std::shared_ptr & node, Expected & expected) const override; + + void execute(const ASTKeeperQuery * query, KeeperClient * client) const override; + + String getHelpMessage() const override { return "rmr -- Recursively deletes path. Confirmation required"; } +}; + +class HelpCommand : public IKeeperClientCommand +{ + String getName() const override { return "help"; } + + bool parse(IParser::Pos & pos, std::shared_ptr & node, Expected & expected) const override; + + void execute(const ASTKeeperQuery * query, KeeperClient * client) const override; + + String getHelpMessage() const override { return "help -- Prints this message"; } +}; + +class FourLetterWordCommand : public IKeeperClientCommand +{ + String getName() const override { return "flwc"; } + + bool parse(IParser::Pos & pos, std::shared_ptr & node, Expected & expected) const override; + + void execute(const ASTKeeperQuery * query, KeeperClient * client) const override; + + String getHelpMessage() const override { return "flwc -- Executes four-letter-word command"; } +}; + +} diff --git a/programs/keeper-client/KeeperClient.cpp b/programs/keeper-client/KeeperClient.cpp index 52a31a388cc..92aa822231d 100644 --- a/programs/keeper-client/KeeperClient.cpp +++ b/programs/keeper-client/KeeperClient.cpp @@ -1,9 +1,11 @@ #include "KeeperClient.h" +#include "Commands.h" #include #include #include #include #include +#include #include @@ -13,13 +15,6 @@ namespace fs = std::filesystem; namespace DB { -static const NameSet four_letter_word_commands -{ - "ruok", "mntr", "srvr", "stat", "srst", "conf", - "cons", "crst", "envi", "dirs", "isro", "wchs", - "wchc", "wchp", "dump", "csnp", "lgif", "rqld", -}; - namespace ErrorCodes { extern const int BAD_ARGUMENTS; @@ -54,7 +49,7 @@ void KeeperClient::askConfirmation(const String & prompt, std::function confirmation_callback = callback; } -String KeeperClient::getAbsolutePath(const String & relative) +String KeeperClient::getAbsolutePath(const String & relative) const { String result; if (relative.starts_with('/')) @@ -68,16 +63,20 @@ String KeeperClient::getAbsolutePath(const String & relative) return result; } -void KeeperClient::loadCommands(std::vector> && new_commands) +void KeeperClient::loadCommands(std::vector && new_commands) { - for (const auto & [name, args_count, callback] : new_commands) + std::vector suggestions; + for (const auto & command : new_commands) { - commands.insert({{name, args_count}, callback}); - suggest.addWords({name}); + String name = command->getName(); + commands.insert({name, command}); + suggestions.push_back(std::move(name)); } for (const auto & command : four_letter_word_commands) - suggest.addWords({command}); + suggestions.push_back(command); + + suggest.addWords(std::move(suggestions)); } void KeeperClient::defineOptions(Poco::Util::OptionSet & options) @@ -132,61 +131,15 @@ void KeeperClient::defineOptions(Poco::Util::OptionSet & options) void KeeperClient::initialize(Poco::Util::Application & /* self */) { loadCommands({ - {"set", 2, [](KeeperClient * client, const std::vector & args) - { - client->zookeeper->set(client->getAbsolutePath(args[1]), args[2]); - }}, - - {"create", 2, [](KeeperClient * client, const std::vector & args) - { - client->zookeeper->create(client->getAbsolutePath(args[1]), args[2], zkutil::CreateMode::Persistent); - }}, - - {"get", 1, [](KeeperClient * client, const std::vector & args) - { - std::cout << client->zookeeper->get(client->getAbsolutePath(args[1])) << "\n"; - }}, - - {"ls", 0, [](KeeperClient * client, const std::vector & /* args */) - { - auto children = client->zookeeper->getChildren(client->cwd); - for (auto & child : children) - std::cout << child << " "; - std::cout << "\n"; - }}, - - {"ls", 1, [](KeeperClient * client, const std::vector & args) - { - auto children = client->zookeeper->getChildren(client->getAbsolutePath(args[1])); - for (auto & child : children) - std::cout << child << " "; - std::cout << "\n"; - }}, - - {"cd", 0, [](KeeperClient * /* client */, const std::vector & /* args */) - { - }}, - - {"cd", 1, [](KeeperClient * client, const std::vector & args) - { - auto new_path = client->getAbsolutePath(args[1]); - if (!client->zookeeper->exists(new_path)) - std::cerr << "Path " << new_path << " does not exists\n"; - else - client->cwd = new_path; - }}, - - {"rm", 1, [](KeeperClient * client, const std::vector & args) - { - client->zookeeper->remove(client->getAbsolutePath(args[1])); - }}, - - {"rmr", 1, [](KeeperClient * client, const std::vector & args) - { - String path = client->getAbsolutePath(args[1]); - client->askConfirmation("You are going to recursively delete path " + path, - [client, path]{ client->zookeeper->removeRecursive(path); }); - }}, + std::make_shared(), + std::make_shared(), + std::make_shared(), + std::make_shared(), + std::make_shared(), + std::make_shared(), + std::make_shared(), + std::make_shared(), + std::make_shared(), }); String home_path; @@ -234,32 +187,31 @@ bool KeeperClient::processQueryText(const String & text) if (exit_strings.find(text) != exit_strings.end()) return false; - std::vector tokens; - boost::algorithm::split(tokens, text, boost::is_any_of(" ")); - try { if (need_confirmation) { need_confirmation = false; - if (tokens.size() == 1 && (tokens[0] == "y" || tokens[0] == "Y")) + if (text.size() == 1 && (text == "y" || text == "Y")) confirmation_callback(); + return true; } - else if (tokens.size() == 1 && tokens[0].size() == 4 && four_letter_word_commands.find(tokens[0]) != four_letter_word_commands.end()) - std::cout << executeFourLetterCommand(tokens[0]) << "\n"; - else + + KeeperParser parser; + String message; + const char * begin = text.data(); + ASTPtr res = tryParseQuery(parser, begin, begin + text.size(), message, true, "", false, 0, 0, false); + + if (!res) { - auto callback = commands.find({tokens[0], tokens.size() - 1}); - if (callback == commands.end()) - { - if (tokens[0].size() == 4 && tokens.size() == 1) /// Treat it like unrecognized four-letter command - std::cout << executeFourLetterCommand(tokens[0]) << "\n"; - else - std::cerr << "No command found with name " << tokens[0] << " and args count " << tokens.size() - 1 << "\n"; - } - else - callback->second(this, tokens); + std::cerr << message << "\n"; + return true; } + + auto * query = res->as(); + + auto command = KeeperClient::commands.find(query->command); + command->second->execute(query, this); } catch (Coordination::Exception & err) { diff --git a/programs/keeper-client/KeeperClient.h b/programs/keeper-client/KeeperClient.h index 50a8b35f6c5..0297491bd28 100644 --- a/programs/keeper-client/KeeperClient.h +++ b/programs/keeper-client/KeeperClient.h @@ -1,10 +1,12 @@ #pragma once - +#include "Parser.h" +#include "Commands.h" #include #include #include #include +#include #include #include #include @@ -13,13 +15,16 @@ namespace DB { -class KeeperClient; +static const NameSet four_letter_word_commands + { + "ruok", "mntr", "srvr", "stat", "srst", "conf", + "cons", "crst", "envi", "dirs", "isro", "wchs", + "wchc", "wchp", "dump", "csnp", "lgif", "rqld", + }; class KeeperClient: public Poco::Util::Application { public: - using Callback = std::function &)>; - KeeperClient() = default; void initialize(Poco::Util::Application & self) override; @@ -28,29 +33,31 @@ public: void defineOptions(Poco::Util::OptionSet & options) override; -protected: - void runInteractive(); - void loadCommands(std::vector> && new_commands); - bool processQueryText(const String & text); - void executeQuery(const String & query); + String getAbsolutePath(const String & relative) const; + + void askConfirmation(const String & prompt, std::function && callback); String executeFourLetterCommand(const String & command); - String getAbsolutePath(const String & relative); - void askConfirmation(const String & prompt, std::function && callback); + zkutil::ZooKeeperPtr zookeeper; + std::filesystem::path cwd = "/"; + std::function confirmation_callback; - std::map, Callback> commands; + inline static std::map commands; + +protected: + void runInteractive(); + bool processQueryText(const String & text); + void executeQuery(const String & query); + + void loadCommands(std::vector && new_commands); String history_file; LineReader::Suggest suggest; - zkutil::ZooKeeperPtr zookeeper; zkutil::ZooKeeperArgs zk_args; - std::filesystem::path cwd = "/"; - bool need_confirmation = false; - std::function confirmation_callback; }; } diff --git a/programs/keeper-client/Parser.cpp b/programs/keeper-client/Parser.cpp new file mode 100644 index 00000000000..0f3fc39704e --- /dev/null +++ b/programs/keeper-client/Parser.cpp @@ -0,0 +1,94 @@ +#include "Parser.h" +#include "KeeperClient.h" + + +namespace DB +{ + +bool parseKeeperArg(IParser::Pos & pos, Expected & expected, String & result) +{ + expected.add(pos, getTokenName(TokenType::BareWord)); + + if (pos->type == TokenType::BareWord) + { + result = String(pos->begin, pos->end); + ++pos; + ParserToken{TokenType::Whitespace}.ignore(pos); + return true; + } + + bool status = parseIdentifierOrStringLiteral(pos, expected, result); + ParserToken{TokenType::Whitespace}.ignore(pos); + return status; +} + +bool parseKeeperPath(IParser::Pos & pos, Expected & expected, String & path) +{ + expected.add(pos, "path"); + + if (pos->type == TokenType::QuotedIdentifier || pos->type == TokenType::StringLiteral) + return parseIdentifierOrStringLiteral(pos, expected, path); + + String result; + while (pos->type == TokenType::BareWord || pos->type == TokenType::Slash || pos->type == TokenType::Dot) + { + result.append(pos->begin, pos->end); + ++pos; + } + ParserToken{TokenType::Whitespace}.ignore(pos); + + if (result.empty()) + return false; + + path = result; + return true; +} + +bool KeeperParser::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) +{ + auto query = std::make_shared(); + + for (const auto & pair : KeeperClient::commands) + expected.add(pos, pair.first.data()); + + for (const auto & flwc : four_letter_word_commands) + expected.add(pos, flwc.data()); + + if (pos->type != TokenType::BareWord) + return false; + + String command_name(pos->begin, pos->end); + Command command; + + auto iter = KeeperClient::commands.find(command_name); + if (iter == KeeperClient::commands.end()) + { + if (command_name.size() == 4) + { + /// Treat it like four-letter command + /// Since keeper server can potentially have different version we don't want to match this command with embedded list + command = std::make_shared(); + command_name = command->getName(); + /// We also don't move the position, so the command will be parsed as an argument + } + else + return false; + } + else + { + command = iter->second; + ++pos; + ParserToken{TokenType::Whitespace}.ignore(pos); + } + + query->command = command_name; + if (!command->parse(pos, query, expected)) + return false; + + ParserToken{TokenType::Whitespace}.ignore(pos); + + node = query; + return true; +} + +} diff --git a/programs/keeper-client/Parser.h b/programs/keeper-client/Parser.h new file mode 100644 index 00000000000..57ee6ce4a18 --- /dev/null +++ b/programs/keeper-client/Parser.h @@ -0,0 +1,36 @@ +#pragma once + +#include +#include +#include +#include +#include + + +namespace DB +{ + +bool parseKeeperArg(IParser::Pos & pos, Expected & expected, String & result); + +bool parseKeeperPath(IParser::Pos & pos, Expected & expected, String & path); + + +class ASTKeeperQuery : public IAST +{ +public: + String getID(char) const override { return "KeeperQuery"; } + ASTPtr clone() const override { return std::make_shared(*this); } + + String command; + std::vector args; +}; + +class KeeperParser : public IParserBase +{ +protected: + const char * getName() const override { return "Keeper client query"; } + + bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; +}; + +} diff --git a/src/Parsers/TokenIterator.cpp b/src/Parsers/TokenIterator.cpp index 6633ddb9563..6b798f6f576 100644 --- a/src/Parsers/TokenIterator.cpp +++ b/src/Parsers/TokenIterator.cpp @@ -4,7 +4,7 @@ namespace DB { -Tokens::Tokens(const char * begin, const char * end, size_t max_query_size) +Tokens::Tokens(const char * begin, const char * end, size_t max_query_size, bool skipp_insignificant) { Lexer lexer(begin, end, max_query_size); @@ -13,7 +13,7 @@ Tokens::Tokens(const char * begin, const char * end, size_t max_query_size) { Token token = lexer.nextToken(); stop = token.isEnd() || token.type == TokenType::ErrorMaxQuerySizeExceeded; - if (token.isSignificant()) + if (token.isSignificant() || (!skipp_insignificant && !data.empty() && data.back().isSignificant())) data.emplace_back(std::move(token)); } while (!stop); } diff --git a/src/Parsers/TokenIterator.h b/src/Parsers/TokenIterator.h index c9ac61dfef9..31cb644d879 100644 --- a/src/Parsers/TokenIterator.h +++ b/src/Parsers/TokenIterator.h @@ -24,7 +24,7 @@ private: std::size_t last_accessed_index = 0; public: - Tokens(const char * begin, const char * end, size_t max_query_size = 0); + Tokens(const char * begin, const char * end, size_t max_query_size = 0, bool skipp_insignificant = true); ALWAYS_INLINE inline const Token & operator[](size_t index) { diff --git a/src/Parsers/parseQuery.cpp b/src/Parsers/parseQuery.cpp index 8d794409f78..9f688f204a2 100644 --- a/src/Parsers/parseQuery.cpp +++ b/src/Parsers/parseQuery.cpp @@ -233,10 +233,11 @@ ASTPtr tryParseQuery( const std::string & query_description, bool allow_multi_statements, size_t max_query_size, - size_t max_parser_depth) + size_t max_parser_depth, + bool skipp_insignificant) { const char * query_begin = _out_query_end; - Tokens tokens(query_begin, all_queries_end, max_query_size); + Tokens tokens(query_begin, all_queries_end, max_query_size, skipp_insignificant); /// NOTE: consider use UInt32 for max_parser_depth setting. IParser::Pos token_iterator(tokens, static_cast(max_parser_depth)); diff --git a/src/Parsers/parseQuery.h b/src/Parsers/parseQuery.h index cc077bbdab2..30f43261103 100644 --- a/src/Parsers/parseQuery.h +++ b/src/Parsers/parseQuery.h @@ -18,7 +18,8 @@ ASTPtr tryParseQuery( bool allow_multi_statements, /// If false, check for non-space characters after semicolon and set error message if any. size_t max_query_size, /// If (end - pos) > max_query_size and query is longer than max_query_size then throws "Max query size exceeded". /// Disabled if zero. Is used in order to check query size if buffer can contains data for INSERT query. - size_t max_parser_depth); + size_t max_parser_depth, + bool skipp_insignificant = true); /// If true, lexer will skip all insignificant tokens (e.g. whitespaces) /// Parse query or throw an exception with error message. From a20f8e74a0cbf0b72ee657b3c6c302d16c61952e Mon Sep 17 00:00:00 2001 From: pufit Date: Wed, 26 Apr 2023 01:05:51 -0400 Subject: [PATCH 0026/1072] Fix `cd` command. Style consistency --- programs/keeper-client/Commands.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/programs/keeper-client/Commands.cpp b/programs/keeper-client/Commands.cpp index a21550e969d..7b7c4670b7d 100644 --- a/programs/keeper-client/Commands.cpp +++ b/programs/keeper-client/Commands.cpp @@ -42,7 +42,7 @@ bool CDCommand::parse(IParser::Pos & pos, std::shared_ptr & node void CDCommand::execute(const ASTKeeperQuery * query, KeeperClient * client) const { - if (!query->args.empty()) + if (query->args.empty()) return; auto new_path = client->getAbsolutePath(query->args[0].safeGet()); @@ -158,7 +158,7 @@ bool HelpCommand::parse(IParser::Pos & /* pos */, std::shared_ptrgetHelpMessage() << '\n'; + std::cout << pair.second->getHelpMessage() << "\n"; } bool FourLetterWordCommand::parse(IParser::Pos & pos, std::shared_ptr & node, Expected & expected) const From 2d2483d695f39fd8488e3667d77faaaa4177bd92 Mon Sep 17 00:00:00 2001 From: Aleksei Golub Date: Mon, 24 Apr 2023 21:50:40 +0300 Subject: [PATCH 0027/1072] Rename DatabaseFileSystem to DatabaseFilesystem --- src/Databases/DatabaseFactory.cpp | 16 +++++++------- ...eFileSystem.cpp => DatabaseFilesystem.cpp} | 22 +++++++++---------- ...abaseFileSystem.h => DatabaseFilesystem.h} | 8 +++---- src/Databases/DatabasesOverlay.cpp | 4 ++-- ...cal_implicit_file_table_function.reference | 4 ++-- ...ouse_local_implicit_file_table_function.sh | 8 +++---- 6 files changed, 31 insertions(+), 31 deletions(-) rename src/Databases/{DatabaseFileSystem.cpp => DatabaseFilesystem.cpp} (82%) rename src/Databases/{DatabaseFileSystem.h => DatabaseFilesystem.h} (83%) diff --git a/src/Databases/DatabaseFactory.cpp b/src/Databases/DatabaseFactory.cpp index 3356689d892..9950ab5bf45 100644 --- a/src/Databases/DatabaseFactory.cpp +++ b/src/Databases/DatabaseFactory.cpp @@ -3,11 +3,11 @@ #include #include #include +#include #include #include #include #include -#include #include #include #include @@ -15,10 +15,10 @@ #include #include #include -#include -#include #include +#include #include +#include #include "config.h" @@ -133,13 +133,13 @@ DatabasePtr DatabaseFactory::getImpl(const ASTCreateQuery & create, const String static const std::unordered_set database_engines{"Ordinary", "Atomic", "Memory", "Dictionary", "Lazy", "Replicated", "MySQL", "MaterializeMySQL", "MaterializedMySQL", - "PostgreSQL", "MaterializedPostgreSQL", "SQLite", "FileSystem"}; + "PostgreSQL", "MaterializedPostgreSQL", "SQLite", "Filesystem"}; if (!database_engines.contains(engine_name)) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Database engine name `{}` does not exist", engine_name); static const std::unordered_set engines_with_arguments{"MySQL", "MaterializeMySQL", "MaterializedMySQL", - "Lazy", "Replicated", "PostgreSQL", "MaterializedPostgreSQL", "SQLite", "FileSystem"}; + "Lazy", "Replicated", "PostgreSQL", "MaterializedPostgreSQL", "SQLite", "Filesystem"}; static const std::unordered_set engines_with_table_overrides{"MaterializeMySQL", "MaterializedMySQL", "MaterializedPostgreSQL"}; bool engine_may_have_arguments = engines_with_arguments.contains(engine_name); @@ -433,7 +433,7 @@ DatabasePtr DatabaseFactory::getImpl(const ASTCreateQuery & create, const String return std::make_shared(context, engine_define, create.attach, database_path); } #endif - else if (engine_name == "FileSystem") + else if (engine_name == "Filesystem") { const ASTFunction * engine = engine_define->engine; @@ -443,13 +443,13 @@ DatabasePtr DatabaseFactory::getImpl(const ASTCreateQuery & create, const String if (engine->arguments && !engine->arguments->children.empty()) { if (engine->arguments->children.size() != 1) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "FileSystem database requires at most 1 argument: file_system_path"); + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Filesystem database requires at most 1 argument: filesystem_path"); const auto & arguments = engine->arguments->children; init_path = safeGetLiteralValue(arguments[0], engine_name); } - return std::make_shared(database_name, init_path, context); + return std::make_shared(database_name, init_path, context); } throw Exception(ErrorCodes::UNKNOWN_DATABASE_ENGINE, "Unknown database engine: {}", engine_name); diff --git a/src/Databases/DatabaseFileSystem.cpp b/src/Databases/DatabaseFilesystem.cpp similarity index 82% rename from src/Databases/DatabaseFileSystem.cpp rename to src/Databases/DatabaseFilesystem.cpp index 8b92ad8080a..177b4717716 100644 --- a/src/Databases/DatabaseFileSystem.cpp +++ b/src/Databases/DatabaseFilesystem.cpp @@ -1,4 +1,4 @@ -#include +#include #include #include @@ -17,25 +17,25 @@ namespace DB { -DatabaseFileSystem::DatabaseFileSystem(const String & name_, const String & path_, ContextPtr context_) +DatabaseFilesystem::DatabaseFilesystem(const String & name_, const String & path_, ContextPtr context_) : IDatabase(name_), WithContext(context_->getGlobalContext()), path(path_), log(&Poco::Logger::get("DatabaseFileSystem(" + name_ + ")")) { if (path.empty()) path = Poco::Path::current(); } -std::string DatabaseFileSystem::getTablePath(const std::string& table_name) const +std::string DatabaseFilesystem::getTablePath(const std::string& table_name) const { return Poco::Path(path, table_name).toString(); } -void DatabaseFileSystem::addTable(const std::string& table_name, StoragePtr table_storage) const +void DatabaseFilesystem::addTable(const std::string& table_name, StoragePtr table_storage) const { std::lock_guard lock(mutex); loaded_tables.emplace(table_name, table_storage); } -bool DatabaseFileSystem::isTableExist(const String & name, ContextPtr) const +bool DatabaseFilesystem::isTableExist(const String & name, ContextPtr) const { { std::lock_guard lock(mutex); @@ -47,7 +47,7 @@ bool DatabaseFileSystem::isTableExist(const String & name, ContextPtr) const return table_file.exists() && table_file.isFile(); } -StoragePtr DatabaseFileSystem::tryGetTable(const String & name, ContextPtr context_) const +StoragePtr DatabaseFilesystem::tryGetTable(const String & name, ContextPtr context_) const { // Check if the table exists in the loaded tables map { @@ -85,12 +85,12 @@ StoragePtr DatabaseFileSystem::tryGetTable(const String & name, ContextPtr conte } } -ASTPtr DatabaseFileSystem::getCreateDatabaseQuery() const +ASTPtr DatabaseFilesystem::getCreateDatabaseQuery() const { auto settings = getContext()->getSettingsRef(); ParserCreateQuery parser; - String query = "CREATE DATABASE " + backQuoteIfNeed(getDatabaseName()) + " ENGINE = FileSystem(" + backQuoteIfNeed(path) + ")"; + const String query = fmt::format("CREATE DATABASE {} ENGINE = Filesystem({})", backQuoteIfNeed(getDatabaseName()), backQuoteIfNeed(path)); ASTPtr ast = parseQuery(parser, query.data(), query.data() + query.size(), "", 0, settings.max_parser_depth); if (const auto database_comment = getDatabaseComment(); !database_comment.empty()) @@ -102,7 +102,7 @@ ASTPtr DatabaseFileSystem::getCreateDatabaseQuery() const return ast; } -void DatabaseFileSystem::shutdown() +void DatabaseFilesystem::shutdown() { Tables tables_snapshot; { @@ -123,7 +123,7 @@ void DatabaseFileSystem::shutdown() /** * Returns an empty vector because the database is read-only and no tables can be backed up. */ -std::vector> DatabaseFileSystem::getTablesForBackup(const FilterByNameFunction&, const ContextPtr&) const +std::vector> DatabaseFilesystem::getTablesForBackup(const FilterByNameFunction&, const ContextPtr&) const { return {}; } @@ -133,7 +133,7 @@ std::vector> DatabaseFileSystem::getTablesForBacku * Returns an empty iterator because the database does not have its own tables * But only caches them for quick access. */ -DatabaseTablesIteratorPtr DatabaseFileSystem::getTablesIterator(ContextPtr, const FilterByNameFunction&) const +DatabaseTablesIteratorPtr DatabaseFilesystem::getTablesIterator(ContextPtr, const FilterByNameFunction&) const { return std::make_unique(Tables{}, getDatabaseName()); } diff --git a/src/Databases/DatabaseFileSystem.h b/src/Databases/DatabaseFilesystem.h similarity index 83% rename from src/Databases/DatabaseFileSystem.h rename to src/Databases/DatabaseFilesystem.h index 474a7e78335..d5fdd528aa5 100644 --- a/src/Databases/DatabaseFileSystem.h +++ b/src/Databases/DatabaseFilesystem.h @@ -12,18 +12,18 @@ namespace DB class Context; /** - * DatabaseFileSystem allows to interact with files stored on the file system + * DatabaseFilesystem allows to interact with files stored on the file system * Uses TableFunctionFile to implicitly load file when a user requests the table, and provides read-only access to the data in the file * Tables are cached inside the database for quick access * * Used in clickhouse-local to access local files */ -class DatabaseFileSystem : public IDatabase, protected WithContext +class DatabaseFilesystem : public IDatabase, protected WithContext { public: - DatabaseFileSystem(const String & name, const String & path, ContextPtr context); + DatabaseFilesystem(const String & name, const String & path, ContextPtr context); - String getEngineName() const override { return "FileSystem"; } + String getEngineName() const override { return "Filesystem"; } bool isTableExist(const String & name, ContextPtr context) const override; diff --git a/src/Databases/DatabasesOverlay.cpp b/src/Databases/DatabasesOverlay.cpp index da26f9282a0..3563fa715a6 100644 --- a/src/Databases/DatabasesOverlay.cpp +++ b/src/Databases/DatabasesOverlay.cpp @@ -4,8 +4,8 @@ #include #include +#include #include -#include #include @@ -258,7 +258,7 @@ DatabaseTablesIteratorPtr DatabasesOverlay::getTablesIterator(ContextPtr context DatabasePtr CreateClickHouseLocalDatabaseOverlay(const String & name_, ContextPtr context_) { auto databaseCombiner = std::make_shared(name_, context_); - databaseCombiner->registerNextDatabase(std::make_shared(name_, "", context_)); + databaseCombiner->registerNextDatabase(std::make_shared(name_, "", context_)); databaseCombiner->registerNextDatabase(std::make_shared(name_, context_)); return databaseCombiner; } diff --git a/tests/queries/0_stateless/02707_clickhouse_local_implicit_file_table_function.reference b/tests/queries/0_stateless/02707_clickhouse_local_implicit_file_table_function.reference index 0fcd843e737..ccc02ad4f34 100644 --- a/tests/queries/0_stateless/02707_clickhouse_local_implicit_file_table_function.reference +++ b/tests/queries/0_stateless/02707_clickhouse_local_implicit_file_table_function.reference @@ -3,7 +3,7 @@ explicit: 4 implicit: 4 -Test 2: check FileSystem database +Test 2: check Filesystem database 4 -Test 3: check show database with FileSystem +Test 3: check show database with Filesystem test02707 diff --git a/tests/queries/0_stateless/02707_clickhouse_local_implicit_file_table_function.sh b/tests/queries/0_stateless/02707_clickhouse_local_implicit_file_table_function.sh index 24de0ad579c..7c9095b3d8b 100755 --- a/tests/queries/0_stateless/02707_clickhouse_local_implicit_file_table_function.sh +++ b/tests/queries/0_stateless/02707_clickhouse_local_implicit_file_table_function.sh @@ -24,19 +24,19 @@ echo "implicit:" $CLICKHOUSE_LOCAL -q "SELECT COUNT(*) FROM \"${dir}/tmp.csv\"" ################# -echo "Test 2: check FileSystem database" +echo "Test 2: check Filesystem database" $CLICKHOUSE_LOCAL --multiline --multiquery -q """ DROP DATABASE IF EXISTS test; -CREATE DATABASE test ENGINE = FileSystem('${dir}'); +CREATE DATABASE test ENGINE = Filesystem('${dir}'); SELECT COUNT(*) FROM test.\`tmp.csv\`; DROP DATABASE test; """ ################# -echo "Test 3: check show database with FileSystem" +echo "Test 3: check show database with Filesystem" $CLICKHOUSE_LOCAL --multiline --multiquery -q """ DROP DATABASE IF EXISTS test02707; -CREATE DATABASE test02707 ENGINE = FileSystem('${dir}'); +CREATE DATABASE test02707 ENGINE = Filesystem('${dir}'); SHOW DATABASES; DROP DATABASE test02707; """ | grep "test02707" From 79ca39d920fbc52e92f6bbc9496bde2cc1afec42 Mon Sep 17 00:00:00 2001 From: Aleksei Golub Date: Mon, 24 Apr 2023 22:26:16 +0300 Subject: [PATCH 0028/1072] Fixed exception messages --- src/Databases/DatabasesOverlay.cpp | 42 +++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 6 deletions(-) diff --git a/src/Databases/DatabasesOverlay.cpp b/src/Databases/DatabasesOverlay.cpp index 3563fa715a6..c3af6d9305e 100644 --- a/src/Databases/DatabasesOverlay.cpp +++ b/src/Databases/DatabasesOverlay.cpp @@ -65,7 +65,12 @@ void DatabasesOverlay::createTable(ContextPtr context_, const String & table_nam continue; } } - throw Exception(ErrorCodes::LOGICAL_ERROR, "There is no databases for CREATE TABLE {} query in Database{}", table_name, getEngineName()); + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "There is no databases for CREATE TABLE `{}` query in database `{}` (engine {})", + table_name, + getDatabaseName(), + getEngineName()); } void DatabasesOverlay::dropTable(ContextPtr context_, const String & table_name, bool sync) @@ -82,7 +87,12 @@ void DatabasesOverlay::dropTable(ContextPtr context_, const String & table_name, continue; } } - throw Exception(ErrorCodes::LOGICAL_ERROR, "There is no databases for DROP TABLE {} query in Database{}", table_name, getEngineName()); + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "There is no databases for DROP TABLE `{}` query in database `{}` (engine {})", + table_name, + getDatabaseName(), + getEngineName()); } void DatabasesOverlay::attachTable( @@ -100,7 +110,12 @@ void DatabasesOverlay::attachTable( continue; } } - throw Exception(ErrorCodes::LOGICAL_ERROR, "There is no databases for ATTACH TABLE query in Database{}", getEngineName()); + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "There is no databases for ATTACH TABLE `{}` query in database `{}` (engine {})", + table_name, + getDatabaseName(), + getEngineName()); } StoragePtr DatabasesOverlay::detachTable(ContextPtr context_, const String & table_name) @@ -119,7 +134,12 @@ StoragePtr DatabasesOverlay::detachTable(ContextPtr context_, const String & tab continue; } } - throw Exception(ErrorCodes::LOGICAL_ERROR, "There is no databases for DETACH TABLE {} query in Database{}", table_name, getEngineName()); + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "There is no databases for DETACH TABLE `{}` query in database `{}` (engine {})", + table_name, + getDatabaseName(), + getEngineName()); } ASTPtr DatabasesOverlay::getCreateTableQueryImpl(const String & name, ContextPtr context_, bool throw_on_error) const @@ -132,7 +152,12 @@ ASTPtr DatabasesOverlay::getCreateTableQueryImpl(const String & name, ContextPtr break; } if (!result && throw_on_error) - throw Exception(ErrorCodes::CANNOT_GET_CREATE_TABLE_QUERY, "There is no metadata of table {} in Database{}", name, getEngineName()); + throw Exception( + ErrorCodes::CANNOT_GET_CREATE_TABLE_QUERY, + "There is no metadata of table `{}` in database `{}` (engine {})", + name, + getDatabaseName(), + getEngineName()); return result; } @@ -201,7 +226,12 @@ void DatabasesOverlay::alterTable(ContextPtr local_context, const StorageID & ta continue; } } - throw Exception(ErrorCodes::LOGICAL_ERROR, "There is no databases for alterTable in Database{}", getEngineName()); + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "There is no databases for ALTER TABLE `{}` query in database `{}` (engine {})", + table_id.table_name, + getDatabaseName(), + getEngineName()); } std::vector> From c9f8dd8bfd3d4123a0a7111f19d8863b19729d9a Mon Sep 17 00:00:00 2001 From: Aleksei Golub Date: Mon, 24 Apr 2023 22:53:32 +0300 Subject: [PATCH 0029/1072] Replaced Poco::File with std::filesystem --- src/Databases/DatabaseFactory.cpp | 2 +- src/Databases/DatabaseFilesystem.cpp | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/Databases/DatabaseFactory.cpp b/src/Databases/DatabaseFactory.cpp index 9950ab5bf45..8a50c31efc8 100644 --- a/src/Databases/DatabaseFactory.cpp +++ b/src/Databases/DatabaseFactory.cpp @@ -437,7 +437,7 @@ DatabasePtr DatabaseFactory::getImpl(const ASTCreateQuery & create, const String { const ASTFunction * engine = engine_define->engine; - // If init_path is empty, then the current path from Poco will be used + /// If init_path is empty, then the current path will be used std::string init_path; if (engine->arguments && !engine->arguments->children.empty()) diff --git a/src/Databases/DatabaseFilesystem.cpp b/src/Databases/DatabaseFilesystem.cpp index 177b4717716..1decb273ae1 100644 --- a/src/Databases/DatabaseFilesystem.cpp +++ b/src/Databases/DatabaseFilesystem.cpp @@ -8,11 +8,12 @@ #include #include #include -#include -#include #include #include +#include + +namespace fs = std::filesystem; namespace DB { @@ -21,12 +22,12 @@ DatabaseFilesystem::DatabaseFilesystem(const String & name_, const String & path : IDatabase(name_), WithContext(context_->getGlobalContext()), path(path_), log(&Poco::Logger::get("DatabaseFileSystem(" + name_ + ")")) { if (path.empty()) - path = Poco::Path::current(); + path = fs::current_path(); } std::string DatabaseFilesystem::getTablePath(const std::string& table_name) const { - return Poco::Path(path, table_name).toString(); + return fs::path(path) / table_name; } void DatabaseFilesystem::addTable(const std::string& table_name, StoragePtr table_storage) const @@ -43,8 +44,8 @@ bool DatabaseFilesystem::isTableExist(const String & name, ContextPtr) const return true; } - Poco::File table_file(getTablePath(name)); - return table_file.exists() && table_file.isFile(); + fs::path table_file_path(getTablePath(name)); + return fs::exists(table_file_path) && fs::is_regular_file(table_file_path); } StoragePtr DatabaseFilesystem::tryGetTable(const String & name, ContextPtr context_) const @@ -62,8 +63,7 @@ StoragePtr DatabaseFilesystem::tryGetTable(const String & name, ContextPtr conte try { // If the table doesn't exist in the tables map, check if the corresponding file exists - Poco::File table_file(table_path); - if (!table_file.exists()) + if (!fs::exists(table_path) || !fs::is_regular_file(table_path)) return nullptr; // If the file exists, create a new table using TableFunctionFile and return it. From 26812f36fb73ca8a3f1c16a0db54dd4327f7dc6c Mon Sep 17 00:00:00 2001 From: Aleksei Golub Date: Wed, 26 Apr 2023 01:13:29 +0300 Subject: [PATCH 0030/1072] Added read-only database setting; Fixed error messages for filesystem database; added tests --- src/Databases/DatabaseFilesystem.cpp | 46 ++++++++------- src/Databases/DatabaseFilesystem.h | 8 +++ src/Databases/DatabasesOverlay.cpp | 13 ++--- src/Databases/IDatabase.h | 4 +- src/Interpreters/DatabaseCatalog.cpp | 14 ++++- .../02722_database_filesystem.reference | 12 ++++ .../0_stateless/02722_database_filesystem.sh | 58 +++++++++++++++++++ 7 files changed, 124 insertions(+), 31 deletions(-) create mode 100644 tests/queries/0_stateless/02722_database_filesystem.reference create mode 100755 tests/queries/0_stateless/02722_database_filesystem.sh diff --git a/src/Databases/DatabaseFilesystem.cpp b/src/Databases/DatabaseFilesystem.cpp index 1decb273ae1..106885e7c3e 100644 --- a/src/Databases/DatabaseFilesystem.cpp +++ b/src/Databases/DatabaseFilesystem.cpp @@ -21,8 +21,7 @@ namespace DB DatabaseFilesystem::DatabaseFilesystem(const String & name_, const String & path_, ContextPtr context_) : IDatabase(name_), WithContext(context_->getGlobalContext()), path(path_), log(&Poco::Logger::get("DatabaseFileSystem(" + name_ + ")")) { - if (path.empty()) - path = fs::current_path(); + path = fs::path(path).lexically_normal().string(); } std::string DatabaseFilesystem::getTablePath(const std::string& table_name) const @@ -48,7 +47,7 @@ bool DatabaseFilesystem::isTableExist(const String & name, ContextPtr) const return fs::exists(table_file_path) && fs::is_regular_file(table_file_path); } -StoragePtr DatabaseFilesystem::tryGetTable(const String & name, ContextPtr context_) const +StoragePtr DatabaseFilesystem::getTableImpl(const String & name, ContextPtr context_) const { // Check if the table exists in the loaded tables map { @@ -60,24 +59,31 @@ StoragePtr DatabaseFilesystem::tryGetTable(const String & name, ContextPtr conte auto table_path = getTablePath(name); + // If the file exists, create a new table using TableFunctionFile and return it. + auto args = makeASTFunction("file", std::make_shared(table_path)); + + auto table_function = TableFunctionFactory::instance().get(args, context_); + if (!table_function) + return nullptr; + + auto table_storage = table_function->execute(args, context_, name); + if (table_storage) + addTable(name, table_storage); + + return table_storage; +} + +StoragePtr DatabaseFilesystem::getTable(const String & name, ContextPtr context_) const +{ + if (auto storage = getTableImpl(name, context_)) + return storage; + throw Exception(ErrorCodes::UNKNOWN_TABLE, "Table {}.{} doesn't exist", backQuoteIfNeed(getDatabaseName()), backQuoteIfNeed(name)); +} + +StoragePtr DatabaseFilesystem::tryGetTable(const String & name, ContextPtr context_) const { try { - // If the table doesn't exist in the tables map, check if the corresponding file exists - if (!fs::exists(table_path) || !fs::is_regular_file(table_path)) - return nullptr; - - // If the file exists, create a new table using TableFunctionFile and return it. - auto args = makeASTFunction("file", std::make_shared(table_path)); - - auto table_function = TableFunctionFactory::instance().get(args, context_); - if (!table_function) - return nullptr; - - auto table_storage = table_function->execute(args, context_, name); - if (table_storage) - addTable(name, table_storage); - - return table_storage; + return getTable(name, context_); } catch (...) { @@ -90,7 +96,7 @@ ASTPtr DatabaseFilesystem::getCreateDatabaseQuery() const auto settings = getContext()->getSettingsRef(); ParserCreateQuery parser; - const String query = fmt::format("CREATE DATABASE {} ENGINE = Filesystem({})", backQuoteIfNeed(getDatabaseName()), backQuoteIfNeed(path)); + const String query = fmt::format("CREATE DATABASE {} ENGINE = Filesystem('{}')", backQuoteIfNeed(getDatabaseName()), path); ASTPtr ast = parseQuery(parser, query.data(), query.data() + query.size(), "", 0, settings.max_parser_depth); if (const auto database_comment = getDatabaseComment(); !database_comment.empty()) diff --git a/src/Databases/DatabaseFilesystem.h b/src/Databases/DatabaseFilesystem.h index d5fdd528aa5..697511ac5b3 100644 --- a/src/Databases/DatabaseFilesystem.h +++ b/src/Databases/DatabaseFilesystem.h @@ -27,10 +27,14 @@ public: bool isTableExist(const String & name, ContextPtr context) const override; + StoragePtr getTable(const String & name, ContextPtr context) const override; + StoragePtr tryGetTable(const String & name, ContextPtr context) const override; bool empty() const override { return true; } + bool isReadOnly() const override { return true; } + ASTPtr getCreateDatabaseQuery() const override; void shutdown() override; @@ -39,9 +43,13 @@ public: DatabaseTablesIteratorPtr getTablesIterator(ContextPtr, const FilterByNameFunction &) const override; protected: + StoragePtr getTableImpl(const String & name, ContextPtr context) const; + std::string getTablePath(const std::string & table_name) const; + void addTable(const std::string & table_name, StoragePtr table_storage) const; + private: String path; mutable Tables loaded_tables TSA_GUARDED_BY(mutex); diff --git a/src/Databases/DatabasesOverlay.cpp b/src/Databases/DatabasesOverlay.cpp index c3af6d9305e..5a6a4fe5cc6 100644 --- a/src/Databases/DatabasesOverlay.cpp +++ b/src/Databases/DatabasesOverlay.cpp @@ -55,15 +55,11 @@ void DatabasesOverlay::createTable(ContextPtr context_, const String & table_nam { for (auto & db : databases) { - try + if (!db->isReadOnly()) { db->createTable(context_, table_name, table, query); return; } - catch (...) - { - continue; - } } throw Exception( ErrorCodes::LOGICAL_ERROR, @@ -218,8 +214,11 @@ void DatabasesOverlay::alterTable(ContextPtr local_context, const StorageID & ta { try { - db->alterTable(local_context, table_id, metadata); - return; + if (!db->isReadOnly()) + { + db->alterTable(local_context, table_id, metadata); + return; + } } catch (...) { diff --git a/src/Databases/IDatabase.h b/src/Databases/IDatabase.h index 53a2f372814..6508e2ce060 100644 --- a/src/Databases/IDatabase.h +++ b/src/Databases/IDatabase.h @@ -170,7 +170,7 @@ public: /// Get the table for work. Return nullptr if there is no table. virtual StoragePtr tryGetTable(const String & name, ContextPtr context) const = 0; - StoragePtr getTable(const String & name, ContextPtr context) const; + virtual StoragePtr getTable(const String & name, ContextPtr context) const; virtual UUID tryGetTableUUID(const String & /*table_name*/) const { return UUIDHelpers::Nil; } @@ -183,6 +183,8 @@ public: /// Is the database empty. virtual bool empty() const = 0; + virtual bool isReadOnly() const { return false; } + /// Add the table to the database. Record its presence in the metadata. virtual void createTable( ContextPtr /*context*/, diff --git a/src/Interpreters/DatabaseCatalog.cpp b/src/Interpreters/DatabaseCatalog.cpp index 8d3fa91a7fe..f9e74fadcbd 100644 --- a/src/Interpreters/DatabaseCatalog.cpp +++ b/src/Interpreters/DatabaseCatalog.cpp @@ -338,9 +338,17 @@ DatabaseAndTable DatabaseCatalog::getTableImpl( database = it->second; } - auto table = database->tryGetTable(table_id.table_name, context_); - if (!table && exception) - exception->emplace(Exception(ErrorCodes::UNKNOWN_TABLE, "Table {} doesn't exist", table_id.getNameForLogs())); + StoragePtr table = nullptr; + try + { + table = database->getTable(table_id.table_name, context_); + } + catch (const Exception & e) + { + if (exception) + exception->emplace(*e.clone()); + } + if (!table) database = nullptr; diff --git a/tests/queries/0_stateless/02722_database_filesystem.reference b/tests/queries/0_stateless/02722_database_filesystem.reference new file mode 100644 index 00000000000..a583f1e2e3c --- /dev/null +++ b/tests/queries/0_stateless/02722_database_filesystem.reference @@ -0,0 +1,12 @@ +Test 1: create filesystem database and check implicit calls +0 +test1 +4 +4 +4 +Test 2: check DatabaseFilesystem access rights on server +OK +OK +OK +OK +OK diff --git a/tests/queries/0_stateless/02722_database_filesystem.sh b/tests/queries/0_stateless/02722_database_filesystem.sh new file mode 100755 index 00000000000..0adeface438 --- /dev/null +++ b/tests/queries/0_stateless/02722_database_filesystem.sh @@ -0,0 +1,58 @@ +#!/usr/bin/env bash +# Tags: no-parallel + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +# see 01658_read_file_to_stringcolumn.sh +CLICKHOUSE_USER_FILES_PATH=$(clickhouse-client --query "select _path, _file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') + +# Prepare data +mkdir -p ${CLICKHOUSE_USER_FILES_PATH}/tmp/ +echo '"id","str","int","text"' > ${CLICKHOUSE_USER_FILES_PATH}/tmp.csv +echo '1,"abc",123,"abacaba"' >> ${CLICKHOUSE_USER_FILES_PATH}/tmp.csv +echo '2,"def",456,"bacabaa"' >> ${CLICKHOUSE_USER_FILES_PATH}/tmp.csv +echo '3,"story",78912,"acabaab"' >> ${CLICKHOUSE_USER_FILES_PATH}/tmp.csv +echo '4,"history",21321321,"cabaaba"' >> ${CLICKHOUSE_USER_FILES_PATH}/tmp.csv + +tmp_dir=${CLICKHOUSE_TEST_UNIQUE_NAME} +[[ -d $tmp_dir ]] && rm -rd $tmp_dir +mkdir $tmp_dir +cp ${CLICKHOUSE_USER_FILES_PATH}/tmp.csv ${tmp_dir}/tmp.csv +cp ${CLICKHOUSE_USER_FILES_PATH}/tmp.csv ${CLICKHOUSE_USER_FILES_PATH}/tmp/tmp.csv + +################# +echo "Test 1: create filesystem database and check implicit calls" +${CLICKHOUSE_CLIENT} --multiline --multiquery -q """ +DROP DATABASE IF EXISTS test1; +CREATE DATABASE test1 ENGINE = Filesystem; +""" +echo $? +${CLICKHOUSE_CLIENT} --query "SHOW DATABASES" | grep "test1" +${CLICKHOUSE_CLIENT} --query "SELECT COUNT(*) FROM test1.\`tmp.csv\`;" +${CLICKHOUSE_CLIENT} --query "SELECT COUNT(*) FROM test1.\`tmp/tmp.csv\`;" +${CLICKHOUSE_LOCAL} -q "SELECT COUNT(*) FROM \"${tmp_dir}/tmp.csv\"" + +################# +echo "Test 2: check DatabaseFilesystem access rights on server" +# Allows list files only inside user_files +${CLICKHOUSE_CLIENT} --query "SELECT COUNT(*) FROM test1.\`../tmp.csv\`;" 2>&1| grep -F "Code: 291" > /dev/null && echo "OK" +${CLICKHOUSE_CLIENT} --query "SELECT COUNT(*) FROM test1.\`/tmp/tmp.csv\`;" 2>&1| grep -F "Code: 291" > /dev/null && echo "OK" + +${CLICKHOUSE_CLIENT} --multiline --multiquery --query """ +USE test1; +SELECT COUNT(*) FROM \"../${tmp_dir}/tmp.csv\"; +""" 2>&1| grep -F "Code: 291" > /dev/null && echo "OK" +${CLICKHOUSE_CLIENT} --query "SELECT COUNT(*) FROM test1.\`../../../../../../tmp.csv\`;" 2>&1| grep -F "Code: 291" > /dev/null && echo "OK" +${CLICKHOUSE_CLIENT} --multiline --multiquery -q """ +DROP DATABASE IF EXISTS test2; +CREATE DATABASE test2 ENGINE = Filesystem('/tmp'); +SELECT COUNT(*) FROM test2.\`tmp.csv\`; +""" 2>&1| grep -F "Code: 291" > /dev/null && echo "OK" + +# Clean +${CLICKHOUSE_CLIENT} --query "DROP DATABASE test1;" +${CLICKHOUSE_CLIENT} --query "DROP DATABASE test2;" +rm -rd $tmp_dir +rm -rd $CLICKHOUSE_USER_FILES_PATH From 4606e660683992b630f9db952beda9b261f82d76 Mon Sep 17 00:00:00 2001 From: Aleksei Golub Date: Wed, 26 Apr 2023 11:06:01 +0300 Subject: [PATCH 0031/1072] Fix style --- src/Databases/DatabaseFilesystem.cpp | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/src/Databases/DatabaseFilesystem.cpp b/src/Databases/DatabaseFilesystem.cpp index 106885e7c3e..16aed185669 100644 --- a/src/Databases/DatabaseFilesystem.cpp +++ b/src/Databases/DatabaseFilesystem.cpp @@ -18,18 +18,23 @@ namespace fs = std::filesystem; namespace DB { +namespace ErrorCodes +{ + extern const int UNKNOWN_TABLE; +} + DatabaseFilesystem::DatabaseFilesystem(const String & name_, const String & path_, ContextPtr context_) : IDatabase(name_), WithContext(context_->getGlobalContext()), path(path_), log(&Poco::Logger::get("DatabaseFileSystem(" + name_ + ")")) { path = fs::path(path).lexically_normal().string(); } -std::string DatabaseFilesystem::getTablePath(const std::string& table_name) const +std::string DatabaseFilesystem::getTablePath(const std::string & table_name) const { return fs::path(path) / table_name; } -void DatabaseFilesystem::addTable(const std::string& table_name, StoragePtr table_storage) const +void DatabaseFilesystem::addTable(const std::string & table_name, StoragePtr table_storage) const { std::lock_guard lock(mutex); loaded_tables.emplace(table_name, table_storage); @@ -80,7 +85,8 @@ StoragePtr DatabaseFilesystem::getTable(const String & name, ContextPtr context_ throw Exception(ErrorCodes::UNKNOWN_TABLE, "Table {}.{} doesn't exist", backQuoteIfNeed(getDatabaseName()), backQuoteIfNeed(name)); } -StoragePtr DatabaseFilesystem::tryGetTable(const String & name, ContextPtr context_) const { +StoragePtr DatabaseFilesystem::tryGetTable(const String & name, ContextPtr context_) const +{ try { return getTable(name, context_); @@ -127,9 +133,9 @@ void DatabaseFilesystem::shutdown() } /** - * Returns an empty vector because the database is read-only and no tables can be backed up. + * Returns an empty vector because the database is read-only and no tables can be backed up */ -std::vector> DatabaseFilesystem::getTablesForBackup(const FilterByNameFunction&, const ContextPtr&) const +std::vector> DatabaseFilesystem::getTablesForBackup(const FilterByNameFunction &, const ContextPtr &) const { return {}; } @@ -137,9 +143,9 @@ std::vector> DatabaseFilesystem::getTablesForBacku /** * * Returns an empty iterator because the database does not have its own tables - * But only caches them for quick access. + * But only caches them for quick access */ -DatabaseTablesIteratorPtr DatabaseFilesystem::getTablesIterator(ContextPtr, const FilterByNameFunction&) const +DatabaseTablesIteratorPtr DatabaseFilesystem::getTablesIterator(ContextPtr, const FilterByNameFunction &) const { return std::make_unique(Tables{}, getDatabaseName()); } From ca1501aeb4e9c7a1db131f4c24255bd24bd99059 Mon Sep 17 00:00:00 2001 From: Aleksei Golub Date: Wed, 26 Apr 2023 13:05:56 +0300 Subject: [PATCH 0032/1072] retrigger checks From 1f90e9bde8ab740ae5fda958ca93f9c4abab6008 Mon Sep 17 00:00:00 2001 From: Aleksei Golub Date: Wed, 26 Apr 2023 14:37:41 +0300 Subject: [PATCH 0033/1072] retrigger checks From 2426c445b0f17a0c98be86463efda8bd552d18de Mon Sep 17 00:00:00 2001 From: pufit Date: Wed, 26 Apr 2023 20:10:32 -0400 Subject: [PATCH 0034/1072] Improve `set` command --- programs/keeper-client/Commands.cpp | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/programs/keeper-client/Commands.cpp b/programs/keeper-client/Commands.cpp index 7b7c4670b7d..05928a0d20b 100644 --- a/programs/keeper-client/Commands.cpp +++ b/programs/keeper-client/Commands.cpp @@ -24,8 +24,7 @@ void LSCommand::execute(const ASTKeeperQuery * query, KeeperClient * client) con else path = client->cwd; - const auto children = client->zookeeper->getChildren(path); - for (const auto & child : children) + for (const auto & child : client->zookeeper->getChildren(path)) std::cout << child << " "; std::cout << "\n"; } @@ -92,6 +91,19 @@ bool CreateCommand::parse(IParser::Pos & pos, std::shared_ptr & return false; node->args.push_back(std::move(arg)); + int mode = zkutil::CreateMode::Persistent; + + if (ParserKeyword{"PERSISTENT"}.ignore(pos, expected)) + mode = zkutil::CreateMode::Persistent; + else if (ParserKeyword{"EPHEMERAL"}.ignore(pos, expected)) + mode = zkutil::CreateMode::Ephemeral; + else if (ParserKeyword{"EPHEMERAL SEQUENTIAL"}.ignore(pos, expected)) + mode = zkutil::CreateMode::EphemeralSequential; + else if (ParserKeyword{"PERSISTENT SEQUENTIAL"}.ignore(pos, expected)) + mode = zkutil::CreateMode::PersistentSequential; + + node->args.push_back(mode); + return true; } @@ -100,7 +112,7 @@ void CreateCommand::execute(const ASTKeeperQuery * query, KeeperClient * client) client->zookeeper->create( client->getAbsolutePath(query->args[0].safeGet()), query->args[1].safeGet(), - zkutil::CreateMode::Persistent); + static_cast(query->args[2].safeGet())); } bool GetCommand::parse(IParser::Pos & pos, std::shared_ptr & node, Expected & expected) const From 9204e2e3de85563ecc8d24ac356e608f95880c6b Mon Sep 17 00:00:00 2001 From: pufit Date: Wed, 26 Apr 2023 20:11:45 -0400 Subject: [PATCH 0035/1072] Better completion --- programs/keeper-client/KeeperClient.cpp | 56 +++++++++++++++++++++---- programs/keeper-client/KeeperClient.h | 8 +++- src/Client/LineReader.cpp | 21 ++++++---- src/Client/LineReader.h | 5 +++ 4 files changed, 75 insertions(+), 15 deletions(-) diff --git a/programs/keeper-client/KeeperClient.cpp b/programs/keeper-client/KeeperClient.cpp index 92aa822231d..0ea4c4dde28 100644 --- a/programs/keeper-client/KeeperClient.cpp +++ b/programs/keeper-client/KeeperClient.cpp @@ -10,7 +10,6 @@ namespace po = boost::program_options; -namespace fs = std::filesystem; namespace DB { @@ -42,6 +41,48 @@ String KeeperClient::executeFourLetterCommand(const String & command) return result; } +std::vector KeeperClient::getCompletions(const String & prefix) const +{ + Tokens tokens(prefix.data(), prefix.data() + prefix.size(), 0, false); + IParser::Pos pos(tokens, 0); + + if (pos->type != TokenType::BareWord) + return registered_commands_and_four_letter_words; + + ++pos; + if (pos->isEnd()) + return registered_commands_and_four_letter_words; + + ParserToken{TokenType::Whitespace}.ignore(pos); + + std::vector result; + String string_path; + Expected expected; + if (!parseKeeperPath(pos, expected, string_path)) + string_path = cwd; + + if (!pos->isEnd()) + return result; + + fs::path path = string_path; + String parent_path; + if (string_path.ends_with("/")) + parent_path = getAbsolutePath(string_path); + else + parent_path = getAbsolutePath(path.parent_path()); + + try + { + for (const auto & child : zookeeper->getChildren(parent_path)) + result.push_back(child); + } + catch (Coordination::Exception &) {} + + std::sort(result.begin(), result.end()); + + return result; +} + void KeeperClient::askConfirmation(const String & prompt, std::function && callback) { std::cout << prompt << " Continue?\n"; @@ -49,7 +90,7 @@ void KeeperClient::askConfirmation(const String & prompt, std::function confirmation_callback = callback; } -String KeeperClient::getAbsolutePath(const String & relative) const +fs::path KeeperClient::getAbsolutePath(const String & relative) const { String result; if (relative.starts_with('/')) @@ -65,18 +106,17 @@ String KeeperClient::getAbsolutePath(const String & relative) const void KeeperClient::loadCommands(std::vector && new_commands) { - std::vector suggestions; for (const auto & command : new_commands) { String name = command->getName(); commands.insert({name, command}); - suggestions.push_back(std::move(name)); + registered_commands_and_four_letter_words.push_back(std::move(name)); } for (const auto & command : four_letter_word_commands) - suggestions.push_back(command); + registered_commands_and_four_letter_words.push_back(command); - suggest.addWords(std::move(suggestions)); + std::sort(registered_commands_and_four_letter_words.begin(), registered_commands_and_four_letter_words.end()); } void KeeperClient::defineOptions(Poco::Util::OptionSet & options) @@ -130,6 +170,9 @@ void KeeperClient::defineOptions(Poco::Util::OptionSet & options) void KeeperClient::initialize(Poco::Util::Application & /* self */) { + suggest.setCompletionsCallback( + [&](const String & prefix, size_t /* prefix_length */) { return getCompletions(prefix); }); + loadCommands({ std::make_shared(), std::make_shared(), @@ -248,7 +291,6 @@ void KeeperClient::runInteractive() int KeeperClient::main(const std::vector & /* args */) { - auto host = config().getString("host", "localhost"); auto port = config().getString("port", "2181"); zk_args.hosts = {host + ":" + port}; diff --git a/programs/keeper-client/KeeperClient.h b/programs/keeper-client/KeeperClient.h index 0297491bd28..e7fa5cdc5fe 100644 --- a/programs/keeper-client/KeeperClient.h +++ b/programs/keeper-client/KeeperClient.h @@ -12,6 +12,8 @@ #include +namespace fs = std::filesystem; + namespace DB { @@ -33,7 +35,7 @@ public: void defineOptions(Poco::Util::OptionSet & options) override; - String getAbsolutePath(const String & relative) const; + fs::path getAbsolutePath(const String & relative) const; void askConfirmation(const String & prompt, std::function && callback); @@ -52,12 +54,16 @@ protected: void loadCommands(std::vector && new_commands); + std::vector getCompletions(const String & prefix) const; + String history_file; LineReader::Suggest suggest; zkutil::ZooKeeperArgs zk_args; bool need_confirmation = false; + + std::vector registered_commands_and_four_letter_words; }; } diff --git a/src/Client/LineReader.cpp b/src/Client/LineReader.cpp index 04b387c9f7d..f6cd7bffef7 100644 --- a/src/Client/LineReader.cpp +++ b/src/Client/LineReader.cpp @@ -81,17 +81,24 @@ replxx::Replxx::completions_t LineReader::Suggest::getCompletions(const String & std::lock_guard lock(mutex); + Words to_search; /// Only perform case sensitive completion when the prefix string contains any uppercase characters if (std::none_of(prefix.begin(), prefix.end(), [](char32_t x) { return iswupper(static_cast(x)); })) - range = std::equal_range( - words_no_case.begin(), words_no_case.end(), last_word, [prefix_length](std::string_view s, std::string_view prefix_searched) - { - return strncasecmp(s.data(), prefix_searched.data(), prefix_length) < 0; - }); + to_search = words_no_case; else - range = std::equal_range(words.begin(), words.end(), last_word, [prefix_length](std::string_view s, std::string_view prefix_searched) + to_search = words; + + if (custom_completions_callback) + { + auto new_words = custom_completions_callback(prefix, prefix_length); + assert(std::is_sorted(new_words.begin(), new_words.end())); + addNewWords(to_search, new_words, std::less{}); + } + + range = std::equal_range( + to_search.begin(), to_search.end(), last_word, [prefix_length](std::string_view s, std::string_view prefix_searched) { - return strncmp(s.data(), prefix_searched.data(), prefix_length) < 0; + return strncasecmp(s.data(), prefix_searched.data(), prefix_length) < 0; }); return replxx::Replxx::completions_t(range.first, range.second); diff --git a/src/Client/LineReader.h b/src/Client/LineReader.h index 321cf41b77e..7d053df1458 100644 --- a/src/Client/LineReader.h +++ b/src/Client/LineReader.h @@ -18,15 +18,20 @@ public: struct Suggest { using Words = std::vector; + using Callback = std::function; /// Get vector for the matched range of words if any. replxx::Replxx::completions_t getCompletions(const String & prefix, size_t prefix_length); void addWords(Words && new_words); + void setCompletionsCallback(Callback && callback) { custom_completions_callback = callback; } + private: Words words TSA_GUARDED_BY(mutex); Words words_no_case TSA_GUARDED_BY(mutex); + Callback custom_completions_callback = nullptr; + std::mutex mutex; }; From d235fdd5722c724fc7824ae0f7336c053b3d7084 Mon Sep 17 00:00:00 2001 From: pufit Date: Wed, 26 Apr 2023 20:39:33 -0400 Subject: [PATCH 0036/1072] Produce help message --- programs/keeper-client/KeeperClient.cpp | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/programs/keeper-client/KeeperClient.cpp b/programs/keeper-client/KeeperClient.cpp index 0ea4c4dde28..f38da1b72aa 100644 --- a/programs/keeper-client/KeeperClient.cpp +++ b/programs/keeper-client/KeeperClient.cpp @@ -6,11 +6,9 @@ #include #include #include -#include +#include -namespace po = boost::program_options; - namespace DB { @@ -291,6 +289,15 @@ void KeeperClient::runInteractive() int KeeperClient::main(const std::vector & /* args */) { + if (config().hasOption("help")) + { + Poco::Util::HelpFormatter help_formatter(KeeperClient::options()); + auto header_str = fmt::format("{} [OPTION]\n", commandName()); + help_formatter.setHeader(header_str); + help_formatter.format(std::cout); + return 0; + } + auto host = config().getString("host", "localhost"); auto port = config().getString("port", "2181"); zk_args.hosts = {host + ":" + port}; From de03b905709bb6da3196b1b76fcbe60c14652a6e Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Thu, 27 Apr 2023 17:07:00 +0000 Subject: [PATCH 0037/1072] impl --- src/Core/Settings.h | 2 + ...chronousReadIndirectBufferFromRemoteFS.cpp | 26 +++++---- ...ynchronousReadIndirectBufferFromRemoteFS.h | 9 ++-- .../IO/CachedOnDiskReadBufferFromFile.cpp | 19 ++++++- src/Disks/IO/ThreadPoolRemoteFSReader.cpp | 2 +- src/Interpreters/Cache/FileCache.cpp | 38 ++++++++++--- src/Interpreters/Cache/FileCache.h | 4 +- src/Interpreters/Cache/FileCacheSettings.cpp | 2 + src/Interpreters/Cache/FileCacheSettings.h | 3 ++ src/Interpreters/Cache/FileCache_fwd.h | 2 +- src/Interpreters/Cache/FileSegment.cpp | 2 + .../FilesystemReadPrefetchesLog.cpp | 8 +-- .../FilesystemReadPrefetchesLog.h | 2 +- src/Interpreters/executeQuery.cpp | 2 + .../tests/gtest_lru_file_cache.cpp | 53 ++++++++++--------- .../MergeTree/MergeTreePrefetchedReadPool.cpp | 3 ++ src/Storages/MergeTree/MergeTreeReadPool.cpp | 42 ++++++++++++--- src/Storages/MergeTree/MergeTreeReadPool.h | 5 +- src/Storages/MergeTree/MergeTreeSource.cpp | 2 + 19 files changed, 160 insertions(+), 66 deletions(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 7f1fe838b80..465e27b0985 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -645,6 +645,8 @@ class IColumn; M(UInt64, merge_tree_min_rows_for_concurrent_read_for_remote_filesystem, (20 * 8192), "If at least as many lines are read from one file, the reading can be parallelized, when reading from remote filesystem.", 0) \ M(UInt64, merge_tree_min_bytes_for_concurrent_read_for_remote_filesystem, (24 * 10 * 1024 * 1024), "If at least as many bytes are read from one file, the reading can be parallelized, when reading from remote filesystem.", 0) \ M(UInt64, remote_read_min_bytes_for_seek, 4 * DBMS_DEFAULT_BUFFER_SIZE, "Min bytes required for remote read (url, s3) to do seek, instead of read with ignore.", 0) \ + M(UInt64, merge_tree_min_bytes_per_task_for_remote_reading, 4 * DBMS_DEFAULT_BUFFER_SIZE, "Min bytes to read per task.", 0) \ + M(Bool, merge_tree_use_const_size_tasks_for_remote_reading, true, "Whether to use constant size tasks for reading from a remote table.", 0) \ \ M(UInt64, async_insert_threads, 16, "Maximum number of threads to actually parse and insert data in background. Zero means asynchronous mode is disabled", 0) \ M(Bool, async_insert, false, "If true, data from INSERT query is stored in queue and later flushed to table in background. Makes sense only for inserts via HTTP protocol. If wait_for_async_insert is false, INSERT query is processed almost instantly, otherwise client will wait until data will be flushed to table", 0) \ diff --git a/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.cpp b/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.cpp index 9448bbaf798..1dbe22a431f 100644 --- a/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.cpp +++ b/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.cpp @@ -42,21 +42,25 @@ namespace ErrorCodes extern const int ARGUMENT_OUT_OF_BOUND; } +static size_t chooseBufferSize(const ReadSettings & settings, size_t file_size) +{ + /// Buffers used for prefetch or pre-download better to have enough size, but not bigger than the whole file. + return std::min(std::max(settings.prefetch_buffer_size, DBMS_DEFAULT_BUFFER_SIZE), file_size); +} AsynchronousReadIndirectBufferFromRemoteFS::AsynchronousReadIndirectBufferFromRemoteFS( - IAsynchronousReader & reader_, - const ReadSettings & settings_, - std::shared_ptr impl_, - size_t min_bytes_for_seek_) - : ReadBufferFromFileBase(settings_.remote_fs_buffer_size, nullptr, 0) + IAsynchronousReader & reader_, + const ReadSettings & settings_, + std::shared_ptr impl_, + size_t min_bytes_for_seek_) + : ReadBufferFromFileBase(chooseBufferSize(settings_, impl_->getFileSize()), nullptr, 0) , read_settings(settings_) , reader(reader_) , base_priority(settings_.priority) , impl(impl_) - , prefetch_buffer(settings_.prefetch_buffer_size) + , prefetch_buffer(chooseBufferSize(settings_, impl->getFileSize())) , min_bytes_for_seek(min_bytes_for_seek_) - , query_id(CurrentThread::isInitialized() && CurrentThread::get().getQueryContext() != nullptr - ? CurrentThread::getQueryId() : "") + , query_id(CurrentThread::isInitialized() && CurrentThread::get().getQueryContext() != nullptr ? CurrentThread::getQueryId() : "") , current_reader_id(getRandomASCIIString(8)) #ifndef NDEBUG , log(&Poco::Logger::get("AsynchronousBufferFromRemoteFS")) @@ -135,11 +139,11 @@ void AsynchronousReadIndirectBufferFromRemoteFS::prefetch(int64_t priority) if (!hasPendingDataToRead()) return; - last_prefetch_info.submit_time = std::chrono::duration_cast(std::chrono::system_clock::now().time_since_epoch()).count(); + last_prefetch_info.submit_time = std::chrono::system_clock::now(); last_prefetch_info.priority = priority; /// Prefetch even in case hasPendingData() == true. - chassert(prefetch_buffer.size() == read_settings.prefetch_buffer_size || prefetch_buffer.size() == read_settings.remote_fs_buffer_size); + chassert(prefetch_buffer.size() == chooseBufferSize(read_settings, impl->getFileSize()) || prefetch_buffer.size() == read_settings.remote_fs_buffer_size); prefetch_future = asyncReadInto(prefetch_buffer.data(), prefetch_buffer.size(), priority); ProfileEvents::increment(ProfileEvents::RemoteFSPrefetches); } @@ -224,7 +228,7 @@ bool AsynchronousReadIndirectBufferFromRemoteFS::nextImpl() { ProfileEventTimeIncrement watch(ProfileEvents::SynchronousRemoteReadWaitMicroseconds); - chassert(memory.size() == read_settings.prefetch_buffer_size || memory.size() == read_settings.remote_fs_buffer_size); + chassert(memory.size() == chooseBufferSize(read_settings, impl->getFileSize()) || memory.size() == read_settings.remote_fs_buffer_size); std::tie(size, offset) = impl->readInto(memory.data(), memory.size(), file_offset_of_buffer_end, bytes_to_ignore); bytes_to_ignore = 0; diff --git a/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.h b/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.h index 8cb0e2826b4..e72bbe15e5b 100644 --- a/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.h +++ b/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.h @@ -1,11 +1,12 @@ #pragma once -#include "config.h" -#include +#include +#include #include +#include #include #include -#include +#include "config.h" namespace Poco { class Logger; } @@ -97,7 +98,7 @@ private: struct LastPrefetchInfo { - UInt64 submit_time = 0; + std::chrono::system_clock::time_point submit_time; size_t priority = 0; }; LastPrefetchInfo last_prefetch_info; diff --git a/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp b/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp index 68efd3f5d78..66a4d51abaa 100644 --- a/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp +++ b/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp @@ -48,7 +48,7 @@ CachedOnDiskReadBufferFromFile::CachedOnDiskReadBufferFromFile( bool allow_seeks_after_first_read_, bool use_external_buffer_, std::optional read_until_position_) - : ReadBufferFromFileBase(settings_.remote_fs_buffer_size, nullptr, 0, file_size_) + : ReadBufferFromFileBase(use_external_buffer_ ? 0 : settings_.remote_fs_buffer_size, nullptr, 0, file_size_) #ifndef NDEBUG , log(&Poco::Logger::get("CachedOnDiskReadBufferFromFile(" + source_file_path_ + ")")) #else @@ -120,7 +120,7 @@ void CachedOnDiskReadBufferFromFile::initialize(size_t offset, size_t size) else { CreateFileSegmentSettings create_settings(is_persistent ? FileSegmentKind::Persistent : FileSegmentKind::Regular); - file_segments_holder.emplace(cache->getOrSet(cache_key, offset, size, create_settings)); + file_segments_holder.emplace(cache->getOrSet(cache_key, offset, size, file_size.value(), create_settings)); } /** @@ -150,6 +150,8 @@ CachedOnDiskReadBufferFromFile::getCacheReadBuffer(const FileSegment & file_segm ReadSettings local_read_settings{settings}; /// Do not allow to use asynchronous version of LocalFSReadMethod. local_read_settings.local_fs_method = LocalFSReadMethod::pread; + if (use_external_buffer) + local_read_settings.local_fs_buffer_size = 0; // The buffer will unnecessarily allocate a Memory of size local_fs_buffer_size, which will then // most likely be unused because we're swap()ping our own internal_buffer into @@ -538,6 +540,9 @@ void CachedOnDiskReadBufferFromFile::predownload(FileSegmentPtr & file_segment) ProfileEvents::FileSegmentPredownloadMicroseconds, predownload_watch.elapsedMicroseconds()); }); + OpenTelemetry::SpanHolder span{ + fmt::format("CachedOnDiskReadBufferFromFile::predownload(key={}, size={})", file_segment->key().toString(), bytes_to_predownload)}; + if (bytes_to_predownload) { /// Consider this case. Some user needed segment [a, b] and downloaded it partially. @@ -806,6 +811,8 @@ bool CachedOnDiskReadBufferFromFile::nextImplStep() if (current_file_segment_it == file_segments_holder->file_segments.end()) return false; + const size_t original_buffer_size = internal_buffer.size(); + bool implementation_buffer_can_be_reused = false; SCOPE_EXIT({ try @@ -831,6 +838,9 @@ bool CachedOnDiskReadBufferFromFile::nextImplStep() } } + if (use_external_buffer && initialized) + internal_buffer.resize(original_buffer_size); + chassert(!file_segment->isDownloader()); } catch (...) @@ -857,6 +867,11 @@ bool CachedOnDiskReadBufferFromFile::nextImplStep() chassert(!internal_buffer.empty()); + /// We allocate buffers not less than 1M so that s3 requests will not be too small. But the same buffers (members of AsynchronousReadIndirectBufferFromRemoteFS) + /// are used for reading from files. Some of these readings are fairly small and their performance degrade when we use big buffers (up to ~20% for queries like Q23 from ClickBench). + if (use_external_buffer && read_type == ReadType::CACHED && settings.local_fs_buffer_size < internal_buffer.size()) + internal_buffer.resize(settings.local_fs_buffer_size); + // Pass a valid external buffer for implementation_buffer to read into. // We then take it back with another swap() after reading is done. // (If we get an exception in between, we'll be left with an invalid internal_buffer. That's ok, as long as diff --git a/src/Disks/IO/ThreadPoolRemoteFSReader.cpp b/src/Disks/IO/ThreadPoolRemoteFSReader.cpp index 4d0f39357ab..b8ec98f6044 100644 --- a/src/Disks/IO/ThreadPoolRemoteFSReader.cpp +++ b/src/Disks/IO/ThreadPoolRemoteFSReader.cpp @@ -86,7 +86,7 @@ std::future ThreadPoolRemoteFSReader::submit(Reques auto * remote_fs_fd = assert_cast(request.descriptor.get()); - auto watch = std::make_unique(CLOCK_MONOTONIC); + auto watch = std::make_unique(CLOCK_REALTIME); Result result = remote_fs_fd->readInto(request.buf, request.size, request.offset, request.ignore); watch->stop(); diff --git a/src/Interpreters/Cache/FileCache.cpp b/src/Interpreters/Cache/FileCache.cpp index 39399c9ce09..cc39255eb97 100644 --- a/src/Interpreters/Cache/FileCache.cpp +++ b/src/Interpreters/Cache/FileCache.cpp @@ -14,6 +14,20 @@ #include +namespace +{ + +size_t roundDownToMultiple(size_t num, size_t multiple) +{ + return (num / multiple) * multiple; +} + +size_t roundUpToMultiple(size_t num, size_t multiple) +{ + return roundDownToMultiple(num + multiple - 1, multiple); +} +} + namespace fs = std::filesystem; namespace DB @@ -37,6 +51,7 @@ FileCache::FileCache(const FileCacheSettings & settings) , main_priority(std::make_unique()) , stash_priority(std::make_unique()) , max_stash_element_size(settings.max_elements) + , boundary_alignment(settings.boundary_alignment) { } @@ -194,8 +209,7 @@ FileCache::FileSegmentCell * FileCache::getCell( return &cell_it->second; } -FileSegments FileCache::getImpl( - const Key & key, const FileSegment::Range & range, std::lock_guard & cache_lock) +FileSegments FileCache::getImpl(const Key & key, const FileSegment::Range & range, std::lock_guard & cache_lock) { /// Given range = [left, right] and non-overlapping ordered set of file segments, /// find list [segment1, ..., segmentN] of segments which intersect with given range. @@ -409,8 +423,13 @@ void FileCache::fillHolesWithEmptyFileSegments( } } -FileSegmentsHolder FileCache::getOrSet(const Key & key, size_t offset, size_t size, const CreateFileSegmentSettings & settings) +FileSegmentsHolder +FileCache::getOrSet(const Key & key, size_t offset, size_t size, size_t file_size, const CreateFileSegmentSettings & settings) { + const auto aligned_offset = roundDownToMultiple(offset, boundary_alignment); + const auto aligned_end = std::min(roundUpToMultiple(offset + size, boundary_alignment), file_size); + const auto aligned_size = aligned_end - aligned_offset; + std::lock_guard cache_lock(mutex); assertInitialized(cache_lock); @@ -419,18 +438,25 @@ FileSegmentsHolder FileCache::getOrSet(const Key & key, size_t offset, size_t si assertCacheCorrectness(key, cache_lock); #endif - FileSegment::Range range(offset, offset + size - 1); + FileSegment::Range range(aligned_offset, aligned_offset + aligned_size - 1); /// Get all segments which intersect with the given range. auto file_segments = getImpl(key, range, cache_lock); if (file_segments.empty()) { - file_segments = splitRangeIntoCells(key, offset, size, FileSegment::State::EMPTY, settings, cache_lock); + file_segments = splitRangeIntoCells(key, range.left, range.size(), FileSegment::State::EMPTY, settings, cache_lock); } else { - fillHolesWithEmptyFileSegments(file_segments, key, range, /* fill_with_detached */false, settings, cache_lock); + fillHolesWithEmptyFileSegments(file_segments, key, range, /* fill_with_detached */ false, settings, cache_lock); } + + while (!file_segments.empty() && file_segments.front()->range().right < offset) + file_segments.pop_front(); + + while (!file_segments.empty() && file_segments.back()->range().left >= offset + size) + file_segments.pop_back(); + assert(!file_segments.empty()); return FileSegmentsHolder(std::move(file_segments)); } diff --git a/src/Interpreters/Cache/FileCache.h b/src/Interpreters/Cache/FileCache.h index 83435b67562..fa365f70200 100644 --- a/src/Interpreters/Cache/FileCache.h +++ b/src/Interpreters/Cache/FileCache.h @@ -58,7 +58,8 @@ public: * As long as pointers to returned file segments are held * it is guaranteed that these file segments are not removed from cache. */ - FileSegmentsHolder getOrSet(const Key & key, size_t offset, size_t size, const CreateFileSegmentSettings & settings); + FileSegmentsHolder getOrSet(const Key & key, size_t offset, size_t size, size_t file_size, const CreateFileSegmentSettings & settings); + FileSegmentsHolder set(const Key & key, size_t offset, size_t size, const CreateFileSegmentSettings & settings); /** @@ -214,6 +215,7 @@ private: FileCacheRecords stash_records; std::unique_ptr stash_priority; size_t max_stash_element_size; + size_t boundary_alignment; void loadCacheInfoIntoMemory(std::lock_guard & cache_lock); diff --git a/src/Interpreters/Cache/FileCacheSettings.cpp b/src/Interpreters/Cache/FileCacheSettings.cpp index 9d5282047aa..afee69a1f94 100644 --- a/src/Interpreters/Cache/FileCacheSettings.cpp +++ b/src/Interpreters/Cache/FileCacheSettings.cpp @@ -48,6 +48,8 @@ void FileCacheSettings::loadFromConfig(const Poco::Util::AbstractConfiguration & bypass_cache_threashold = REMOTE_FS_OBJECTS_CACHE_BYPASS_THRESHOLD; do_not_evict_index_and_mark_files = config.getUInt64(config_prefix + ".do_not_evict_index_and_mark_files", false); + + boundary_alignment = config.getUInt64(config_prefix + ".boundary_alignment", DBMS_DEFAULT_BUFFER_SIZE); } } diff --git a/src/Interpreters/Cache/FileCacheSettings.h b/src/Interpreters/Cache/FileCacheSettings.h index 689c3ef70fb..8c765631066 100644 --- a/src/Interpreters/Cache/FileCacheSettings.h +++ b/src/Interpreters/Cache/FileCacheSettings.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include @@ -26,6 +27,8 @@ struct FileCacheSettings bool enable_bypass_cache_with_threashold = false; size_t bypass_cache_threashold = REMOTE_FS_OBJECTS_CACHE_BYPASS_THRESHOLD; + size_t boundary_alignment = DBMS_DEFAULT_BUFFER_SIZE; + void loadFromConfig(const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix); }; diff --git a/src/Interpreters/Cache/FileCache_fwd.h b/src/Interpreters/Cache/FileCache_fwd.h index 72dc1144fb9..e9da0d21f7e 100644 --- a/src/Interpreters/Cache/FileCache_fwd.h +++ b/src/Interpreters/Cache/FileCache_fwd.h @@ -4,7 +4,7 @@ namespace DB { -static constexpr int REMOTE_FS_OBJECTS_CACHE_DEFAULT_MAX_FILE_SEGMENT_SIZE = 100 * 1024 * 1024; +static constexpr int REMOTE_FS_OBJECTS_CACHE_DEFAULT_MAX_FILE_SEGMENT_SIZE = 8 * 1024 * 1024; static constexpr int REMOTE_FS_OBJECTS_CACHE_DEFAULT_MAX_ELEMENTS = 1024 * 1024; static constexpr int REMOTE_FS_OBJECTS_CACHE_ENABLE_HITS_THRESHOLD = 0; static constexpr size_t REMOTE_FS_OBJECTS_CACHE_BYPASS_THRESHOLD = 256 * 1024 * 1024;; diff --git a/src/Interpreters/Cache/FileSegment.cpp b/src/Interpreters/Cache/FileSegment.cpp index 6ae25c681d4..6a99ea322d3 100644 --- a/src/Interpreters/Cache/FileSegment.cpp +++ b/src/Interpreters/Cache/FileSegment.cpp @@ -426,6 +426,8 @@ void FileSegment::write(const char * from, size_t size, size_t offset) FileSegment::State FileSegment::wait() { + OpenTelemetry::SpanHolder span{fmt::format("FileSegment::wait({})", key().toString())}; + std::unique_lock segment_lock(mutex); if (is_detached) diff --git a/src/Interpreters/FilesystemReadPrefetchesLog.cpp b/src/Interpreters/FilesystemReadPrefetchesLog.cpp index 9dd0fce84b0..664299017bc 100644 --- a/src/Interpreters/FilesystemReadPrefetchesLog.cpp +++ b/src/Interpreters/FilesystemReadPrefetchesLog.cpp @@ -1,9 +1,9 @@ -#include #include #include #include #include #include +#include namespace DB @@ -39,12 +39,12 @@ void FilesystemReadPrefetchesLogElement::appendToBlock(MutableColumns & columns) columns[i++]->insert(path); columns[i++]->insert(offset); columns[i++]->insert(size); - columns[i++]->insert(prefetch_submit_time); + columns[i++]->insert(std::chrono::duration_cast(prefetch_submit_time.time_since_epoch()).count()); columns[i++]->insert(priority); if (execution_watch) { - columns[i++]->insert(execution_watch->getStart()); - columns[i++]->insert(execution_watch->getEnd()); + columns[i++]->insert(execution_watch->getStart() / 1000); + columns[i++]->insert(execution_watch->getEnd() / 1000); columns[i++]->insert(execution_watch->elapsedMicroseconds()); } else diff --git a/src/Interpreters/FilesystemReadPrefetchesLog.h b/src/Interpreters/FilesystemReadPrefetchesLog.h index a7672c49d91..685909d41b7 100644 --- a/src/Interpreters/FilesystemReadPrefetchesLog.h +++ b/src/Interpreters/FilesystemReadPrefetchesLog.h @@ -23,7 +23,7 @@ struct FilesystemReadPrefetchesLogElement String path; UInt64 offset; Int64 size; /// -1 means unknown - Decimal64 prefetch_submit_time{}; + std::chrono::system_clock::time_point prefetch_submit_time; std::optional execution_watch; size_t priority; FilesystemPrefetchState state; diff --git a/src/Interpreters/executeQuery.cpp b/src/Interpreters/executeQuery.cpp index 00a5d0ed1d8..96bcc89c1d2 100644 --- a/src/Interpreters/executeQuery.cpp +++ b/src/Interpreters/executeQuery.cpp @@ -321,6 +321,8 @@ static std::tuple executeQueryImpl( /// This does not have impact on the final span logs, because these internal queries are issued by external queries, /// we still have enough span logs for the execution of external queries. std::shared_ptr query_span = internal ? nullptr : std::make_shared("query"); + if (query_span) + LOG_DEBUG(&Poco::Logger::get("executeQuery"), "Query span trace_id for opentelemetry log: {}", query_span->trace_id); auto query_start_time = std::chrono::system_clock::now(); diff --git a/src/Interpreters/tests/gtest_lru_file_cache.cpp b/src/Interpreters/tests/gtest_lru_file_cache.cpp index 0754c394f66..3ba13b218ec 100644 --- a/src/Interpreters/tests/gtest_lru_file_cache.cpp +++ b/src/Interpreters/tests/gtest_lru_file_cache.cpp @@ -138,6 +138,7 @@ TEST_F(FileCacheTest, get) settings.base_path = cache_base_path; settings.max_size = 30; settings.max_elements = 5; + settings.boundary_alignment = 1; { auto cache = DB::FileCache(settings); @@ -145,7 +146,7 @@ TEST_F(FileCacheTest, get) auto key = cache.hash("key1"); { - auto holder = cache.getOrSet(key, 0, 10, {}); /// Add range [0, 9] + auto holder = cache.getOrSet(key, 0, 10, 10, {}); /// Add range [0, 9] auto segments = fromHolder(holder); /// Range was not present in cache. It should be added in cache as one while file segment. ASSERT_EQ(segments.size(), 1); @@ -174,7 +175,7 @@ TEST_F(FileCacheTest, get) { /// Want range [5, 14], but [0, 9] already in cache, so only [10, 14] will be put in cache. - auto holder = cache.getOrSet(key, 5, 10, {}); + auto holder = cache.getOrSet(key, 5, 10, 15, {}); auto segments = fromHolder(holder); ASSERT_EQ(segments.size(), 2); @@ -194,14 +195,14 @@ TEST_F(FileCacheTest, get) ASSERT_EQ(cache.getUsedCacheSize(), 15); { - auto holder = cache.getOrSet(key, 9, 1, {}); /// Get [9, 9] + auto holder = cache.getOrSet(key, 9, 1, 10, {}); /// Get [9, 9] auto segments = fromHolder(holder); ASSERT_EQ(segments.size(), 1); assertRange(7, segments[0], DB::FileSegment::Range(0, 9), DB::FileSegment::State::DOWNLOADED); } { - auto holder = cache.getOrSet(key, 9, 2, {}); /// Get [9, 10] + auto holder = cache.getOrSet(key, 9, 2, 11, {}); /// Get [9, 10] auto segments = fromHolder(holder); ASSERT_EQ(segments.size(), 2); assertRange(8, segments[0], DB::FileSegment::Range(0, 9), DB::FileSegment::State::DOWNLOADED); @@ -209,15 +210,15 @@ TEST_F(FileCacheTest, get) } { - auto holder = cache.getOrSet(key, 10, 1, {}); /// Get [10, 10] + auto holder = cache.getOrSet(key, 10, 1, 11, {}); /// Get [10, 10] auto segments = fromHolder(holder); ASSERT_EQ(segments.size(), 1); assertRange(10, segments[0], DB::FileSegment::Range(10, 14), DB::FileSegment::State::DOWNLOADED); } - complete(cache_base_path, cache.getOrSet(key, 17, 4, {})); /// Get [17, 20] - complete(cache_base_path, cache.getOrSet(key, 24, 3, {})); /// Get [24, 26] - /// completeWithState(cache.getOrSet(key, 27, 1, false)); /// Get [27, 27] + complete(cache_base_path, cache.getOrSet(key, 17, 4, 21, {})); /// Get [17, 20] + complete(cache_base_path, cache.getOrSet(key, 24, 3, 27, {})); /// Get [24, 26] + /// completeWithState(cache.getOrSet(key, 27, 1, 28, false)); /// Get [27, 27] /// Current cache: [__________][_____] [____] [___][] /// ^ ^^ ^ ^ ^ ^ ^^^ @@ -227,7 +228,7 @@ TEST_F(FileCacheTest, get) ASSERT_EQ(cache.getUsedCacheSize(), 22); { - auto holder = cache.getOrSet(key, 0, 26, {}); /// Get [0, 25] + auto holder = cache.getOrSet(key, 0, 26, 26, {}); /// Get [0, 25] auto segments = fromHolder(holder); ASSERT_EQ(segments.size(), 6); @@ -261,14 +262,14 @@ TEST_F(FileCacheTest, get) /// as max elements size is reached, next attempt to put something in cache should fail. /// This will also check that [27, 27] was indeed evicted. - auto holder1 = cache.getOrSet(key, 27, 1, {}); + auto holder1 = cache.getOrSet(key, 27, 1, 28, {}); auto segments_1 = fromHolder(holder1); /// Get [27, 27] ASSERT_EQ(segments_1.size(), 1); assertRange(17, segments_1[0], DB::FileSegment::Range(27, 27), DB::FileSegment::State::EMPTY); } { - auto holder = cache.getOrSet(key, 12, 10, {}); /// Get [12, 21] + auto holder = cache.getOrSet(key, 12, 10, 22, {}); /// Get [12, 21] auto segments = fromHolder(holder); ASSERT_EQ(segments.size(), 4); @@ -292,7 +293,7 @@ TEST_F(FileCacheTest, get) ASSERT_EQ(cache.getFileSegmentsNum(), 5); { - auto holder = cache.getOrSet(key, 23, 5, {}); /// Get [23, 28] + auto holder = cache.getOrSet(key, 23, 5, 28, {}); /// Get [23, 28] auto segments = fromHolder(holder); ASSERT_EQ(segments.size(), 3); @@ -313,12 +314,12 @@ TEST_F(FileCacheTest, get) /// 17 21 2324 26 28 { - auto holder5 = cache.getOrSet(key, 2, 3, {}); /// Get [2, 4] + auto holder5 = cache.getOrSet(key, 2, 3, 5, {}); /// Get [2, 4] auto s5 = fromHolder(holder5); ASSERT_EQ(s5.size(), 1); assertRange(25, s5[0], DB::FileSegment::Range(2, 4), DB::FileSegment::State::EMPTY); - auto holder1 = cache.getOrSet(key, 30, 2, {}); /// Get [30, 31] + auto holder1 = cache.getOrSet(key, 30, 2, 32, {}); /// Get [30, 31] auto s1 = fromHolder(holder1); ASSERT_EQ(s1.size(), 1); assertRange(26, s1[0], DB::FileSegment::Range(30, 31), DB::FileSegment::State::EMPTY); @@ -334,20 +335,20 @@ TEST_F(FileCacheTest, get) /// ^ ^ ^ ^ ^ ^ ^ ^ /// 2 4 23 24 26 27 30 31 - auto holder2 = cache.getOrSet(key, 23, 1, {}); /// Get [23, 23] + auto holder2 = cache.getOrSet(key, 23, 1, 24, {}); /// Get [23, 23] auto s2 = fromHolder(holder2); ASSERT_EQ(s2.size(), 1); - auto holder3 = cache.getOrSet(key, 24, 3, {}); /// Get [24, 26] + auto holder3 = cache.getOrSet(key, 24, 3, 27, {}); /// Get [24, 26] auto s3 = fromHolder(holder3); ASSERT_EQ(s3.size(), 1); - auto holder4 = cache.getOrSet(key, 27, 1, {}); /// Get [27, 27] + auto holder4 = cache.getOrSet(key, 27, 1, 28, {}); /// Get [27, 27] auto s4 = fromHolder(holder4); ASSERT_EQ(s4.size(), 1); /// All cache is now unreleasable because pointers are still hold - auto holder6 = cache.getOrSet(key, 0, 40, {}); + auto holder6 = cache.getOrSet(key, 0, 40, 40, {}); auto f = fromHolder(holder6); ASSERT_EQ(f.size(), 9); @@ -368,7 +369,7 @@ TEST_F(FileCacheTest, get) } { - auto holder = cache.getOrSet(key, 2, 3, {}); /// Get [2, 4] + auto holder = cache.getOrSet(key, 2, 3, 5, {}); /// Get [2, 4] auto segments = fromHolder(holder); ASSERT_EQ(segments.size(), 1); assertRange(31, segments[0], DB::FileSegment::Range(2, 4), DB::FileSegment::State::DOWNLOADED); @@ -379,7 +380,7 @@ TEST_F(FileCacheTest, get) /// 2 4 23 24 26 27 30 31 { - auto holder = cache.getOrSet(key, 25, 5, {}); /// Get [25, 29] + auto holder = cache.getOrSet(key, 25, 5, 30, {}); /// Get [25, 29] auto segments = fromHolder(holder); ASSERT_EQ(segments.size(), 3); @@ -403,7 +404,7 @@ TEST_F(FileCacheTest, get) chassert(&DB::CurrentThread::get() == &thread_status_1); DB::CurrentThread::QueryScope query_scope_holder_1(query_context_1); - auto holder_2 = cache.getOrSet(key, 25, 5, {}); /// Get [25, 29] once again. + auto holder_2 = cache.getOrSet(key, 25, 5, 30, {}); /// Get [25, 29] once again. auto segments_2 = fromHolder(holder_2); ASSERT_EQ(segments.size(), 3); @@ -446,7 +447,7 @@ TEST_F(FileCacheTest, get) /// and notify_all() is also called from destructor of holder. std::optional holder; - holder.emplace(cache.getOrSet(key, 3, 23, {})); /// Get [3, 25] + holder.emplace(cache.getOrSet(key, 3, 23, 26, {})); /// Get [3, 25] auto segments = fromHolder(*holder); ASSERT_EQ(segments.size(), 3); @@ -472,7 +473,7 @@ TEST_F(FileCacheTest, get) chassert(&DB::CurrentThread::get() == &thread_status_1); DB::CurrentThread::QueryScope query_scope_holder_1(query_context_1); - auto holder_2 = cache.getOrSet(key, 3, 23, {}); /// Get [3, 25] once again + auto holder_2 = cache.getOrSet(key, 3, 23, 26, {}); /// Get [3, 25] once again auto segments_2 = fromHolder(*holder); ASSERT_EQ(segments_2.size(), 3); @@ -521,7 +522,7 @@ TEST_F(FileCacheTest, get) cache2.initialize(); auto key = cache2.hash("key1"); - auto holder1 = cache2.getOrSet(key, 2, 28, {}); /// Get [2, 29] + auto holder1 = cache2.getOrSet(key, 2, 28, 30, {}); /// Get [2, 29] auto segments1 = fromHolder(holder1); ASSERT_EQ(segments1.size(), 5); @@ -543,7 +544,7 @@ TEST_F(FileCacheTest, get) cache2.initialize(); auto key = cache2.hash("key1"); - auto holder1 = cache2.getOrSet(key, 0, 25, {}); /// Get [0, 24] + auto holder1 = cache2.getOrSet(key, 0, 25, 25, {}); /// Get [0, 24] auto segments1 = fromHolder(holder1); ASSERT_EQ(segments1.size(), 3); @@ -668,7 +669,7 @@ TEST_F(FileCacheTest, temporaryData) auto tmp_data_scope = std::make_shared(nullptr, &file_cache, 0); - auto some_data_holder = file_cache.getOrSet(file_cache.hash("some_data"), 0, 5_KiB, CreateFileSegmentSettings{}); + auto some_data_holder = file_cache.getOrSet(file_cache.hash("some_data"), 0, 5_KiB, 5_KiB, CreateFileSegmentSettings{}); { auto segments = fromHolder(some_data_holder); diff --git a/src/Storages/MergeTree/MergeTreePrefetchedReadPool.cpp b/src/Storages/MergeTree/MergeTreePrefetchedReadPool.cpp index 7428fd5c056..e51120f9305 100644 --- a/src/Storages/MergeTree/MergeTreePrefetchedReadPool.cpp +++ b/src/Storages/MergeTree/MergeTreePrefetchedReadPool.cpp @@ -366,6 +366,9 @@ MergeTreePrefetchedReadPool::PartsInfos MergeTreePrefetchedReadPool::getPartsInf part_info->column_name_set = {required_column_names.begin(), required_column_names.end()}; part_info->task_columns = task_columns; + if (settings.prefetch_buffer_size < DBMS_DEFAULT_BUFFER_SIZE) + throw Exception(ErrorCodes::LOGICAL_ERROR, "remove me"); + /// adjustBufferSize(), which is done in MergeTreeReaderStream and MergeTreeReaderCompact, /// lowers buffer size if file size (or required read range) is less. So we know that the /// settings.prefetch_buffer_size will be lowered there, therefore we account it here as well. diff --git a/src/Storages/MergeTree/MergeTreeReadPool.cpp b/src/Storages/MergeTree/MergeTreeReadPool.cpp index 29be06b4e6a..931a1be6b30 100644 --- a/src/Storages/MergeTree/MergeTreeReadPool.cpp +++ b/src/Storages/MergeTree/MergeTreeReadPool.cpp @@ -21,6 +21,14 @@ namespace ErrorCodes namespace DB { +size_t getApproxSizeOfPart(const IMergeTreeDataPart & part, const Names & columns_to_read) +{ + ColumnSize columns_size{}; + for (const auto & col_name : columns_to_read) + columns_size.add(part.getColumnSize(col_name)); + return columns_size.data_compressed; +} + MergeTreeReadPool::MergeTreeReadPool( size_t threads_, size_t sum_marks_, @@ -44,13 +52,36 @@ MergeTreeReadPool::MergeTreeReadPool( , parts_ranges(std::move(parts_)) , predict_block_size_bytes(context_->getSettingsRef().preferred_block_size_bytes > 0) , do_not_steal_tasks(do_not_steal_tasks_) + , merge_tree_use_const_size_tasks_for_remote_reading(context_->getSettingsRef().merge_tree_use_const_size_tasks_for_remote_reading) , backoff_settings{context_->getSettingsRef()} , backoff_state{threads_} { + const auto & settings = context_->getSettingsRef(); + + size_t total_compressed_bytes = 0; + size_t total_marks = 0; + for (const auto & part : parts_ranges) + { + total_compressed_bytes += getApproxSizeOfPart( + *part.data_part, prewhere_info ? prewhere_info->prewhere_actions->getRequiredColumnsNames() : column_names_); + total_marks += part.getMarksCount(); + } + + if (total_marks) + { + const auto min_bytes_per_task = settings.merge_tree_min_bytes_per_task_for_remote_reading; + const auto avg_mark_bytes = std::max(total_compressed_bytes / total_marks, 1); + const auto heuristic_min_marks = std::min(total_marks / threads_, min_bytes_per_task / avg_mark_bytes); + if (heuristic_min_marks > min_marks_for_concurrent_read) + { + min_marks_for_concurrent_read = heuristic_min_marks; + } + } + /// parts don't contain duplicate MergeTreeDataPart's. const auto per_part_sum_marks = fillPerPartInfo( parts_ranges, storage_snapshot, is_part_on_remote_disk, - do_not_steal_tasks, predict_block_size_bytes, + predict_block_size_bytes, column_names, virtual_column_names, prewhere_info, actions_settings, reader_settings, per_part_params); @@ -61,7 +92,6 @@ std::vector MergeTreeReadPool::fillPerPartInfo( const RangesInDataParts & parts, const StorageSnapshotPtr & storage_snapshot, std::vector & is_part_on_remote_disk, - bool & do_not_steal_tasks, bool & predict_block_size_bytes, const Names & column_names, const Names & virtual_column_names, @@ -83,7 +113,6 @@ std::vector MergeTreeReadPool::fillPerPartInfo( bool part_on_remote_disk = part.data_part->isStoredOnRemoteDisk(); is_part_on_remote_disk[i] = part_on_remote_disk; - do_not_steal_tasks |= part_on_remote_disk; /// Read marks for every data part. size_t sum_marks = 0; @@ -157,14 +186,13 @@ MergeTreeReadTaskPtr MergeTreeReadPool::getTask(size_t thread) auto & marks_in_part = thread_tasks.sum_marks_in_parts.back(); size_t need_marks; - if (is_part_on_remote_disk[part_idx]) /// For better performance with remote disks + if (is_part_on_remote_disk[part_idx] && !merge_tree_use_const_size_tasks_for_remote_reading) need_marks = marks_in_part; else /// Get whole part to read if it is small enough. need_marks = std::min(marks_in_part, min_marks_for_concurrent_read); /// Do not leave too little rows in part for next time. - if (marks_in_part > need_marks && - marks_in_part - need_marks < min_marks_for_concurrent_read) + if (marks_in_part > need_marks && marks_in_part - need_marks < min_marks_for_concurrent_read / 2) need_marks = marks_in_part; MarkRanges ranges_to_get_from_part; @@ -294,6 +322,8 @@ void MergeTreeReadPool::fillPerThreadInfo( parts_queue.push(std::move(info.second)); } + LOG_DEBUG(log, "min_marks_for_concurrent_read={}", min_marks_for_concurrent_read); + const size_t min_marks_per_thread = (sum_marks - 1) / threads + 1; for (size_t i = 0; i < threads && !parts_queue.empty(); ++i) diff --git a/src/Storages/MergeTree/MergeTreeReadPool.h b/src/Storages/MergeTree/MergeTreeReadPool.h index b3356ec3351..514150566a6 100644 --- a/src/Storages/MergeTree/MergeTreeReadPool.h +++ b/src/Storages/MergeTree/MergeTreeReadPool.h @@ -92,7 +92,6 @@ public: const RangesInDataParts & parts, const StorageSnapshotPtr & storage_snapshot, std::vector & is_part_on_remote_disk, - bool & do_not_steal_tasks, bool & predict_block_size_bytes, const Names & column_names, const Names & virtual_column_names, @@ -117,6 +116,7 @@ private: RangesInDataParts parts_ranges; bool predict_block_size_bytes; bool do_not_steal_tasks; + bool merge_tree_use_const_size_tasks_for_remote_reading = false; std::vector per_part_params; std::vector is_part_on_remote_disk; @@ -188,7 +188,7 @@ public: , parts_ranges(std::move(parts_)) { MergeTreeReadPool::fillPerPartInfo( - parts_ranges, storage_snapshot, is_part_on_remote_disk, do_not_steal_tasks, + parts_ranges, storage_snapshot, is_part_on_remote_disk, predict_block_size_bytes, column_names, virtual_column_names, prewhere_info, actions_settings, reader_settings, per_part_params); @@ -225,7 +225,6 @@ private: const Names virtual_column_names; RangesInDataParts parts_ranges; - bool do_not_steal_tasks = false; bool predict_block_size_bytes = false; std::vector is_part_on_remote_disk; std::vector per_part_params; diff --git a/src/Storages/MergeTree/MergeTreeSource.cpp b/src/Storages/MergeTree/MergeTreeSource.cpp index 328336ff71a..6cf6cd48534 100644 --- a/src/Storages/MergeTree/MergeTreeSource.cpp +++ b/src/Storages/MergeTree/MergeTreeSource.cpp @@ -207,6 +207,7 @@ std::optional MergeTreeSource::tryGenerate() try { + OpenTelemetry::SpanHolder span{"MergeTreeSource::tryGenerate()"}; holder->setResult(algorithm->read()); } catch (...) @@ -221,6 +222,7 @@ std::optional MergeTreeSource::tryGenerate() } #endif + OpenTelemetry::SpanHolder span{"MergeTreeSource::tryGenerate()"}; return processReadResult(algorithm->read()); } From e20f92ce0f6ef4d06813932025cdea10a361631c Mon Sep 17 00:00:00 2001 From: Aleksei Golub Date: Thu, 27 Apr 2023 21:26:36 +0300 Subject: [PATCH 0038/1072] Fixed exceptions handling; Fixed style; --- programs/local/LocalServer.cpp | 10 +++++- src/Databases/DatabaseFactory.cpp | 2 +- src/Databases/DatabaseFilesystem.cpp | 48 +++++++++++++++++++++++---- src/Databases/DatabasesOverlay.cpp | 49 ++++++---------------------- src/Databases/DatabasesOverlay.h | 2 -- 5 files changed, 62 insertions(+), 49 deletions(-) diff --git a/programs/local/LocalServer.cpp b/programs/local/LocalServer.cpp index 566d11791ca..4939997b323 100644 --- a/programs/local/LocalServer.cpp +++ b/programs/local/LocalServer.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -149,6 +150,13 @@ static DatabasePtr createMemoryDatabaseIfNotExists(ContextPtr context, const Str return system_database; } +static DatabasePtr createClickHouseLocalDatabaseOverlay(const String & name_, ContextPtr context_) +{ + auto databaseCombiner = std::make_shared(name_, context_); + databaseCombiner->registerNextDatabase(std::make_shared(name_, "", context_)); + databaseCombiner->registerNextDatabase(std::make_shared(name_, context_)); + return databaseCombiner; +} /// If path is specified and not empty, will try to setup server environment and load existing metadata void LocalServer::tryInitPath() @@ -648,7 +656,7 @@ void LocalServer::processConfig() * if such tables will not be dropped, clickhouse-server will not be able to load them due to security reasons. */ std::string default_database = config().getString("default_database", "_local"); - DatabaseCatalog::instance().attachDatabase(default_database, CreateClickHouseLocalDatabaseOverlay(default_database, global_context)); + DatabaseCatalog::instance().attachDatabase(default_database, createClickHouseLocalDatabaseOverlay(default_database, global_context)); global_context->setCurrentDatabase(default_database); applyCmdOptions(global_context); diff --git a/src/Databases/DatabaseFactory.cpp b/src/Databases/DatabaseFactory.cpp index 8a50c31efc8..1be0d5dd7b2 100644 --- a/src/Databases/DatabaseFactory.cpp +++ b/src/Databases/DatabaseFactory.cpp @@ -443,7 +443,7 @@ DatabasePtr DatabaseFactory::getImpl(const ASTCreateQuery & create, const String if (engine->arguments && !engine->arguments->children.empty()) { if (engine->arguments->children.size() != 1) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Filesystem database requires at most 1 argument: filesystem_path"); + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Filesystem database requires exactly 1 argument: filesystem_path"); const auto & arguments = engine->arguments->children; init_path = safeGetLiteralValue(arguments[0], engine_name); diff --git a/src/Databases/DatabaseFilesystem.cpp b/src/Databases/DatabaseFilesystem.cpp index 16aed185669..8275bdf6151 100644 --- a/src/Databases/DatabaseFilesystem.cpp +++ b/src/Databases/DatabaseFilesystem.cpp @@ -20,24 +20,42 @@ namespace DB namespace ErrorCodes { + extern const int LOGICAL_ERROR; extern const int UNKNOWN_TABLE; + extern const int DATABASE_ACCESS_DENIED; + extern const int BAD_ARGUMENTS; } DatabaseFilesystem::DatabaseFilesystem(const String & name_, const String & path_, ContextPtr context_) : IDatabase(name_), WithContext(context_->getGlobalContext()), path(path_), log(&Poco::Logger::get("DatabaseFileSystem(" + name_ + ")")) { - path = fs::path(path).lexically_normal().string(); + fs::path user_files_path; + if (context_->getApplicationType() != Context::ApplicationType::LOCAL) + user_files_path = fs::canonical(fs::path(getContext()->getUserFilesPath())); + + if (fs::path(path).is_relative()) + path = user_files_path / path; + + path = fs::absolute(path).lexically_normal().string(); } std::string DatabaseFilesystem::getTablePath(const std::string & table_name) const { - return fs::path(path) / table_name; + fs::path table_path = fs::path(path) / table_name; + return table_path.lexically_normal().string(); } void DatabaseFilesystem::addTable(const std::string & table_name, StoragePtr table_storage) const { std::lock_guard lock(mutex); - loaded_tables.emplace(table_name, table_storage); + auto [_, inserted] = loaded_tables.emplace(table_name, table_storage); + if (!inserted) + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Table with name `{}` already exists in database `{}` (engine {})", + table_name, + getDatabaseName(), + getEngineName()); } bool DatabaseFilesystem::isTableExist(const String & name, ContextPtr) const @@ -62,8 +80,20 @@ StoragePtr DatabaseFilesystem::getTableImpl(const String & name, ContextPtr cont return it->second; } + // If run in Local mode, no need for path checking. + bool need_check_path = context_->getApplicationType() != Context::ApplicationType::LOCAL; + std::string user_files_path = fs::canonical(fs::path(context_->getUserFilesPath())).string(); + auto table_path = getTablePath(name); + // Check access for file before checking its existence + if (need_check_path && table_path.find(user_files_path) != 0) + throw Exception(ErrorCodes::DATABASE_ACCESS_DENIED, "File is not inside {}", user_files_path); + + // If the table doesn't exist in the tables map, check if the corresponding file exists + if (!fs::exists(table_path) || !fs::is_regular_file(table_path)) + return nullptr; + // If the file exists, create a new table using TableFunctionFile and return it. auto args = makeASTFunction("file", std::make_shared(table_path)); @@ -89,11 +119,17 @@ StoragePtr DatabaseFilesystem::tryGetTable(const String & name, ContextPtr conte { try { - return getTable(name, context_); + return getTableImpl(name, context_); } - catch (...) + catch (const Exception & e) { - return nullptr; + // Ignore exceptions thrown by TableFunctionFile and which indicate that there is no table + if (e.code() == ErrorCodes::BAD_ARGUMENTS) + return nullptr; + if (e.code() == ErrorCodes::DATABASE_ACCESS_DENIED) + return nullptr; + + throw; } } diff --git a/src/Databases/DatabasesOverlay.cpp b/src/Databases/DatabasesOverlay.cpp index 5a6a4fe5cc6..b44a9798072 100644 --- a/src/Databases/DatabasesOverlay.cpp +++ b/src/Databases/DatabasesOverlay.cpp @@ -1,11 +1,9 @@ #include +#include #include #include -#include - -#include -#include +#include #include @@ -73,15 +71,11 @@ void DatabasesOverlay::dropTable(ContextPtr context_, const String & table_name, { for (auto & db : databases) { - try + if (db->isTableExist(table_name, context_)) { db->dropTable(context_, table_name, sync); return; } - catch (...) - { - continue; - } } throw Exception( ErrorCodes::LOGICAL_ERROR, @@ -119,16 +113,8 @@ StoragePtr DatabasesOverlay::detachTable(ContextPtr context_, const String & tab StoragePtr result = nullptr; for (auto & db : databases) { - try - { - result = db->detachTable(context_, table_name); - if (result) - return result; - } - catch (...) - { - continue; - } + if (db->isTableExist(table_name, context_)) + return db->detachTable(context_, table_name); } throw Exception( ErrorCodes::LOGICAL_ERROR, @@ -212,17 +198,10 @@ void DatabasesOverlay::alterTable(ContextPtr local_context, const StorageID & ta { for (auto & db : databases) { - try + if (!db->isReadOnly() && db->isTableExist(table_id.table_name, local_context)) { - if (!db->isReadOnly()) - { - db->alterTable(local_context, table_id, metadata); - return; - } - } - catch (...) - { - continue; + db->alterTable(local_context, table_id, metadata); + return; } } throw Exception( @@ -239,8 +218,8 @@ DatabasesOverlay::getTablesForBackup(const FilterByNameFunction & filter, const std::vector> result; for (const auto & db : databases) { - auto dbBackup = db->getTablesForBackup(filter, local_context); - result.insert(result.end(), std::make_move_iterator(dbBackup.begin()), std::make_move_iterator(dbBackup.end())); + auto db_backup = db->getTablesForBackup(filter, local_context); + result.insert(result.end(), std::make_move_iterator(db_backup.begin()), std::make_move_iterator(db_backup.end())); } return result; } @@ -284,12 +263,4 @@ DatabaseTablesIteratorPtr DatabasesOverlay::getTablesIterator(ContextPtr context return std::make_unique(std::move(tables), getDatabaseName()); } -DatabasePtr CreateClickHouseLocalDatabaseOverlay(const String & name_, ContextPtr context_) -{ - auto databaseCombiner = std::make_shared(name_, context_); - databaseCombiner->registerNextDatabase(std::make_shared(name_, "", context_)); - databaseCombiner->registerNextDatabase(std::make_shared(name_, context_)); - return databaseCombiner; -} - } diff --git a/src/Databases/DatabasesOverlay.h b/src/Databases/DatabasesOverlay.h index 77f0085161b..0f31bbd6a47 100644 --- a/src/Databases/DatabasesOverlay.h +++ b/src/Databases/DatabasesOverlay.h @@ -63,6 +63,4 @@ protected: Poco::Logger * log; }; -DatabasePtr CreateClickHouseLocalDatabaseOverlay(const String & name_, ContextPtr context_); - } From 7dde282027891aa0afbd0889b4be896981354ca1 Mon Sep 17 00:00:00 2001 From: pufit Date: Thu, 27 Apr 2023 18:50:35 -0400 Subject: [PATCH 0039/1072] Fix no case completion --- src/Client/LineReader.cpp | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/src/Client/LineReader.cpp b/src/Client/LineReader.cpp index f6cd7bffef7..82dbe03e5d3 100644 --- a/src/Client/LineReader.cpp +++ b/src/Client/LineReader.cpp @@ -82,9 +82,13 @@ replxx::Replxx::completions_t LineReader::Suggest::getCompletions(const String & std::lock_guard lock(mutex); Words to_search; + bool no_case = false; /// Only perform case sensitive completion when the prefix string contains any uppercase characters if (std::none_of(prefix.begin(), prefix.end(), [](char32_t x) { return iswupper(static_cast(x)); })) + { to_search = words_no_case; + no_case = true; + } else to_search = words; @@ -95,11 +99,18 @@ replxx::Replxx::completions_t LineReader::Suggest::getCompletions(const String & addNewWords(to_search, new_words, std::less{}); } - range = std::equal_range( - to_search.begin(), to_search.end(), last_word, [prefix_length](std::string_view s, std::string_view prefix_searched) - { - return strncasecmp(s.data(), prefix_searched.data(), prefix_length) < 0; - }); + if (no_case) + range = std::equal_range( + to_search.begin(), to_search.end(), last_word, [prefix_length](std::string_view s, std::string_view prefix_searched) + { + return strncasecmp(s.data(), prefix_searched.data(), prefix_length) < 0; + }); + else + range = std::equal_range( + to_search.begin(), to_search.end(), last_word, [prefix_length](std::string_view s, std::string_view prefix_searched) + { + return strncmp(s.data(), prefix_searched.data(), prefix_length) < 0; + }); return replxx::Replxx::completions_t(range.first, range.second); } From 39ae0c1c4e4701f07846425a22d6e6c3f587b194 Mon Sep 17 00:00:00 2001 From: pufit Date: Thu, 27 Apr 2023 22:51:52 -0400 Subject: [PATCH 0040/1072] Fix tests --- tests/integration/test_keeper_client/test.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_keeper_client/test.py b/tests/integration/test_keeper_client/test.py index 64ef62b6243..4fa8ce82aa6 100644 --- a/tests/integration/test_keeper_client/test.py +++ b/tests/integration/test_keeper_client/test.py @@ -30,7 +30,10 @@ def test_base_commands(started_cluster): [ started_cluster.server_bin_path, "keeper-client", - f"{cluster.get_instance_ip('zoo1')}:{cluster.zookeeper_port}", + "--host", + cluster.get_instance_ip('zoo1'), + "--port", + cluster.zookeeper_port, "-q", "create test_create_zk_node1 testvalue1;create test_create_zk_node_2 testvalue2;get test_create_zk_node1;", ], @@ -47,7 +50,10 @@ def test_four_letter_word_commands(started_cluster): [ started_cluster.server_bin_path, "keeper-client", - f"{cluster.get_instance_ip('zoo1')}:{cluster.zookeeper_port}", + "--host", + cluster.get_instance_ip('zoo1'), + "--port", + cluster.zookeeper_port, "-q", "ruok", ], From 711d8d4442cbe8f074d39c4cb2477a761037140e Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Fri, 28 Apr 2023 03:19:21 +0000 Subject: [PATCH 0041/1072] Automatic style fix --- tests/integration/test_keeper_client/test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_keeper_client/test.py b/tests/integration/test_keeper_client/test.py index 4fa8ce82aa6..f7aec7852b1 100644 --- a/tests/integration/test_keeper_client/test.py +++ b/tests/integration/test_keeper_client/test.py @@ -31,7 +31,7 @@ def test_base_commands(started_cluster): started_cluster.server_bin_path, "keeper-client", "--host", - cluster.get_instance_ip('zoo1'), + cluster.get_instance_ip("zoo1"), "--port", cluster.zookeeper_port, "-q", @@ -51,7 +51,7 @@ def test_four_letter_word_commands(started_cluster): started_cluster.server_bin_path, "keeper-client", "--host", - cluster.get_instance_ip('zoo1'), + cluster.get_instance_ip("zoo1"), "--port", cluster.zookeeper_port, "-q", From c862eca0f735433e7eb17199d21a8c8dd2d1dc07 Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Fri, 28 Apr 2023 14:08:46 +0000 Subject: [PATCH 0042/1072] fix build --- src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp | 2 +- src/Interpreters/Cache/FileCache.cpp | 7 ++----- src/Interpreters/Cache/FileSegment.cpp | 15 ++++++++------- 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp b/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp index ddc5b385b07..b186b9b1d28 100644 --- a/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp +++ b/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp @@ -536,7 +536,7 @@ void CachedOnDiskReadBufferFromFile::predownload(FileSegment & file_segment) }); OpenTelemetry::SpanHolder span{ - fmt::format("CachedOnDiskReadBufferFromFile::predownload(key={}, size={})", file_segment->key().toString(), bytes_to_predownload)}; + fmt::format("CachedOnDiskReadBufferFromFile::predownload(key={}, size={})", file_segment.key().toString(), bytes_to_predownload)}; if (bytes_to_predownload) { diff --git a/src/Interpreters/Cache/FileCache.cpp b/src/Interpreters/Cache/FileCache.cpp index 03cea3ee2ab..626ac7f5d49 100644 --- a/src/Interpreters/Cache/FileCache.cpp +++ b/src/Interpreters/Cache/FileCache.cpp @@ -405,11 +405,8 @@ FileSegmentsHolderPtr FileCache::set( return std::make_unique(std::move(file_segments)); } -FileSegmentsHolderPtr FileCache::getOrSet( - const Key & key, - size_t offset, - size_t size, - const CreateFileSegmentSettings & settings) +FileSegmentsHolderPtr +FileCache::getOrSet(const Key & key, size_t offset, size_t size, size_t file_size, const CreateFileSegmentSettings & settings) { assertInitialized(); diff --git a/src/Interpreters/Cache/FileSegment.cpp b/src/Interpreters/Cache/FileSegment.cpp index 91356f699e3..2b1801a46f0 100644 --- a/src/Interpreters/Cache/FileSegment.cpp +++ b/src/Interpreters/Cache/FileSegment.cpp @@ -1,13 +1,14 @@ #include "FileSegment.h" -#include -#include -#include -#include -#include -#include -#include #include +#include +#include +#include +#include +#include +#include +#include +#include #include From 30bc74c859ae4a33abe0d5bb7e42369865ea3b4c Mon Sep 17 00:00:00 2001 From: pufit Date: Fri, 28 Apr 2023 14:09:24 -0400 Subject: [PATCH 0043/1072] Fix tests --- tests/integration/test_keeper_client/test.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/integration/test_keeper_client/test.py b/tests/integration/test_keeper_client/test.py index 4fa8ce82aa6..eee30741bdd 100644 --- a/tests/integration/test_keeper_client/test.py +++ b/tests/integration/test_keeper_client/test.py @@ -31,9 +31,9 @@ def test_base_commands(started_cluster): started_cluster.server_bin_path, "keeper-client", "--host", - cluster.get_instance_ip('zoo1'), + str(cluster.get_instance_ip('zoo1')), "--port", - cluster.zookeeper_port, + str(cluster.zookeeper_port), "-q", "create test_create_zk_node1 testvalue1;create test_create_zk_node_2 testvalue2;get test_create_zk_node1;", ], @@ -51,9 +51,9 @@ def test_four_letter_word_commands(started_cluster): started_cluster.server_bin_path, "keeper-client", "--host", - cluster.get_instance_ip('zoo1'), + str(cluster.get_instance_ip('zoo1')), "--port", - cluster.zookeeper_port, + str(cluster.zookeeper_port), "-q", "ruok", ], From e0abb251e5344b431de768e91dbf4f791283467d Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Fri, 28 Apr 2023 21:52:38 +0000 Subject: [PATCH 0044/1072] fix test --- tests/queries/0_stateless/02344_describe_cache.reference | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02344_describe_cache.reference b/tests/queries/0_stateless/02344_describe_cache.reference index 7561b32bae1..a803ca1fab1 100644 --- a/tests/queries/0_stateless/02344_describe_cache.reference +++ b/tests/queries/0_stateless/02344_describe_cache.reference @@ -1,2 +1,2 @@ -134217728 1048576 104857600 1 0 0 0 /var/lib/clickhouse/caches/s3_cache/ 0 +134217728 1048576 8388608 1 0 0 0 /var/lib/clickhouse/caches/s3_cache/ 0 134217728 1048576 104857600 0 0 0 0 /var/lib/clickhouse/caches/s3_cache_2/ 0 From 57d852a60e804da746ce5e4cde2d56222afe677e Mon Sep 17 00:00:00 2001 From: Aleksei Golub Date: Sun, 30 Apr 2023 14:46:11 +0300 Subject: [PATCH 0045/1072] Fixed table existence checking --- src/Databases/DatabaseFactory.cpp | 2 +- src/Databases/DatabaseFilesystem.cpp | 60 ++++++++++++++----- src/Databases/DatabaseFilesystem.h | 2 + .../02722_database_filesystem.reference | 5 +- .../0_stateless/02722_database_filesystem.sh | 24 ++++++-- 5 files changed, 70 insertions(+), 23 deletions(-) diff --git a/src/Databases/DatabaseFactory.cpp b/src/Databases/DatabaseFactory.cpp index 1be0d5dd7b2..8a50c31efc8 100644 --- a/src/Databases/DatabaseFactory.cpp +++ b/src/Databases/DatabaseFactory.cpp @@ -443,7 +443,7 @@ DatabasePtr DatabaseFactory::getImpl(const ASTCreateQuery & create, const String if (engine->arguments && !engine->arguments->children.empty()) { if (engine->arguments->children.size() != 1) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Filesystem database requires exactly 1 argument: filesystem_path"); + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Filesystem database requires at most 1 argument: filesystem_path"); const auto & arguments = engine->arguments->children; init_path = safeGetLiteralValue(arguments[0], engine_name); diff --git a/src/Databases/DatabaseFilesystem.cpp b/src/Databases/DatabaseFilesystem.cpp index 8275bdf6151..7f22b8a16a0 100644 --- a/src/Databases/DatabaseFilesystem.cpp +++ b/src/Databases/DatabaseFilesystem.cpp @@ -1,5 +1,6 @@ #include +#include #include #include #include @@ -24,19 +25,27 @@ namespace ErrorCodes extern const int UNKNOWN_TABLE; extern const int DATABASE_ACCESS_DENIED; extern const int BAD_ARGUMENTS; + extern const int FILE_DOESNT_EXIST; } DatabaseFilesystem::DatabaseFilesystem(const String & name_, const String & path_, ContextPtr context_) : IDatabase(name_), WithContext(context_->getGlobalContext()), path(path_), log(&Poco::Logger::get("DatabaseFileSystem(" + name_ + ")")) { fs::path user_files_path; - if (context_->getApplicationType() != Context::ApplicationType::LOCAL) + const auto & application_type = context_->getApplicationType(); + + if (application_type != Context::ApplicationType::LOCAL) user_files_path = fs::canonical(fs::path(getContext()->getUserFilesPath())); if (fs::path(path).is_relative()) path = user_files_path / path; + else if (application_type != Context::ApplicationType::LOCAL && !pathStartsWith(fs::path(path), user_files_path)) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Path must be inside user-files path ({})", user_files_path.string()); path = fs::absolute(path).lexically_normal().string(); + + if (!fs::exists(path)) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Path does not exist ({})", path); } std::string DatabaseFilesystem::getTablePath(const std::string & table_name) const @@ -58,7 +67,32 @@ void DatabaseFilesystem::addTable(const std::string & table_name, StoragePtr tab getEngineName()); } -bool DatabaseFilesystem::isTableExist(const String & name, ContextPtr) const +bool DatabaseFilesystem::checkTableFilePath(const std::string & table_path, ContextPtr context_, bool throw_on_error) const { + // If run in Local mode, no need for path checking. + bool need_check_path = context_->getApplicationType() != Context::ApplicationType::LOCAL; + std::string user_files_path = fs::canonical(fs::path(context_->getUserFilesPath())).string(); + + // Check access for file before checking its existence + if (need_check_path && !fileOrSymlinkPathStartsWith(table_path, user_files_path)) + { + if (throw_on_error) + throw Exception(ErrorCodes::DATABASE_ACCESS_DENIED, "File is not inside {}", user_files_path); + else + return false; + } + + // Check if the corresponding file exists + if (!fs::exists(table_path) || !fs::is_regular_file(table_path)) { + if (throw_on_error) + throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "File does not exist ({})", table_path); + else + return false; + } + + return true; +} + +bool DatabaseFilesystem::isTableExist(const String & name, ContextPtr context_) const { { std::lock_guard lock(mutex); @@ -67,7 +101,8 @@ bool DatabaseFilesystem::isTableExist(const String & name, ContextPtr) const } fs::path table_file_path(getTablePath(name)); - return fs::exists(table_file_path) && fs::is_regular_file(table_file_path); + + return checkTableFilePath(table_file_path, context_, false); } StoragePtr DatabaseFilesystem::getTableImpl(const String & name, ContextPtr context_) const @@ -80,19 +115,9 @@ StoragePtr DatabaseFilesystem::getTableImpl(const String & name, ContextPtr cont return it->second; } - // If run in Local mode, no need for path checking. - bool need_check_path = context_->getApplicationType() != Context::ApplicationType::LOCAL; - std::string user_files_path = fs::canonical(fs::path(context_->getUserFilesPath())).string(); - auto table_path = getTablePath(name); - // Check access for file before checking its existence - if (need_check_path && table_path.find(user_files_path) != 0) - throw Exception(ErrorCodes::DATABASE_ACCESS_DENIED, "File is not inside {}", user_files_path); - - // If the table doesn't exist in the tables map, check if the corresponding file exists - if (!fs::exists(table_path) || !fs::is_regular_file(table_path)) - return nullptr; + checkTableFilePath(table_path, context_, true); // If the file exists, create a new table using TableFunctionFile and return it. auto args = makeASTFunction("file", std::make_shared(table_path)); @@ -101,6 +126,7 @@ StoragePtr DatabaseFilesystem::getTableImpl(const String & name, ContextPtr cont if (!table_function) return nullptr; + // TableFunctionFile throws exceptions, if table cannot be created auto table_storage = table_function->execute(args, context_, name); if (table_storage) addTable(name, table_storage); @@ -110,6 +136,7 @@ StoragePtr DatabaseFilesystem::getTableImpl(const String & name, ContextPtr cont StoragePtr DatabaseFilesystem::getTable(const String & name, ContextPtr context_) const { + // rethrow all exceptions from TableFunctionFile to show correct error to user if (auto storage = getTableImpl(name, context_)) return storage; throw Exception(ErrorCodes::UNKNOWN_TABLE, "Table {}.{} doesn't exist", backQuoteIfNeed(getDatabaseName()), backQuoteIfNeed(name)); @@ -123,11 +150,14 @@ StoragePtr DatabaseFilesystem::tryGetTable(const String & name, ContextPtr conte } catch (const Exception & e) { - // Ignore exceptions thrown by TableFunctionFile and which indicate that there is no table + // Ignore exceptions thrown by TableFunctionFile, which indicate that there is no table + // see tests/02722_database_filesystem.sh for more details if (e.code() == ErrorCodes::BAD_ARGUMENTS) return nullptr; if (e.code() == ErrorCodes::DATABASE_ACCESS_DENIED) return nullptr; + if (e.code() == ErrorCodes::FILE_DOESNT_EXIST) + return nullptr; throw; } diff --git a/src/Databases/DatabaseFilesystem.h b/src/Databases/DatabaseFilesystem.h index 697511ac5b3..3d2ad695cc6 100644 --- a/src/Databases/DatabaseFilesystem.h +++ b/src/Databases/DatabaseFilesystem.h @@ -49,6 +49,8 @@ protected: void addTable(const std::string & table_name, StoragePtr table_storage) const; + bool checkTableFilePath(const std::string & table_path, ContextPtr context_, bool throw_on_error) const; + private: String path; diff --git a/tests/queries/0_stateless/02722_database_filesystem.reference b/tests/queries/0_stateless/02722_database_filesystem.reference index a583f1e2e3c..c65dda7933a 100644 --- a/tests/queries/0_stateless/02722_database_filesystem.reference +++ b/tests/queries/0_stateless/02722_database_filesystem.reference @@ -4,7 +4,10 @@ test1 4 4 4 -Test 2: check DatabaseFilesystem access rights on server +Test 2: check DatabaseFilesystem access rights and errors handling on server +OK +OK +OK OK OK OK diff --git a/tests/queries/0_stateless/02722_database_filesystem.sh b/tests/queries/0_stateless/02722_database_filesystem.sh index 0adeface438..80f97af693e 100755 --- a/tests/queries/0_stateless/02722_database_filesystem.sh +++ b/tests/queries/0_stateless/02722_database_filesystem.sh @@ -21,6 +21,7 @@ tmp_dir=${CLICKHOUSE_TEST_UNIQUE_NAME} mkdir $tmp_dir cp ${CLICKHOUSE_USER_FILES_PATH}/tmp.csv ${tmp_dir}/tmp.csv cp ${CLICKHOUSE_USER_FILES_PATH}/tmp.csv ${CLICKHOUSE_USER_FILES_PATH}/tmp/tmp.csv +cp ${CLICKHOUSE_USER_FILES_PATH}/tmp.csv ${CLICKHOUSE_USER_FILES_PATH}/tmp.myext ################# echo "Test 1: create filesystem database and check implicit calls" @@ -35,24 +36,35 @@ ${CLICKHOUSE_CLIENT} --query "SELECT COUNT(*) FROM test1.\`tmp/tmp.csv\`;" ${CLICKHOUSE_LOCAL} -q "SELECT COUNT(*) FROM \"${tmp_dir}/tmp.csv\"" ################# -echo "Test 2: check DatabaseFilesystem access rights on server" -# Allows list files only inside user_files +echo "Test 2: check DatabaseFilesystem access rights and errors handling on server" +# DATABASE_ACCESS_DENIED: Allows list files only inside user_files ${CLICKHOUSE_CLIENT} --query "SELECT COUNT(*) FROM test1.\`../tmp.csv\`;" 2>&1| grep -F "Code: 291" > /dev/null && echo "OK" ${CLICKHOUSE_CLIENT} --query "SELECT COUNT(*) FROM test1.\`/tmp/tmp.csv\`;" 2>&1| grep -F "Code: 291" > /dev/null && echo "OK" - ${CLICKHOUSE_CLIENT} --multiline --multiquery --query """ USE test1; SELECT COUNT(*) FROM \"../${tmp_dir}/tmp.csv\"; """ 2>&1| grep -F "Code: 291" > /dev/null && echo "OK" ${CLICKHOUSE_CLIENT} --query "SELECT COUNT(*) FROM test1.\`../../../../../../tmp.csv\`;" 2>&1| grep -F "Code: 291" > /dev/null && echo "OK" + +# BAD_ARGUMENTS: path should be inside user_files ${CLICKHOUSE_CLIENT} --multiline --multiquery -q """ DROP DATABASE IF EXISTS test2; CREATE DATABASE test2 ENGINE = Filesystem('/tmp'); -SELECT COUNT(*) FROM test2.\`tmp.csv\`; -""" 2>&1| grep -F "Code: 291" > /dev/null && echo "OK" +""" 2>&1| grep -F "Code: 36" > /dev/null && echo "OK" + +# BAD_ARGUMENTS: .../user_files/relative_unknown_dir does not exists +${CLICKHOUSE_CLIENT} --multiline --multiquery -q """ +DROP DATABASE IF EXISTS test2; +CREATE DATABASE test2 ENGINE = Filesystem('relative_unknown_dir'); +""" 2>&1| grep -F "Code: 36" > /dev/null && echo "OK" + +# FILE_DOESNT_EXIST: unknown file +${CLICKHOUSE_CLIENT} --query "SELECT COUNT(*) FROM test1.\`tmp2.csv\`;" 2>&1| grep -F "Code: 107" > /dev/null && echo "OK" + +# BAD_ARGUMENTS: Cannot determine the file format by it's extension +${CLICKHOUSE_CLIENT} --query "SELECT COUNT(*) FROM test1.\`tmp.myext\`;" 2>&1| grep -F "Code: 36" > /dev/null && echo "OK" # Clean ${CLICKHOUSE_CLIENT} --query "DROP DATABASE test1;" -${CLICKHOUSE_CLIENT} --query "DROP DATABASE test2;" rm -rd $tmp_dir rm -rd $CLICKHOUSE_USER_FILES_PATH From 6831eb20013aadeec451ac8fb94d894abbfccef9 Mon Sep 17 00:00:00 2001 From: Aleksei Golub Date: Sun, 30 Apr 2023 14:51:04 +0300 Subject: [PATCH 0046/1072] fix style --- src/Databases/DatabaseFilesystem.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/Databases/DatabaseFilesystem.cpp b/src/Databases/DatabaseFilesystem.cpp index 7f22b8a16a0..8de609f0ca2 100644 --- a/src/Databases/DatabaseFilesystem.cpp +++ b/src/Databases/DatabaseFilesystem.cpp @@ -1,6 +1,5 @@ #include -#include #include #include #include @@ -11,6 +10,7 @@ #include #include #include +#include #include @@ -67,7 +67,8 @@ void DatabaseFilesystem::addTable(const std::string & table_name, StoragePtr tab getEngineName()); } -bool DatabaseFilesystem::checkTableFilePath(const std::string & table_path, ContextPtr context_, bool throw_on_error) const { +bool DatabaseFilesystem::checkTableFilePath(const std::string & table_path, ContextPtr context_, bool throw_on_error) const +{ // If run in Local mode, no need for path checking. bool need_check_path = context_->getApplicationType() != Context::ApplicationType::LOCAL; std::string user_files_path = fs::canonical(fs::path(context_->getUserFilesPath())).string(); @@ -82,7 +83,8 @@ bool DatabaseFilesystem::checkTableFilePath(const std::string & table_path, Cont } // Check if the corresponding file exists - if (!fs::exists(table_path) || !fs::is_regular_file(table_path)) { + if (!fs::exists(table_path) || !fs::is_regular_file(table_path)) + { if (throw_on_error) throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "File does not exist ({})", table_path); else From 1846b76982828ed3223b25e2e5d6f5c8cee937eb Mon Sep 17 00:00:00 2001 From: Aleksei Golub Date: Sun, 30 Apr 2023 23:13:42 +0300 Subject: [PATCH 0047/1072] Added DatabaseS3 with test --- programs/local/LocalServer.cpp | 9 + src/Databases/DatabaseFactory.cpp | 29 ++- src/Databases/DatabaseS3.cpp | 199 ++++++++++++++++++ src/Databases/DatabaseS3.h | 63 ++++++ .../0_stateless/02724_database_s3.reference | 18 ++ .../queries/0_stateless/02724_database_s3.sh | 51 +++++ 6 files changed, 367 insertions(+), 2 deletions(-) create mode 100644 src/Databases/DatabaseS3.cpp create mode 100644 src/Databases/DatabaseS3.h create mode 100644 tests/queries/0_stateless/02724_database_s3.reference create mode 100755 tests/queries/0_stateless/02724_database_s3.sh diff --git a/programs/local/LocalServer.cpp b/programs/local/LocalServer.cpp index 4939997b323..215a92e1944 100644 --- a/programs/local/LocalServer.cpp +++ b/programs/local/LocalServer.cpp @@ -51,6 +51,8 @@ #include #include +#include "config.h" + #if defined(FUZZING_MODE) #include #endif @@ -59,6 +61,10 @@ # include #endif +#if USE_AWS_S3 +#include +#endif + namespace fs = std::filesystem; @@ -155,6 +161,9 @@ static DatabasePtr createClickHouseLocalDatabaseOverlay(const String & name_, Co auto databaseCombiner = std::make_shared(name_, context_); databaseCombiner->registerNextDatabase(std::make_shared(name_, "", context_)); databaseCombiner->registerNextDatabase(std::make_shared(name_, context_)); +#if USE_AWS_S3 + databaseCombiner->registerNextDatabase(std::make_shared(name_, "", "", context_)); +#endif return databaseCombiner; } diff --git a/src/Databases/DatabaseFactory.cpp b/src/Databases/DatabaseFactory.cpp index 8a50c31efc8..b21435527a5 100644 --- a/src/Databases/DatabaseFactory.cpp +++ b/src/Databases/DatabaseFactory.cpp @@ -49,6 +49,10 @@ #include #endif +#if USE_AWS_S3 +#include +#endif + namespace fs = std::filesystem; namespace DB @@ -133,13 +137,13 @@ DatabasePtr DatabaseFactory::getImpl(const ASTCreateQuery & create, const String static const std::unordered_set database_engines{"Ordinary", "Atomic", "Memory", "Dictionary", "Lazy", "Replicated", "MySQL", "MaterializeMySQL", "MaterializedMySQL", - "PostgreSQL", "MaterializedPostgreSQL", "SQLite", "Filesystem"}; + "PostgreSQL", "MaterializedPostgreSQL", "SQLite", "Filesystem", "S3"}; if (!database_engines.contains(engine_name)) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Database engine name `{}` does not exist", engine_name); static const std::unordered_set engines_with_arguments{"MySQL", "MaterializeMySQL", "MaterializedMySQL", - "Lazy", "Replicated", "PostgreSQL", "MaterializedPostgreSQL", "SQLite", "Filesystem"}; + "Lazy", "Replicated", "PostgreSQL", "MaterializedPostgreSQL", "SQLite", "Filesystem", "S3"}; static const std::unordered_set engines_with_table_overrides{"MaterializeMySQL", "MaterializedMySQL", "MaterializedPostgreSQL"}; bool engine_may_have_arguments = engines_with_arguments.contains(engine_name); @@ -451,6 +455,27 @@ DatabasePtr DatabaseFactory::getImpl(const ASTCreateQuery & create, const String return std::make_shared(database_name, init_path, context); } +#if USE_AWS_S3 + else if (engine_name == "S3") + { + const ASTFunction * engine = engine_define->engine; + + std::string key_id; + std::string secret_key; + + if (engine->arguments && !engine->arguments->children.empty()) + { + if (engine->arguments->children.size() != 2) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "S3 database requires 0 or 2 argument: [access_key_id, secret_access_key]"); + + const auto & arguments = engine->arguments->children; + key_id = safeGetLiteralValue(arguments[0], engine_name); + secret_key = safeGetLiteralValue(arguments[1], engine_name); + } + + return std::make_shared(database_name, key_id, secret_key, context); + } +#endif throw Exception(ErrorCodes::UNKNOWN_DATABASE_ENGINE, "Unknown database engine: {}", engine_name); } diff --git a/src/Databases/DatabaseS3.cpp b/src/Databases/DatabaseS3.cpp new file mode 100644 index 00000000000..d4412ba7973 --- /dev/null +++ b/src/Databases/DatabaseS3.cpp @@ -0,0 +1,199 @@ +#include "config.h" + +#if USE_AWS_S3 + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; + extern const int UNKNOWN_TABLE; + extern const int BAD_ARGUMENTS; + extern const int FILE_DOESNT_EXIST; + extern const int UNACCEPTABLE_URL; + extern const int S3_ERROR; +} + +DatabaseS3::DatabaseS3(const String & name_, const String & key_id, const String & secret_key, ContextPtr context_) + : IDatabase(name_) + , WithContext(context_->getGlobalContext()) + , access_key_id(key_id) + , secret_access_key(secret_key) + , log(&Poco::Logger::get("DatabaseS3(" + name_ + ")")) +{ +} + +void DatabaseS3::addTable(const std::string & table_name, StoragePtr table_storage) const +{ + std::lock_guard lock(mutex); + auto [_, inserted] = loaded_tables.emplace(table_name, table_storage); + if (!inserted) + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Table with name `{}` already exists in database `{}` (engine {})", + table_name, + getDatabaseName(), + getEngineName()); +} + +bool DatabaseS3::checkUrl(const std::string & url, ContextPtr context_, bool throw_on_error) const +{ + try + { + S3::URI uri(url); + context_->getGlobalContext()->getRemoteHostFilter().checkURL(uri.uri); + } + catch (...) + { + if (throw_on_error) + throw; + return false; + } + return true; +} + +bool DatabaseS3::isTableExist(const String & name, ContextPtr context_) const +{ + std::lock_guard lock(mutex); + if (loaded_tables.find(name) != loaded_tables.end()) + return true; + + return checkUrl(name, context_, false); +} + +StoragePtr DatabaseS3::getTableImpl(const String & url, ContextPtr context_) const +{ + // Check if the table exists in the loaded tables map + { + std::lock_guard lock(mutex); + auto it = loaded_tables.find(url); + if (it != loaded_tables.end()) + return it->second; + } + + checkUrl(url, context_, true); + + // call TableFunctionS3 + auto args = makeASTFunction( + "s3", + std::make_shared(url), + std::make_shared(access_key_id), + std::make_shared(secret_access_key)); + + auto table_function = TableFunctionFactory::instance().get(args, context_); + if (!table_function) + return nullptr; + + // TableFunctionS3 throws exceptions, if table cannot be created + auto table_storage = table_function->execute(args, context_, url); + if (table_storage) + addTable(url, table_storage); + + return table_storage; +} + +StoragePtr DatabaseS3::getTable(const String & name, ContextPtr context_) const +{ + // rethrow all exceptions from TableFunctionS3 to show correct error to user + if (auto storage = getTableImpl(name, context_)) + return storage; + throw Exception(ErrorCodes::UNKNOWN_TABLE, "Table {}.{} doesn't exist", backQuoteIfNeed(getDatabaseName()), backQuoteIfNeed(name)); +} + +StoragePtr DatabaseS3::tryGetTable(const String & name, ContextPtr context_) const +{ + try + { + return getTableImpl(name, context_); + } + catch (const Exception & e) + { + // Ignore exceptions thrown by TableFunctionS3, which indicate that there is no table + if (e.code() == ErrorCodes::BAD_ARGUMENTS) + return nullptr; + if (e.code() == ErrorCodes::S3_ERROR) + return nullptr; + if (e.code() == ErrorCodes::FILE_DOESNT_EXIST) + return nullptr; + if (e.code() == ErrorCodes::UNACCEPTABLE_URL) + return nullptr; + throw; + } + catch (const Poco::URISyntaxException &) + { + return nullptr; + } +} + +ASTPtr DatabaseS3::getCreateDatabaseQuery() const +{ + auto settings = getContext()->getSettingsRef(); + ParserCreateQuery parser; + + const String query = fmt::format("CREATE DATABASE {} ENGINE = S3('{}', '{}')", + backQuoteIfNeed(getDatabaseName()), + access_key_id, + secret_access_key); + ASTPtr ast = parseQuery(parser, query.data(), query.data() + query.size(), "", 0, settings.max_parser_depth); + + if (const auto database_comment = getDatabaseComment(); !database_comment.empty()) + { + auto & ast_create_query = ast->as(); + ast_create_query.set(ast_create_query.comment, std::make_shared(database_comment)); + } + + return ast; +} + +void DatabaseS3::shutdown() +{ + Tables tables_snapshot; + { + std::lock_guard lock(mutex); + tables_snapshot = loaded_tables; + } + + for (const auto & kv : tables_snapshot) + { + auto table_id = kv.second->getStorageID(); + kv.second->flushAndShutdown(); + } + + std::lock_guard lock(mutex); + loaded_tables.clear(); +} + +/** + * Returns an empty vector because the database is read-only and no tables can be backed up + */ +std::vector> DatabaseS3::getTablesForBackup(const FilterByNameFunction &, const ContextPtr &) const +{ + return {}; +} + +/** + * + * Returns an empty iterator because the database does not have its own tables + * But only caches them for quick access + */ +DatabaseTablesIteratorPtr DatabaseS3::getTablesIterator(ContextPtr, const FilterByNameFunction &) const +{ + return std::make_unique(Tables{}, getDatabaseName()); +} + +} // DB + +#endif diff --git a/src/Databases/DatabaseS3.h b/src/Databases/DatabaseS3.h new file mode 100644 index 00000000000..d5269e57f5a --- /dev/null +++ b/src/Databases/DatabaseS3.h @@ -0,0 +1,63 @@ +#pragma once + +#include "config.h" + +#if USE_AWS_S3 + +#include +#include +#include +#include +#include + +namespace DB +{ + +class Context; + +/** + * DatabaseS3 provides access to data stored in S3 + * Uses TableFunctionS3 to implicitly load file when a user requests the table, and provides read-only access to the data in the file + * Tables are cached inside the database for quick access + */ +class DatabaseS3 : public IDatabase, protected WithContext +{ +public: + DatabaseS3(const String & name, const String & key_id, const String & secret_key, ContextPtr context); + + String getEngineName() const override { return "S3"; } + + bool isTableExist(const String & name, ContextPtr context) const override; + + StoragePtr getTable(const String & name, ContextPtr context) const override; + + StoragePtr tryGetTable(const String & name, ContextPtr context) const override; + + bool empty() const override { return true; } + + bool isReadOnly() const override { return true; } + + ASTPtr getCreateDatabaseQuery() const override; + + void shutdown() override; + + std::vector> getTablesForBackup(const FilterByNameFunction &, const ContextPtr &) const override; + DatabaseTablesIteratorPtr getTablesIterator(ContextPtr, const FilterByNameFunction &) const override; + +protected: + StoragePtr getTableImpl(const String & url, ContextPtr context) const; + + void addTable(const std::string & table_name, StoragePtr table_storage) const; + + bool checkUrl(const std::string & url, ContextPtr context_, bool throw_on_error) const; + +private: + const String access_key_id; + const String secret_access_key; + mutable Tables loaded_tables TSA_GUARDED_BY(mutex); + Poco::Logger * log; +}; + +} // DB + +#endif diff --git a/tests/queries/0_stateless/02724_database_s3.reference b/tests/queries/0_stateless/02724_database_s3.reference new file mode 100644 index 00000000000..8a985913ff9 --- /dev/null +++ b/tests/queries/0_stateless/02724_database_s3.reference @@ -0,0 +1,18 @@ +Test 1: select from s3 +1 2 3 +4 5 6 +7 8 9 +0 0 0 +test1 +10 11 12 +13 14 15 +16 17 18 +0 0 0 +20 21 22 +23 24 25 +26 27 28 +0 0 0 +Test 2: check exceptions +OK +OK +OK diff --git a/tests/queries/0_stateless/02724_database_s3.sh b/tests/queries/0_stateless/02724_database_s3.sh new file mode 100755 index 00000000000..4f9df402040 --- /dev/null +++ b/tests/queries/0_stateless/02724_database_s3.sh @@ -0,0 +1,51 @@ +#!/usr/bin/env bash +# Tags: no-fasttest +# Tag no-fasttest: Depends on AWS + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +################# +echo "Test 1: select from s3" +${CLICKHOUSE_CLIENT} --multiline --multiquery -q """ +DROP DATABASE IF EXISTS test1; +CREATE DATABASE test1 ENGINE = S3; +USE test1; +SELECT * FROM \"http://localhost:11111/test/a.tsv\" +""" +${CLICKHOUSE_CLIENT} -q "SHOW DATABASES;" | grep test1 +${CLICKHOUSE_CLIENT} -q "DROP DATABASE test1;" + +${CLICKHOUSE_CLIENT} --multiline --multiquery -q """ +DROP DATABASE IF EXISTS test2; +CREATE DATABASE test2 ENGINE = S3('test', 'testtest'); +USE test2; +SELECT * FROM \"http://localhost:11111/test/b.tsv\" +""" +${CLICKHOUSE_CLIENT} -q "DROP DATABASE test2;" + +${CLICKHOUSE_LOCAL} --query "SELECT * FROM \"http://localhost:11111/test/c.tsv\"" + +################# +echo "Test 2: check exceptions" +${CLICKHOUSE_LOCAL} --query "SELECT * FROM \"http://localhost:11111/test/c.myext\"" 2>&1| grep -F "UNKNOWN_TABLE" > /dev/null && echo "OK" + +${CLICKHOUSE_CLIENT} --multiline --multiquery -q """ +DROP DATABASE IF EXISTS test3; +CREATE DATABASE test3 ENGINE = S3; +USE test3; +SELECT * FROM \"http://localhost:11111/test/a.myext\" +""" 2>&1| grep -F "BAD_ARGUMENTS" > /dev/null && echo "OK" + +${CLICKHOUSE_CLIENT} --multiline --multiquery -q """ +USE test3; +SELECT * FROM \"abacaba\" +""" 2>&1| grep -F "BAD_ARGUMENTS" > /dev/null && echo "OK" + +# Cleanup +${CLICKHOUSE_CLIENT} --multiline --multiquery -q """ +DROP DATABASE IF EXISTS test1; +DROP DATABASE IF EXISTS test2; +DROP DATABASE IF EXISTS test3; +""" From 3d1affbddb3de6c464f05459c1e9e5f34b6ff957 Mon Sep 17 00:00:00 2001 From: Aleksei Golub Date: Mon, 1 May 2023 12:17:10 +0300 Subject: [PATCH 0048/1072] retrigger checks From c1c69553741af4789170590f8a669d17f2dffbeb Mon Sep 17 00:00:00 2001 From: Alexander Gololobov <440544+davenger@users.noreply.github.com> Date: Wed, 26 Apr 2023 14:06:22 +0200 Subject: [PATCH 0049/1072] Deprecate delete-on-destroy.txt, do not create it any more --- src/Storages/MergeTree/IMergeTreeDataPart.cpp | 17 +---------------- src/Storages/MergeTree/IMergeTreeDataPart.h | 7 +++++-- src/Storages/MergeTree/MergeTreeData.cpp | 7 +++---- 3 files changed, 9 insertions(+), 22 deletions(-) diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 148cbf93948..d7f2f3ca7c7 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -953,24 +953,9 @@ void IMergeTreeDataPart::writeVersionMetadata(const VersionMetadata & version_, } } -void IMergeTreeDataPart::writeDeleteOnDestroyMarker() -{ - static constexpr auto marker_path = "delete-on-destroy.txt"; - - try - { - getDataPartStorage().createFile(marker_path); - } - catch (Poco::Exception & e) - { - LOG_ERROR(storage.log, "{} (while creating DeleteOnDestroy marker: {})", - e.what(), (fs::path(getDataPartStorage().getFullPath()) / marker_path).string()); - } -} - void IMergeTreeDataPart::removeDeleteOnDestroyMarker() { - getDataPartStorage().removeFileIfExists("delete-on-destroy.txt"); + getDataPartStorage().removeFileIfExists(DELETE_ON_DESTROY_MARKER_FILE_NAME_DEPRECATED); } void IMergeTreeDataPart::removeVersionMetadata() diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index a36634d2cf9..f7bcaa263d6 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -381,7 +381,8 @@ public: /// default will be stored in this file. static inline constexpr auto DEFAULT_COMPRESSION_CODEC_FILE_NAME = "default_compression_codec.txt"; - static inline constexpr auto DELETE_ON_DESTROY_MARKER_FILE_NAME = "delete-on-destroy.txt"; + /// "delete-on-destroy.txt" is deprecated. It is no longer being created, only is removed. + static inline constexpr auto DELETE_ON_DESTROY_MARKER_FILE_NAME_DEPRECATED = "delete-on-destroy.txt"; static inline constexpr auto UUID_FILE_NAME = "uuid.txt"; @@ -456,8 +457,10 @@ public: void writeChecksums(const MergeTreeDataPartChecksums & checksums_, const WriteSettings & settings); - void writeDeleteOnDestroyMarker(); + /// "delete-on-destroy.txt" is deprecated. It is no longer being created, only is removed. + /// TODO: remove this method after some time. void removeDeleteOnDestroyMarker(); + /// It may look like a stupid joke. but these two methods are absolutely unrelated. /// This one is about removing file with metadata about part version (for transactions) void removeVersionMetadata(); diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 83f5c0d359c..2def6fb08d3 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -1214,7 +1214,7 @@ MergeTreeData::LoadPartResult MergeTreeData::loadDataPart( .build(); String part_path = fs::path(relative_data_path) / part_name; - String marker_path = fs::path(part_path) / IMergeTreeDataPart::DELETE_ON_DESTROY_MARKER_FILE_NAME; + String marker_path = fs::path(part_path) / IMergeTreeDataPart::DELETE_ON_DESTROY_MARKER_FILE_NAME_DEPRECATED; if (part_disk_ptr->exists(marker_path)) { @@ -4410,7 +4410,6 @@ void MergeTreeData::swapActivePart(MergeTreeData::DataPartPtr part_copy) /// All other locks are taken in StorageReplicatedMergeTree lockSharedData(*part_copy); - asMutableDeletingPart(original_active_part)->writeDeleteOnDestroyMarker(); return; } } @@ -7174,7 +7173,7 @@ std::pair MergeTreeData::cloneAn for (auto it = src_part->getDataPartStorage().iterate(); it->isValid(); it->next()) { if (!files_to_copy_instead_of_hardlinks.contains(it->name()) - && it->name() != IMergeTreeDataPart::DELETE_ON_DESTROY_MARKER_FILE_NAME + && it->name() != IMergeTreeDataPart::DELETE_ON_DESTROY_MARKER_FILE_NAME_DEPRECATED && it->name() != IMergeTreeDataPart::TXN_VERSION_METADATA_FILE_NAME) { hardlinked_files->hardlinks_from_source_part.insert(it->name()); @@ -7189,7 +7188,7 @@ std::pair MergeTreeData::cloneAn { auto file_name_with_projection_prefix = fs::path(projection_storage.getPartDirectory()) / it->name(); if (!files_to_copy_instead_of_hardlinks.contains(file_name_with_projection_prefix) - && it->name() != IMergeTreeDataPart::DELETE_ON_DESTROY_MARKER_FILE_NAME + && it->name() != IMergeTreeDataPart::DELETE_ON_DESTROY_MARKER_FILE_NAME_DEPRECATED && it->name() != IMergeTreeDataPart::TXN_VERSION_METADATA_FILE_NAME) { hardlinked_files->hardlinks_from_source_part.insert(file_name_with_projection_prefix); From 64d232f1aa584b3eba5abf9fe02bfa9b0535701c Mon Sep 17 00:00:00 2001 From: alekseygolub Date: Mon, 1 May 2023 18:00:26 +0000 Subject: [PATCH 0050/1072] Fix memory leak --- src/Interpreters/DatabaseCatalog.cpp | 2 +- tests/queries/0_stateless/02724_database_s3.reference | 6 +++--- tests/queries/0_stateless/02724_database_s3.sh | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/Interpreters/DatabaseCatalog.cpp b/src/Interpreters/DatabaseCatalog.cpp index f9e74fadcbd..129323cd6b3 100644 --- a/src/Interpreters/DatabaseCatalog.cpp +++ b/src/Interpreters/DatabaseCatalog.cpp @@ -346,7 +346,7 @@ DatabaseAndTable DatabaseCatalog::getTableImpl( catch (const Exception & e) { if (exception) - exception->emplace(*e.clone()); + exception->emplace(e); } if (!table) diff --git a/tests/queries/0_stateless/02724_database_s3.reference b/tests/queries/0_stateless/02724_database_s3.reference index 8a985913ff9..b3800a27305 100644 --- a/tests/queries/0_stateless/02724_database_s3.reference +++ b/tests/queries/0_stateless/02724_database_s3.reference @@ -8,9 +8,9 @@ test1 13 14 15 16 17 18 0 0 0 -20 21 22 -23 24 25 -26 27 28 +10 11 12 +13 14 15 +16 17 18 0 0 0 Test 2: check exceptions OK diff --git a/tests/queries/0_stateless/02724_database_s3.sh b/tests/queries/0_stateless/02724_database_s3.sh index 4f9df402040..9b539407884 100755 --- a/tests/queries/0_stateless/02724_database_s3.sh +++ b/tests/queries/0_stateless/02724_database_s3.sh @@ -25,7 +25,7 @@ SELECT * FROM \"http://localhost:11111/test/b.tsv\" """ ${CLICKHOUSE_CLIENT} -q "DROP DATABASE test2;" -${CLICKHOUSE_LOCAL} --query "SELECT * FROM \"http://localhost:11111/test/c.tsv\"" +${CLICKHOUSE_LOCAL} --query "SELECT * FROM \"http://localhost:11111/test/b.tsv\"" ################# echo "Test 2: check exceptions" From 979b68a488c88bad53a44e4c9a8c525572091de5 Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Mon, 1 May 2023 18:24:30 +0000 Subject: [PATCH 0051/1072] fix --- src/Common/OpenTelemetryTraceContext.cpp | 2 +- src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp | 7 +------ src/Storages/MergeTree/MergeTreeReadPool.cpp | 3 ++- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/src/Common/OpenTelemetryTraceContext.cpp b/src/Common/OpenTelemetryTraceContext.cpp index 0d89c581318..b68795814fb 100644 --- a/src/Common/OpenTelemetryTraceContext.cpp +++ b/src/Common/OpenTelemetryTraceContext.cpp @@ -120,7 +120,7 @@ SpanHolder::SpanHolder(std::string_view _operation_name, SpanKind _kind) this->start_time_us = std::chrono::duration_cast(std::chrono::system_clock::now().time_since_epoch()).count(); - /// Add new initialization here + this->addAttribute("clickhouse.thread_id", getThreadId()); } catch (...) { diff --git a/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp b/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp index b186b9b1d28..de3edf29086 100644 --- a/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp +++ b/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp @@ -152,11 +152,6 @@ CachedOnDiskReadBufferFromFile::getCacheReadBuffer(const FileSegment & file_segm if (use_external_buffer) local_read_settings.local_fs_buffer_size = 0; - // The buffer will unnecessarily allocate a Memory of size local_fs_buffer_size, which will then - // most likely be unused because we're swap()ping our own internal_buffer into - // implementation_buffer before each read. But we can't just set local_fs_buffer_size = 0 here - // because some buffer implementations actually use that memory (e.g. for prefetching). - auto buf = createReadBufferFromFileBase(path, local_read_settings); if (getFileSizeFromReadBuffer(*buf) == 0) @@ -827,7 +822,7 @@ bool CachedOnDiskReadBufferFromFile::nextImplStep() } } - if (use_external_buffer && initialized) + if (use_external_buffer && !internal_buffer.empty()) internal_buffer.resize(original_buffer_size); chassert(!file_segment.isDownloader()); diff --git a/src/Storages/MergeTree/MergeTreeReadPool.cpp b/src/Storages/MergeTree/MergeTreeReadPool.cpp index 931a1be6b30..b3e3cf1361e 100644 --- a/src/Storages/MergeTree/MergeTreeReadPool.cpp +++ b/src/Storages/MergeTree/MergeTreeReadPool.cpp @@ -71,7 +71,8 @@ MergeTreeReadPool::MergeTreeReadPool( { const auto min_bytes_per_task = settings.merge_tree_min_bytes_per_task_for_remote_reading; const auto avg_mark_bytes = std::max(total_compressed_bytes / total_marks, 1); - const auto heuristic_min_marks = std::min(total_marks / threads_, min_bytes_per_task / avg_mark_bytes); + /// We're taking min here because number of tasks shouldn't be too low - it will make task stealing impossible. + const auto heuristic_min_marks = std::min(total_marks / threads_ / 8, min_bytes_per_task / avg_mark_bytes); if (heuristic_min_marks > min_marks_for_concurrent_read) { min_marks_for_concurrent_read = heuristic_min_marks; From 95522ad7a6486bdbe5861c4f65c3a0ffe9610372 Mon Sep 17 00:00:00 2001 From: alekseygolub Date: Mon, 1 May 2023 21:46:17 +0000 Subject: [PATCH 0052/1072] Added DatabaseHDFS --- programs/local/LocalServer.cpp | 7 + src/Databases/DatabaseFactory.cpp | 31 ++- src/Databases/DatabaseHDFS.cpp | 228 ++++++++++++++++++ src/Databases/DatabaseHDFS.h | 65 +++++ .../0_stateless/02725_database_hdfs.reference | 16 ++ .../0_stateless/02725_database_hdfs.sh | 66 +++++ 6 files changed, 411 insertions(+), 2 deletions(-) create mode 100644 src/Databases/DatabaseHDFS.cpp create mode 100644 src/Databases/DatabaseHDFS.h create mode 100644 tests/queries/0_stateless/02725_database_hdfs.reference create mode 100755 tests/queries/0_stateless/02725_database_hdfs.sh diff --git a/programs/local/LocalServer.cpp b/programs/local/LocalServer.cpp index 215a92e1944..0cf94892171 100644 --- a/programs/local/LocalServer.cpp +++ b/programs/local/LocalServer.cpp @@ -65,6 +65,10 @@ #include #endif +#if USE_HDFS +#include +#endif + namespace fs = std::filesystem; @@ -163,6 +167,9 @@ static DatabasePtr createClickHouseLocalDatabaseOverlay(const String & name_, Co databaseCombiner->registerNextDatabase(std::make_shared(name_, context_)); #if USE_AWS_S3 databaseCombiner->registerNextDatabase(std::make_shared(name_, "", "", context_)); +#endif +#if USE_HDFS + databaseCombiner->registerNextDatabase(std::make_shared(name_, "", context_)); #endif return databaseCombiner; } diff --git a/src/Databases/DatabaseFactory.cpp b/src/Databases/DatabaseFactory.cpp index b21435527a5..5c4256c8a9f 100644 --- a/src/Databases/DatabaseFactory.cpp +++ b/src/Databases/DatabaseFactory.cpp @@ -53,6 +53,10 @@ #include #endif +#if USE_HDFS +#include +#endif + namespace fs = std::filesystem; namespace DB @@ -137,13 +141,13 @@ DatabasePtr DatabaseFactory::getImpl(const ASTCreateQuery & create, const String static const std::unordered_set database_engines{"Ordinary", "Atomic", "Memory", "Dictionary", "Lazy", "Replicated", "MySQL", "MaterializeMySQL", "MaterializedMySQL", - "PostgreSQL", "MaterializedPostgreSQL", "SQLite", "Filesystem", "S3"}; + "PostgreSQL", "MaterializedPostgreSQL", "SQLite", "Filesystem", "S3", "HDFS"}; if (!database_engines.contains(engine_name)) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Database engine name `{}` does not exist", engine_name); static const std::unordered_set engines_with_arguments{"MySQL", "MaterializeMySQL", "MaterializedMySQL", - "Lazy", "Replicated", "PostgreSQL", "MaterializedPostgreSQL", "SQLite", "Filesystem", "S3"}; + "Lazy", "Replicated", "PostgreSQL", "MaterializedPostgreSQL", "SQLite", "Filesystem", "S3", "HDFS"}; static const std::unordered_set engines_with_table_overrides{"MaterializeMySQL", "MaterializedMySQL", "MaterializedPostgreSQL"}; bool engine_may_have_arguments = engines_with_arguments.contains(engine_name); @@ -437,6 +441,7 @@ DatabasePtr DatabaseFactory::getImpl(const ASTCreateQuery & create, const String return std::make_shared(context, engine_define, create.attach, database_path); } #endif + else if (engine_name == "Filesystem") { const ASTFunction * engine = engine_define->engine; @@ -455,6 +460,7 @@ DatabasePtr DatabaseFactory::getImpl(const ASTCreateQuery & create, const String return std::make_shared(database_name, init_path, context); } + #if USE_AWS_S3 else if (engine_name == "S3") { @@ -477,6 +483,27 @@ DatabasePtr DatabaseFactory::getImpl(const ASTCreateQuery & create, const String } #endif +#if USE_HDFS + else if (engine_name == "HDFS") + { + const ASTFunction * engine = engine_define->engine; + + /// If source_url is empty, then table name must contain full url + std::string source_url; + + if (engine->arguments && !engine->arguments->children.empty()) + { + if (engine->arguments->children.size() != 1) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "HDFS database requires at most 1 argument: source_url"); + + const auto & arguments = engine->arguments->children; + source_url = safeGetLiteralValue(arguments[0], engine_name); + } + + return std::make_shared(database_name, source_url, context); + } +#endif + throw Exception(ErrorCodes::UNKNOWN_DATABASE_ENGINE, "Unknown database engine: {}", engine_name); } diff --git a/src/Databases/DatabaseHDFS.cpp b/src/Databases/DatabaseHDFS.cpp new file mode 100644 index 00000000000..39c3f955bf5 --- /dev/null +++ b/src/Databases/DatabaseHDFS.cpp @@ -0,0 +1,228 @@ +#include "config.h" + +#if USE_HDFS + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +namespace fs = std::filesystem; + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; + extern const int UNKNOWN_TABLE; + extern const int BAD_ARGUMENTS; + extern const int FILE_DOESNT_EXIST; + extern const int UNACCEPTABLE_URL; + extern const int ACCESS_DENIED; + extern const int DATABASE_ACCESS_DENIED; + extern const int HDFS_ERROR; + extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; +} + +static constexpr std::string_view HDFS_HOST_REGEXP = "^hdfs://[^/]*"; + + +DatabaseHDFS::DatabaseHDFS(const String & name_, const String & source_url, ContextPtr context_) + : IDatabase(name_) + , WithContext(context_->getGlobalContext()) + , source(source_url) + , log(&Poco::Logger::get("DatabaseHDFS(" + name_ + ")")) +{ + if (!source.empty()) + { + if (!re2::RE2::FullMatch(source, std::string(HDFS_HOST_REGEXP))) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Bad hdfs host: {}. It should have structure 'hdfs://:'", source); + context_->getGlobalContext()->getRemoteHostFilter().checkURL(Poco::URI(source)); + } +} + +void DatabaseHDFS::addTable(const std::string & table_name, StoragePtr table_storage) const +{ + std::lock_guard lock(mutex); + auto [_, inserted] = loaded_tables.emplace(table_name, table_storage); + if (!inserted) + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Table with name `{}` already exists in database `{}` (engine {})", + table_name, + getDatabaseName(), + getEngineName()); +} + +std::string DatabaseHDFS::getTablePath(const std::string & table_name) const +{ + if (table_name.starts_with("hdfs://")) + return table_name; + if (source.empty()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Bad hdfs url: {}. It should have structure 'hdfs://:/path'", table_name); + return (fs::path(source) / table_name).string(); +} + +bool DatabaseHDFS::checkUrl(const std::string & url, ContextPtr context_, bool throw_on_error) const +{ + try + { + checkHDFSURL(url); + context_->getGlobalContext()->getRemoteHostFilter().checkURL(Poco::URI(url)); + } + catch (...) + { + if (throw_on_error) + throw; + return false; + } + + return true; +} + +bool DatabaseHDFS::isTableExist(const String & name, ContextPtr context_) const +{ + std::lock_guard lock(mutex); + if (loaded_tables.find(name) != loaded_tables.end()) + return true; + + return checkUrl(name, context_, false); +} + +StoragePtr DatabaseHDFS::getTableImpl(const String & name, ContextPtr context_) const +{ + // Check if the table exists in the loaded tables map + { + std::lock_guard lock(mutex); + auto it = loaded_tables.find(name); + if (it != loaded_tables.end()) + return it->second; + } + + auto url = getTablePath(name); + + checkUrl(url, context_, true); + + // call TableFunctionHDFS + auto args = makeASTFunction("hdfs", std::make_shared(url)); + + auto table_function = TableFunctionFactory::instance().get(args, context_); + if (!table_function) + return nullptr; + + // TableFunctionHDFS throws exceptions, if table cannot be created + auto table_storage = table_function->execute(args, context_, name); + if (table_storage) + addTable(name, table_storage); + + return table_storage; +} + +StoragePtr DatabaseHDFS::getTable(const String & name, ContextPtr context_) const +{ + // rethrow all exceptions from TableFunctionHDFS to show correct error to user + if (auto storage = getTableImpl(name, context_)) + return storage; + throw Exception(ErrorCodes::UNKNOWN_TABLE, "Table {}.{} doesn't exist", backQuoteIfNeed(getDatabaseName()), backQuoteIfNeed(name)); +} + +StoragePtr DatabaseHDFS::tryGetTable(const String & name, ContextPtr context_) const +{ + try + { + return getTableImpl(name, context_); + } + catch (const Exception & e) + { + // Ignore exceptions thrown by TableFunctionHDFS, which indicate that there is no table + if (e.code() == ErrorCodes::BAD_ARGUMENTS) + return nullptr; + if (e.code() == ErrorCodes::ACCESS_DENIED) + return nullptr; + if (e.code() == ErrorCodes::DATABASE_ACCESS_DENIED) + return nullptr; + if (e.code() == ErrorCodes::FILE_DOESNT_EXIST) + return nullptr; + if (e.code() == ErrorCodes::UNACCEPTABLE_URL) + return nullptr; + if (e.code() == ErrorCodes::HDFS_ERROR) + return nullptr; + if (e.code() == ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE) + return nullptr; + throw; + } + catch (const Poco::URISyntaxException &) + { + return nullptr; + } +} + +ASTPtr DatabaseHDFS::getCreateDatabaseQuery() const +{ + auto settings = getContext()->getSettingsRef(); + ParserCreateQuery parser; + + const String query = fmt::format("CREATE DATABASE {} ENGINE = HDFS('{}')", backQuoteIfNeed(getDatabaseName()), source); + ASTPtr ast = parseQuery(parser, query.data(), query.data() + query.size(), "", 0, settings.max_parser_depth); + + if (const auto database_comment = getDatabaseComment(); !database_comment.empty()) + { + auto & ast_create_query = ast->as(); + ast_create_query.set(ast_create_query.comment, std::make_shared(database_comment)); + } + + return ast; +} + +void DatabaseHDFS::shutdown() +{ + Tables tables_snapshot; + { + std::lock_guard lock(mutex); + tables_snapshot = loaded_tables; + } + + for (const auto & kv : tables_snapshot) + { + auto table_id = kv.second->getStorageID(); + kv.second->flushAndShutdown(); + } + + std::lock_guard lock(mutex); + loaded_tables.clear(); +} + +/** + * Returns an empty vector because the database is read-only and no tables can be backed up + */ +std::vector> DatabaseHDFS::getTablesForBackup(const FilterByNameFunction &, const ContextPtr &) const +{ + return {}; +} + +/** + * + * Returns an empty iterator because the database does not have its own tables + * But only caches them for quick access + */ +DatabaseTablesIteratorPtr DatabaseHDFS::getTablesIterator(ContextPtr, const FilterByNameFunction &) const +{ + return std::make_unique(Tables{}, getDatabaseName()); +} + +} // DB + +#endif diff --git a/src/Databases/DatabaseHDFS.h b/src/Databases/DatabaseHDFS.h new file mode 100644 index 00000000000..4e2b8578fcd --- /dev/null +++ b/src/Databases/DatabaseHDFS.h @@ -0,0 +1,65 @@ +#pragma once + +#include "config.h" + +#if USE_HDFS + +#include +#include +#include +#include +#include + +namespace DB +{ + +class Context; + +/** + * DatabaseHDFS allows to interact with files stored on the file system + * Uses TableFunctionHDFS to implicitly load file when a user requests the table, and provides read-only access to the data in the file + * Tables are cached inside the database for quick access + */ +class DatabaseHDFS : public IDatabase, protected WithContext +{ +public: + DatabaseHDFS(const String & name, const String & source_url, ContextPtr context); + + String getEngineName() const override { return "S3"; } + + bool isTableExist(const String & name, ContextPtr context) const override; + + StoragePtr getTable(const String & name, ContextPtr context) const override; + + StoragePtr tryGetTable(const String & name, ContextPtr context) const override; + + bool empty() const override { return true; } + + bool isReadOnly() const override { return true; } + + ASTPtr getCreateDatabaseQuery() const override; + + void shutdown() override; + + std::vector> getTablesForBackup(const FilterByNameFunction &, const ContextPtr &) const override; + DatabaseTablesIteratorPtr getTablesIterator(ContextPtr, const FilterByNameFunction &) const override; + +protected: + StoragePtr getTableImpl(const String & url, ContextPtr context) const; + + void addTable(const std::string & table_name, StoragePtr table_storage) const; + + bool checkUrl(const std::string & name, ContextPtr context_, bool throw_on_error) const; + + std::string getTablePath(const std::string & table_name) const; + +private: + const String source; + + mutable Tables loaded_tables TSA_GUARDED_BY(mutex); + Poco::Logger * log; +}; + +} // DB + +#endif diff --git a/tests/queries/0_stateless/02725_database_hdfs.reference b/tests/queries/0_stateless/02725_database_hdfs.reference new file mode 100644 index 00000000000..2a2e6c20aaa --- /dev/null +++ b/tests/queries/0_stateless/02725_database_hdfs.reference @@ -0,0 +1,16 @@ +Test 1: select from hdfs database +1 2 3 +test1 +1 2 3 +test2 +4 5 6 +Test 2: check exceptions +OK0 +OK1 +OK2 +OK3 +OK4 +OK5 +OK6 +OK7 +OK8 diff --git a/tests/queries/0_stateless/02725_database_hdfs.sh b/tests/queries/0_stateless/02725_database_hdfs.sh new file mode 100755 index 00000000000..ea16dd4024c --- /dev/null +++ b/tests/queries/0_stateless/02725_database_hdfs.sh @@ -0,0 +1,66 @@ +#!/usr/bin/env bash +# Tags: no-fasttest, use-hdfs + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +# Prepare data +${CLICKHOUSE_CLIENT} -q "insert into table function hdfs('hdfs://localhost:12222/test_02725_1.tsv', 'TSV', 'column1 UInt32, column2 UInt32, column3 UInt32') select 1, 2, 3 settings hdfs_truncate_on_insert=1;" +${CLICKHOUSE_CLIENT} -q "insert into table function hdfs('hdfs://localhost:12222/test_02725_2.tsv', 'TSV', 'column1 UInt32, column2 UInt32, column3 UInt32') select 4, 5, 6 settings hdfs_truncate_on_insert=1;" + +################# +echo "Test 1: select from hdfs database" + +# Database without specific host +${CLICKHOUSE_CLIENT} --multiline --multiquery -q """ +DROP DATABASE IF EXISTS test1; +CREATE DATABASE test1 ENGINE = HDFS; +USE test1; +SELECT * FROM \"hdfs://localhost:12222/test_02725_1.tsv\" +""" +${CLICKHOUSE_CLIENT} -q "SHOW DATABASES;" | grep test1 + +# Database with host +${CLICKHOUSE_CLIENT} --multiline --multiquery -q """ +DROP DATABASE IF EXISTS test2; +CREATE DATABASE test2 ENGINE = HDFS('hdfs://localhost:12222'); +USE test2; +SELECT * FROM \"test_02725_1.tsv\" +""" +${CLICKHOUSE_CLIENT} -q "SHOW DATABASES;" | grep test2 + +# Check implicit call in clickhouse-local +${CLICKHOUSE_LOCAL} --query "SELECT * FROM \"hdfs://localhost:12222/test_02725_2.tsv\"" + +################# +echo "Test 2: check exceptions" +${CLICKHOUSE_LOCAL} --query "SELECT * FROM \"hdfs://localhost:12222/file.myext\"" 2>&1| grep -F "UNKNOWN_TABLE" > /dev/null && echo "OK0" +${CLICKHOUSE_LOCAL} --query "SELECT * FROM \"hdfs://localhost:12222/test_02725_3.tsv\"" 2>&1| grep -F "UNKNOWN_TABLE" > /dev/null && echo "OK1" +${CLICKHOUSE_LOCAL} --query "SELECT * FROM \"hdfs://localhost:12222\"" 2>&1| grep -F "UNKNOWN_TABLE" > /dev/null && echo "OK2" + +${CLICKHOUSE_CLIENT} --multiline --multiquery -q """ +DROP DATABASE IF EXISTS test3; +CREATE DATABASE test3 ENGINE = HDFS('abacaba'); +""" 2>&1| grep -F "BAD_ARGUMENTS" > /dev/null && echo "OK3" + +${CLICKHOUSE_CLIENT} --multiline --multiquery -q """ +DROP DATABASE IF EXISTS test4; +CREATE DATABASE test4 ENGINE = HDFS; +USE test4; +SELECT * FROM \"abacaba/file.tsv\" +""" 2>&1| grep -F "BAD_ARGUMENTS" > /dev/null && echo "OK4" + +${CLICKHOUSE_CLIENT} -q "SELECT * FROM test4.\`http://localhost:11111/test/a.tsv\`" 2>&1| grep -F "BAD_ARGUMENTS" > /dev/null && echo "OK5" +${CLICKHOUSE_CLIENT} --query "SELECT * FROM test4.\`hdfs://localhost:12222/file.myext\`" 2>&1| grep -F "BAD_ARGUMENTS" > /dev/null && echo "OK6" +${CLICKHOUSE_CLIENT} --query "SELECT * FROM test4.\`hdfs://localhost:12222/test_02725_3.tsv\`" 2>&1| grep -F "CANNOT_EXTRACT_TABLE_STRUCTURE" > /dev/null && echo "OK7" +${CLICKHOUSE_CLIENT} --query "SELECT * FROM test4.\`hdfs://localhost:12222\`" 2>&1| grep -F "BAD_ARGUMENTS" > /dev/null && echo "OK8" + + +# Cleanup +${CLICKHOUSE_CLIENT} --multiline --multiquery -q """ +DROP DATABASE IF EXISTS test1; +DROP DATABASE IF EXISTS test2; +DROP DATABASE IF EXISTS test3; +DROP DATABASE IF EXISTS test4; +""" \ No newline at end of file From 82bb1e8bf2a3183179938629cc8f6aab3d876e87 Mon Sep 17 00:00:00 2001 From: alekseygolub Date: Tue, 2 May 2023 18:51:35 +0000 Subject: [PATCH 0053/1072] Fix build and try fix tests --- src/Databases/DatabaseHDFS.h | 4 ++-- tests/queries/0_stateless/02724_database_s3.sh | 2 +- tests/queries/0_stateless/02725_database_hdfs.sh | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Databases/DatabaseHDFS.h b/src/Databases/DatabaseHDFS.h index 4e2b8578fcd..9a506c5c8ac 100644 --- a/src/Databases/DatabaseHDFS.h +++ b/src/Databases/DatabaseHDFS.h @@ -45,11 +45,11 @@ public: DatabaseTablesIteratorPtr getTablesIterator(ContextPtr, const FilterByNameFunction &) const override; protected: - StoragePtr getTableImpl(const String & url, ContextPtr context) const; + StoragePtr getTableImpl(const String & name, ContextPtr context) const; void addTable(const std::string & table_name, StoragePtr table_storage) const; - bool checkUrl(const std::string & name, ContextPtr context_, bool throw_on_error) const; + bool checkUrl(const std::string & url, ContextPtr context_, bool throw_on_error) const; std::string getTablePath(const std::string & table_name) const; diff --git a/tests/queries/0_stateless/02724_database_s3.sh b/tests/queries/0_stateless/02724_database_s3.sh index 9b539407884..af858d140d7 100755 --- a/tests/queries/0_stateless/02724_database_s3.sh +++ b/tests/queries/0_stateless/02724_database_s3.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Tags: no-fasttest +# Tags: no-fasttest, no-parallel # Tag no-fasttest: Depends on AWS CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) diff --git a/tests/queries/0_stateless/02725_database_hdfs.sh b/tests/queries/0_stateless/02725_database_hdfs.sh index ea16dd4024c..8d4e982504a 100755 --- a/tests/queries/0_stateless/02725_database_hdfs.sh +++ b/tests/queries/0_stateless/02725_database_hdfs.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Tags: no-fasttest, use-hdfs +# Tags: no-fasttest, use-hdfs, no-parallel CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh From 58cb6c7837872ae4eb46eed84d5aa0d75607d661 Mon Sep 17 00:00:00 2001 From: alekseygolub Date: Tue, 2 May 2023 19:57:36 +0000 Subject: [PATCH 0054/1072] S3, HDFS only for explicit creation --- programs/local/LocalServer.cpp | 14 -------------- .../0_stateless/02724_database_s3.reference | 5 ----- tests/queries/0_stateless/02724_database_s3.sh | 4 ---- .../0_stateless/02725_database_hdfs.reference | 4 ---- .../queries/0_stateless/02725_database_hdfs.sh | 18 ++++++------------ 5 files changed, 6 insertions(+), 39 deletions(-) diff --git a/programs/local/LocalServer.cpp b/programs/local/LocalServer.cpp index 0cf94892171..b413483686a 100644 --- a/programs/local/LocalServer.cpp +++ b/programs/local/LocalServer.cpp @@ -61,14 +61,6 @@ # include #endif -#if USE_AWS_S3 -#include -#endif - -#if USE_HDFS -#include -#endif - namespace fs = std::filesystem; @@ -165,12 +157,6 @@ static DatabasePtr createClickHouseLocalDatabaseOverlay(const String & name_, Co auto databaseCombiner = std::make_shared(name_, context_); databaseCombiner->registerNextDatabase(std::make_shared(name_, "", context_)); databaseCombiner->registerNextDatabase(std::make_shared(name_, context_)); -#if USE_AWS_S3 - databaseCombiner->registerNextDatabase(std::make_shared(name_, "", "", context_)); -#endif -#if USE_HDFS - databaseCombiner->registerNextDatabase(std::make_shared(name_, "", context_)); -#endif return databaseCombiner; } diff --git a/tests/queries/0_stateless/02724_database_s3.reference b/tests/queries/0_stateless/02724_database_s3.reference index b3800a27305..72ba0e240b1 100644 --- a/tests/queries/0_stateless/02724_database_s3.reference +++ b/tests/queries/0_stateless/02724_database_s3.reference @@ -8,11 +8,6 @@ test1 13 14 15 16 17 18 0 0 0 -10 11 12 -13 14 15 -16 17 18 -0 0 0 Test 2: check exceptions OK OK -OK diff --git a/tests/queries/0_stateless/02724_database_s3.sh b/tests/queries/0_stateless/02724_database_s3.sh index af858d140d7..2758580a355 100755 --- a/tests/queries/0_stateless/02724_database_s3.sh +++ b/tests/queries/0_stateless/02724_database_s3.sh @@ -25,12 +25,8 @@ SELECT * FROM \"http://localhost:11111/test/b.tsv\" """ ${CLICKHOUSE_CLIENT} -q "DROP DATABASE test2;" -${CLICKHOUSE_LOCAL} --query "SELECT * FROM \"http://localhost:11111/test/b.tsv\"" - ################# echo "Test 2: check exceptions" -${CLICKHOUSE_LOCAL} --query "SELECT * FROM \"http://localhost:11111/test/c.myext\"" 2>&1| grep -F "UNKNOWN_TABLE" > /dev/null && echo "OK" - ${CLICKHOUSE_CLIENT} --multiline --multiquery -q """ DROP DATABASE IF EXISTS test3; CREATE DATABASE test3 ENGINE = S3; diff --git a/tests/queries/0_stateless/02725_database_hdfs.reference b/tests/queries/0_stateless/02725_database_hdfs.reference index 2a2e6c20aaa..ef8adae2bbc 100644 --- a/tests/queries/0_stateless/02725_database_hdfs.reference +++ b/tests/queries/0_stateless/02725_database_hdfs.reference @@ -3,7 +3,6 @@ Test 1: select from hdfs database test1 1 2 3 test2 -4 5 6 Test 2: check exceptions OK0 OK1 @@ -11,6 +10,3 @@ OK2 OK3 OK4 OK5 -OK6 -OK7 -OK8 diff --git a/tests/queries/0_stateless/02725_database_hdfs.sh b/tests/queries/0_stateless/02725_database_hdfs.sh index 8d4e982504a..a78f3e6bbdc 100755 --- a/tests/queries/0_stateless/02725_database_hdfs.sh +++ b/tests/queries/0_stateless/02725_database_hdfs.sh @@ -30,31 +30,25 @@ SELECT * FROM \"test_02725_1.tsv\" """ ${CLICKHOUSE_CLIENT} -q "SHOW DATABASES;" | grep test2 -# Check implicit call in clickhouse-local -${CLICKHOUSE_LOCAL} --query "SELECT * FROM \"hdfs://localhost:12222/test_02725_2.tsv\"" - ################# echo "Test 2: check exceptions" -${CLICKHOUSE_LOCAL} --query "SELECT * FROM \"hdfs://localhost:12222/file.myext\"" 2>&1| grep -F "UNKNOWN_TABLE" > /dev/null && echo "OK0" -${CLICKHOUSE_LOCAL} --query "SELECT * FROM \"hdfs://localhost:12222/test_02725_3.tsv\"" 2>&1| grep -F "UNKNOWN_TABLE" > /dev/null && echo "OK1" -${CLICKHOUSE_LOCAL} --query "SELECT * FROM \"hdfs://localhost:12222\"" 2>&1| grep -F "UNKNOWN_TABLE" > /dev/null && echo "OK2" ${CLICKHOUSE_CLIENT} --multiline --multiquery -q """ DROP DATABASE IF EXISTS test3; CREATE DATABASE test3 ENGINE = HDFS('abacaba'); -""" 2>&1| grep -F "BAD_ARGUMENTS" > /dev/null && echo "OK3" +""" 2>&1| grep -F "BAD_ARGUMENTS" > /dev/null && echo "OK0" ${CLICKHOUSE_CLIENT} --multiline --multiquery -q """ DROP DATABASE IF EXISTS test4; CREATE DATABASE test4 ENGINE = HDFS; USE test4; SELECT * FROM \"abacaba/file.tsv\" -""" 2>&1| grep -F "BAD_ARGUMENTS" > /dev/null && echo "OK4" +""" 2>&1| grep -F "BAD_ARGUMENTS" > /dev/null && echo "OK1" -${CLICKHOUSE_CLIENT} -q "SELECT * FROM test4.\`http://localhost:11111/test/a.tsv\`" 2>&1| grep -F "BAD_ARGUMENTS" > /dev/null && echo "OK5" -${CLICKHOUSE_CLIENT} --query "SELECT * FROM test4.\`hdfs://localhost:12222/file.myext\`" 2>&1| grep -F "BAD_ARGUMENTS" > /dev/null && echo "OK6" -${CLICKHOUSE_CLIENT} --query "SELECT * FROM test4.\`hdfs://localhost:12222/test_02725_3.tsv\`" 2>&1| grep -F "CANNOT_EXTRACT_TABLE_STRUCTURE" > /dev/null && echo "OK7" -${CLICKHOUSE_CLIENT} --query "SELECT * FROM test4.\`hdfs://localhost:12222\`" 2>&1| grep -F "BAD_ARGUMENTS" > /dev/null && echo "OK8" +${CLICKHOUSE_CLIENT} -q "SELECT * FROM test4.\`http://localhost:11111/test/a.tsv\`" 2>&1| grep -F "BAD_ARGUMENTS" > /dev/null && echo "OK2" +${CLICKHOUSE_CLIENT} --query "SELECT * FROM test4.\`hdfs://localhost:12222/file.myext\`" 2>&1| grep -F "BAD_ARGUMENTS" > /dev/null && echo "OK3" +${CLICKHOUSE_CLIENT} --query "SELECT * FROM test4.\`hdfs://localhost:12222/test_02725_3.tsv\`" 2>&1| grep -F "CANNOT_EXTRACT_TABLE_STRUCTURE" > /dev/null && echo "OK4" +${CLICKHOUSE_CLIENT} --query "SELECT * FROM test4.\`hdfs://localhost:12222\`" 2>&1| grep -F "BAD_ARGUMENTS" > /dev/null && echo "OK5" # Cleanup From 30d216f863583e7216b5cc5f9144f33068092d44 Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Wed, 3 May 2023 13:49:46 +0000 Subject: [PATCH 0055/1072] fix --- src/Storages/MergeTree/MergeTreeReadPool.cpp | 43 +++++++++++--------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeReadPool.cpp b/src/Storages/MergeTree/MergeTreeReadPool.cpp index b3e3cf1361e..34e270fdd24 100644 --- a/src/Storages/MergeTree/MergeTreeReadPool.cpp +++ b/src/Storages/MergeTree/MergeTreeReadPool.cpp @@ -1,10 +1,10 @@ -#include -#include #include +#include +#include +#include #include #include #include -#include namespace ProfileEvents @@ -56,26 +56,29 @@ MergeTreeReadPool::MergeTreeReadPool( , backoff_settings{context_->getSettingsRef()} , backoff_state{threads_} { - const auto & settings = context_->getSettingsRef(); - - size_t total_compressed_bytes = 0; - size_t total_marks = 0; - for (const auto & part : parts_ranges) + if (std::ranges::count(is_part_on_remote_disk, true)) { - total_compressed_bytes += getApproxSizeOfPart( - *part.data_part, prewhere_info ? prewhere_info->prewhere_actions->getRequiredColumnsNames() : column_names_); - total_marks += part.getMarksCount(); - } + const auto & settings = context_->getSettingsRef(); - if (total_marks) - { - const auto min_bytes_per_task = settings.merge_tree_min_bytes_per_task_for_remote_reading; - const auto avg_mark_bytes = std::max(total_compressed_bytes / total_marks, 1); - /// We're taking min here because number of tasks shouldn't be too low - it will make task stealing impossible. - const auto heuristic_min_marks = std::min(total_marks / threads_ / 8, min_bytes_per_task / avg_mark_bytes); - if (heuristic_min_marks > min_marks_for_concurrent_read) + size_t total_compressed_bytes = 0; + size_t total_marks = 0; + for (const auto & part : parts_ranges) { - min_marks_for_concurrent_read = heuristic_min_marks; + total_compressed_bytes += getApproxSizeOfPart( + *part.data_part, prewhere_info ? prewhere_info->prewhere_actions->getRequiredColumnsNames() : column_names_); + total_marks += part.getMarksCount(); + } + + if (total_marks) + { + const auto min_bytes_per_task = settings.merge_tree_min_bytes_per_task_for_remote_reading; + const auto avg_mark_bytes = std::max(total_compressed_bytes / total_marks, 1); + /// We're taking min here because number of tasks shouldn't be too low - it will make task stealing impossible. + const auto heuristic_min_marks = std::min(total_marks / threads_, min_bytes_per_task / avg_mark_bytes); + if (heuristic_min_marks > min_marks_for_concurrent_read) + { + min_marks_for_concurrent_read = heuristic_min_marks; + } } } From 963d6be120da9c00c583cec2a051f9386de47b0e Mon Sep 17 00:00:00 2001 From: alekseygolub Date: Thu, 4 May 2023 16:44:08 +0000 Subject: [PATCH 0056/1072] Added configurations for DatabaseS3 --- src/Databases/DatabaseFactory.cpp | 13 +- src/Databases/DatabaseS3.cpp | 159 +++++++++++++++--- src/Databases/DatabaseS3.h | 20 ++- tests/config/config.d/named_collection.xml | 5 + .../0_stateless/02724_database_s3.reference | 12 ++ .../queries/0_stateless/02724_database_s3.sh | 28 ++- 6 files changed, 202 insertions(+), 35 deletions(-) diff --git a/src/Databases/DatabaseFactory.cpp b/src/Databases/DatabaseFactory.cpp index 5c4256c8a9f..41ca1de6a0e 100644 --- a/src/Databases/DatabaseFactory.cpp +++ b/src/Databases/DatabaseFactory.cpp @@ -466,20 +466,15 @@ DatabasePtr DatabaseFactory::getImpl(const ASTCreateQuery & create, const String { const ASTFunction * engine = engine_define->engine; - std::string key_id; - std::string secret_key; + DatabaseS3::Configuration config; if (engine->arguments && !engine->arguments->children.empty()) { - if (engine->arguments->children.size() != 2) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "S3 database requires 0 or 2 argument: [access_key_id, secret_access_key]"); - - const auto & arguments = engine->arguments->children; - key_id = safeGetLiteralValue(arguments[0], engine_name); - secret_key = safeGetLiteralValue(arguments[1], engine_name); + ASTs & engine_args = engine->arguments->children; + config = DatabaseS3::parseArguments(engine_args, context); } - return std::make_shared(database_name, key_id, secret_key, context); + return std::make_shared(database_name, config, context); } #endif diff --git a/src/Databases/DatabaseS3.cpp b/src/Databases/DatabaseS3.cpp index d4412ba7973..f4aafc5d03a 100644 --- a/src/Databases/DatabaseS3.cpp +++ b/src/Databases/DatabaseS3.cpp @@ -4,19 +4,36 @@ #include -#include #include +#include +#include #include #include #include -#include #include +#include +#include #include +#include #include +#include +#include + +#include "DatabaseS3.h" + +namespace fs = std::filesystem; + namespace DB { +static const std::unordered_set optional_configuration_keys = { + "url", + "access_key_id", + "secret_access_key", + "no_sign_request" +}; + namespace ErrorCodes { extern const int LOGICAL_ERROR; @@ -25,13 +42,14 @@ namespace ErrorCodes extern const int FILE_DOESNT_EXIST; extern const int UNACCEPTABLE_URL; extern const int S3_ERROR; + + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; } -DatabaseS3::DatabaseS3(const String & name_, const String & key_id, const String & secret_key, ContextPtr context_) +DatabaseS3::DatabaseS3(const String & name_, const Configuration& config_, ContextPtr context_) : IDatabase(name_) , WithContext(context_->getGlobalContext()) - , access_key_id(key_id) - , secret_access_key(secret_key) + , config(config_) , log(&Poco::Logger::get("DatabaseS3(" + name_ + ")")) { } @@ -49,6 +67,20 @@ void DatabaseS3::addTable(const std::string & table_name, StoragePtr table_stora getEngineName()); } +std::string DatabaseS3::getFullUrl(const std::string & name) const +{ + try + { + S3::URI uri(name); + } + catch (...) + { + return (fs::path(config.url_prefix) / name).string(); + } + + return name; +} + bool DatabaseS3::checkUrl(const std::string & url, ContextPtr context_, bool throw_on_error) const { try @@ -71,36 +103,49 @@ bool DatabaseS3::isTableExist(const String & name, ContextPtr context_) const if (loaded_tables.find(name) != loaded_tables.end()) return true; - return checkUrl(name, context_, false); + return checkUrl(getFullUrl(name), context_, false); } -StoragePtr DatabaseS3::getTableImpl(const String & url, ContextPtr context_) const +StoragePtr DatabaseS3::getTableImpl(const String & name, ContextPtr context_) const { // Check if the table exists in the loaded tables map { std::lock_guard lock(mutex); - auto it = loaded_tables.find(url); + auto it = loaded_tables.find(name); if (it != loaded_tables.end()) return it->second; } + auto url = getFullUrl(name); + checkUrl(url, context_, true); // call TableFunctionS3 - auto args = makeASTFunction( - "s3", - std::make_shared(url), - std::make_shared(access_key_id), - std::make_shared(secret_access_key)); + auto function = std::make_shared(); - auto table_function = TableFunctionFactory::instance().get(args, context_); + function->name = "s3"; + function->arguments = std::make_shared(); + function->children.push_back(function->arguments); + + function->arguments->children.push_back(std::make_shared(url)); + if (config.no_sign_request) + { + function->arguments->children.push_back(std::make_shared("NOSIGN")); + } + else if (config.access_key_id.has_value() && config.secret_access_key.has_value()) + { + function->arguments->children.push_back(std::make_shared(config.access_key_id.value())); + function->arguments->children.push_back(std::make_shared(config.secret_access_key.value())); + } + + auto table_function = TableFunctionFactory::instance().get(function, context_); if (!table_function) return nullptr; // TableFunctionS3 throws exceptions, if table cannot be created - auto table_storage = table_function->execute(args, context_, url); + auto table_storage = table_function->execute(function, context_, name); if (table_storage) - addTable(url, table_storage); + addTable(name, table_storage); return table_storage; } @@ -143,10 +188,14 @@ ASTPtr DatabaseS3::getCreateDatabaseQuery() const auto settings = getContext()->getSettingsRef(); ParserCreateQuery parser; - const String query = fmt::format("CREATE DATABASE {} ENGINE = S3('{}', '{}')", - backQuoteIfNeed(getDatabaseName()), - access_key_id, - secret_access_key); + std::string creation_args; + creation_args += fmt::format("'{}'", config.url_prefix); + if (config.no_sign_request) + creation_args += ", 'NOSIGN'"; + else if (config.access_key_id.has_value() && config.secret_access_key.has_value()) + creation_args += fmt::format(", '{}', '{}'", config.access_key_id.value(), config.secret_access_key.value()); + + const String query = fmt::format("CREATE DATABASE {} ENGINE = S3({})", backQuoteIfNeed(getDatabaseName()), creation_args); ASTPtr ast = parseQuery(parser, query.data(), query.data() + query.size(), "", 0, settings.max_parser_depth); if (const auto database_comment = getDatabaseComment(); !database_comment.empty()) @@ -176,6 +225,76 @@ void DatabaseS3::shutdown() loaded_tables.clear(); } +DatabaseS3::Configuration DatabaseS3::parseArguments(ASTs engine_args, ContextPtr context_) +{ + Configuration result; + + if (auto named_collection = tryGetNamedCollectionWithOverrides(engine_args, context_)) + { + auto & collection = *named_collection; + + validateNamedCollection(collection, {}, optional_configuration_keys); + + result.url_prefix = collection.getOrDefault("url", ""); + result.no_sign_request = collection.getOrDefault("no_sign_request", false); + + auto key_id = collection.getOrDefault("access_key_id", ""); + auto secret_key = collection.getOrDefault("secret_access_key", ""); + + if (!key_id.empty()) + result.access_key_id = key_id; + + if (!secret_key.empty()) + result.secret_access_key = secret_key; + } + else + { + auto supported_signature = + " - S3()\n" + " - S3('url')\n" + " - S3('url', 'NOSIGN')\n" + " - S3('url', 'access_key_id', 'secret_access_key')\n"; + const auto error_message = + fmt::format("Engine DatabaseS3 must have the following arguments signature\n{}", supported_signature); + + for (auto & arg : engine_args) + arg = evaluateConstantExpressionOrIdentifierAsLiteral(arg, context_); + + if (engine_args.size() > 3) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, error_message.c_str()); + + if (engine_args.empty()) + return result; + + result.url_prefix = checkAndGetLiteralArgument(engine_args[0], "url"); + + // url, NOSIGN + if (engine_args.size() == 2) + { + auto second_arg = checkAndGetLiteralArgument(engine_args[1], "NOSIGN"); + if (boost::iequals(second_arg, "NOSIGN")) + result.no_sign_request = true; + else + throw Exception(ErrorCodes::BAD_ARGUMENTS, error_message.c_str()); + } + + // url, access_key_id, secret_access_key + if (engine_args.size() == 3) + { + auto key_id = checkAndGetLiteralArgument(engine_args[1], "access_key_id"); + auto secret_key = checkAndGetLiteralArgument(engine_args[2], "secret_access_key"); + + if (key_id.empty() || secret_key.empty() || boost::iequals(key_id, "NOSIGN")) + throw Exception(ErrorCodes::BAD_ARGUMENTS, error_message.c_str()); + + result.access_key_id = key_id; + result.secret_access_key = secret_key; + } + } + + return result; +} + /** * Returns an empty vector because the database is read-only and no tables can be backed up */ diff --git a/src/Databases/DatabaseS3.h b/src/Databases/DatabaseS3.h index d5269e57f5a..65f80dca2ba 100644 --- a/src/Databases/DatabaseS3.h +++ b/src/Databases/DatabaseS3.h @@ -23,7 +23,17 @@ class Context; class DatabaseS3 : public IDatabase, protected WithContext { public: - DatabaseS3(const String & name, const String & key_id, const String & secret_key, ContextPtr context); + struct Configuration + { + std::string url_prefix; + + bool no_sign_request = false; + + std::optional access_key_id; + std::optional secret_access_key; + }; + + DatabaseS3(const String & name, const Configuration& config, ContextPtr context); String getEngineName() const override { return "S3"; } @@ -44,6 +54,8 @@ public: std::vector> getTablesForBackup(const FilterByNameFunction &, const ContextPtr &) const override; DatabaseTablesIteratorPtr getTablesIterator(ContextPtr, const FilterByNameFunction &) const override; + static Configuration parseArguments(ASTs engine_args, ContextPtr context); + protected: StoragePtr getTableImpl(const String & url, ContextPtr context) const; @@ -51,9 +63,11 @@ protected: bool checkUrl(const std::string & url, ContextPtr context_, bool throw_on_error) const; + std::string getFullUrl(const std::string & name) const; + private: - const String access_key_id; - const String secret_access_key; + const Configuration config; + mutable Tables loaded_tables TSA_GUARDED_BY(mutex); Poco::Logger * log; }; diff --git a/tests/config/config.d/named_collection.xml b/tests/config/config.d/named_collection.xml index 2e49c0c596f..5b716a7b8da 100644 --- a/tests/config/config.d/named_collection.xml +++ b/tests/config/config.d/named_collection.xml @@ -32,5 +32,10 @@ testtest auto + + http://localhost:11111/test/ + test + testtest + diff --git a/tests/queries/0_stateless/02724_database_s3.reference b/tests/queries/0_stateless/02724_database_s3.reference index 72ba0e240b1..811e38b7f2b 100644 --- a/tests/queries/0_stateless/02724_database_s3.reference +++ b/tests/queries/0_stateless/02724_database_s3.reference @@ -8,6 +8,18 @@ test1 13 14 15 16 17 18 0 0 0 +10 11 12 +13 14 15 +16 17 18 +0 0 0 +1 2 3 +4 5 6 +7 8 9 +0 0 0 +10 11 12 +13 14 15 +16 17 18 +0 0 0 Test 2: check exceptions OK OK diff --git a/tests/queries/0_stateless/02724_database_s3.sh b/tests/queries/0_stateless/02724_database_s3.sh index 2758580a355..ac1b97beecf 100755 --- a/tests/queries/0_stateless/02724_database_s3.sh +++ b/tests/queries/0_stateless/02724_database_s3.sh @@ -15,15 +15,35 @@ USE test1; SELECT * FROM \"http://localhost:11111/test/a.tsv\" """ ${CLICKHOUSE_CLIENT} -q "SHOW DATABASES;" | grep test1 -${CLICKHOUSE_CLIENT} -q "DROP DATABASE test1;" +# check credentials with absolute path ${CLICKHOUSE_CLIENT} --multiline --multiquery -q """ DROP DATABASE IF EXISTS test2; -CREATE DATABASE test2 ENGINE = S3('test', 'testtest'); +CREATE DATABASE test2 ENGINE = S3('', 'test', 'testtest'); USE test2; SELECT * FROM \"http://localhost:11111/test/b.tsv\" """ -${CLICKHOUSE_CLIENT} -q "DROP DATABASE test2;" + +# check credentials with relative path +${CLICKHOUSE_CLIENT} --multiline --multiquery -q """ +DROP DATABASE IF EXISTS test4; +CREATE DATABASE test4 ENGINE = S3('http://localhost:11111/test', 'test', 'testtest'); +USE test4; +SELECT * FROM \"b.tsv\" +""" + +# check that database url_prefix is ignored if pass full url as table name +${CLICKHOUSE_CLIENT} --multiline --multiquery -q """ +USE test4; +SELECT * FROM \"http://localhost:11111/test/a.tsv\" +""" + +# Check named collection loading +${CLICKHOUSE_CLIENT} --multiline --multiquery -q """ +DROP DATABASE IF EXISTS test5; +CREATE DATABASE test5 ENGINE = S3(s3_conn_db); +SELECT * FROM test5.\`b.tsv\` +""" ################# echo "Test 2: check exceptions" @@ -44,4 +64,6 @@ ${CLICKHOUSE_CLIENT} --multiline --multiquery -q """ DROP DATABASE IF EXISTS test1; DROP DATABASE IF EXISTS test2; DROP DATABASE IF EXISTS test3; +DROP DATABASE IF EXISTS test4; +DROP DATABASE IF EXISTS test5; """ From f083372c0cd5618b07873e886078ac86aecf54cd Mon Sep 17 00:00:00 2001 From: alekseygolub Date: Thu, 4 May 2023 16:46:51 +0000 Subject: [PATCH 0057/1072] remove extra include --- src/Databases/DatabaseS3.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/Databases/DatabaseS3.cpp b/src/Databases/DatabaseS3.cpp index f4aafc5d03a..5529f582572 100644 --- a/src/Databases/DatabaseS3.cpp +++ b/src/Databases/DatabaseS3.cpp @@ -20,8 +20,6 @@ #include #include -#include "DatabaseS3.h" - namespace fs = std::filesystem; namespace DB From 814a3f04cd421c991e6976fa314f13e96e96069f Mon Sep 17 00:00:00 2001 From: alekseygolub Date: Thu, 4 May 2023 17:12:35 +0000 Subject: [PATCH 0058/1072] fix style --- src/Databases/DatabaseS3.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Databases/DatabaseS3.cpp b/src/Databases/DatabaseS3.cpp index 5529f582572..bc318ecd9bf 100644 --- a/src/Databases/DatabaseS3.cpp +++ b/src/Databases/DatabaseS3.cpp @@ -253,7 +253,7 @@ DatabaseS3::Configuration DatabaseS3::parseArguments(ASTs engine_args, ContextPt " - S3('url', 'NOSIGN')\n" " - S3('url', 'access_key_id', 'secret_access_key')\n"; const auto error_message = - fmt::format("Engine DatabaseS3 must have the following arguments signature\n{}", supported_signature); + fmt::format("Engine DatabaseS3 must have the following arguments signature\n{}", supported_signature); for (auto & arg : engine_args) arg = evaluateConstantExpressionOrIdentifierAsLiteral(arg, context_); From 367583b96ea53a5203163a90e49581b2bdf225de Mon Sep 17 00:00:00 2001 From: alekseygolub Date: Thu, 4 May 2023 17:38:41 +0000 Subject: [PATCH 0059/1072] retriger checks From e1151f150f23e0bbcb52ae0a1a3ef01a0ecb97da Mon Sep 17 00:00:00 2001 From: alekseygolub Date: Fri, 5 May 2023 18:37:25 +0000 Subject: [PATCH 0060/1072] Fix clang build errors --- src/Databases/DatabaseS3.cpp | 2 +- src/Databases/DatabaseS3.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Databases/DatabaseS3.cpp b/src/Databases/DatabaseS3.cpp index bc318ecd9bf..96616426475 100644 --- a/src/Databases/DatabaseS3.cpp +++ b/src/Databases/DatabaseS3.cpp @@ -247,7 +247,7 @@ DatabaseS3::Configuration DatabaseS3::parseArguments(ASTs engine_args, ContextPt } else { - auto supported_signature = + const std::string supported_signature = " - S3()\n" " - S3('url')\n" " - S3('url', 'NOSIGN')\n" diff --git a/src/Databases/DatabaseS3.h b/src/Databases/DatabaseS3.h index 65f80dca2ba..4e6910566df 100644 --- a/src/Databases/DatabaseS3.h +++ b/src/Databases/DatabaseS3.h @@ -57,7 +57,7 @@ public: static Configuration parseArguments(ASTs engine_args, ContextPtr context); protected: - StoragePtr getTableImpl(const String & url, ContextPtr context) const; + StoragePtr getTableImpl(const String & name, ContextPtr context) const; void addTable(const std::string & table_name, StoragePtr table_storage) const; From 18d1a4356d2ba1e7502d0ba207e6ac8f53fc3e02 Mon Sep 17 00:00:00 2001 From: tpanetti Date: Fri, 5 May 2023 12:19:35 -0700 Subject: [PATCH 0061/1072] Change SHOW COLUMNS query to display MySQL types in MySQL Compatibility mode This updates the SHOW COLUMN SQL query to display MySQL types when this query is issued by a client connected via MySQL Compatibility port --- .../InterpreterShowColumnsQuery.cpp | 78 ++++++- .../InterpreterShowColumnsQuery.h | 1 + ...show_columns_mysql_compatibility.reference | 213 ++++++++++++++++++ .../02726_show_columns_mysql_compatibility.sh | 115 ++++++++++ 4 files changed, 405 insertions(+), 2 deletions(-) create mode 100644 tests/queries/0_stateless/02726_show_columns_mysql_compatibility.reference create mode 100755 tests/queries/0_stateless/02726_show_columns_mysql_compatibility.sh diff --git a/src/Interpreters/InterpreterShowColumnsQuery.cpp b/src/Interpreters/InterpreterShowColumnsQuery.cpp index 4474be21d8b..17ccafdd1ce 100644 --- a/src/Interpreters/InterpreterShowColumnsQuery.cpp +++ b/src/Interpreters/InterpreterShowColumnsQuery.cpp @@ -26,11 +26,17 @@ String InterpreterShowColumnsQuery::getRewrittenQuery() WriteBufferFromOwnString rewritten_query; - rewritten_query << "SELECT name AS field, type AS type, startsWith(type, 'Nullable') AS null, trim(concatWithSeparator(' ', if(is_in_primary_key, 'PRI', ''), if (is_in_sorting_key, 'SOR', ''))) AS key, if(default_kind IN ('ALIAS', 'DEFAULT', 'MATERIALIZED'), default_expression, NULL) AS default, '' AS extra "; - // TODO Interpret query.extended. It is supposed to show internal/virtual columns. Need to fetch virtual column names, see // IStorage::getVirtuals(). We can't easily do that via SQL. + // If connected via MySQL Compatibility mode, convert ClickHouse types to MySQL + if (getContext()->getClientInfo().interface == DB::ClientInfo::Interface::MYSQL) + { + rewritten_query << getMySQLQuery(); + } + else { + rewritten_query << "SELECT name AS field, type AS type, startsWith(type, 'Nullable') AS null, trim(concatWithSeparator(' ', if(is_in_primary_key, 'PRI', ''), if (is_in_sorting_key, 'SOR', ''))) AS key, if(default_kind IN ('ALIAS', 'DEFAULT', 'MATERIALIZED'), default_expression, NULL) AS default, '' AS extra "; + } if (query.full) { /// "Full" mode is mostly for MySQL compat @@ -93,6 +99,74 @@ String InterpreterShowColumnsQuery::getRewrittenQuery() } +String InterpreterShowColumnsQuery::getMySQLQuery() +{ + WriteBufferFromOwnString mysql_specific_query; + + mysql_specific_query << "SELECT name AS field, " + << "CASE " + << " WHEN startsWith(type, 'Nullable') THEN " + << " CASE " + << " WHEN substring(type, 10, length(type) - 10) IN ('UInt8', 'Int8') THEN 'tinyint' " + << " WHEN substring(type, 10, length(type) - 10) IN ('UInt16', 'Int16') THEN 'smallint' " + << " WHEN substring(type, 10, length(type) - 10) IN ('UInt32', 'Int32') THEN 'int' " + << " WHEN substring(type, 10, length(type) - 10) IN ('UInt64', 'Int64', 'UInt128', 'Int128', 'UInt256', 'Int256') THEN 'bigint' " + << " WHEN substring(type, 10, length(type) - 10) = 'Float32' THEN 'float' " + << " WHEN substring(type, 10, length(type) - 10) = 'Float64' THEN 'double' " + << " WHEN substring(type, 10, length(type) - 10) LIKE 'Decimal%' THEN 'decimal' " + << " WHEN substring(type, 10, length(type) - 10) = 'Boolean' THEN 'tinyint' " + << " WHEN substring(type, 10, length(type) - 10) = 'String' THEN 'text' " + << " WHEN substring(type, 10, length(type) - 10) LIKE 'FixedString%' THEN 'text' " + << " WHEN substring(type, 10, length(type) - 10) LIKE 'Date%' THEN 'date' " + << " WHEN substring(type, 10, length(type) - 10) LIKE 'DateTime%' THEN 'datetime' " + << " WHEN substring(type, 10, length(type) - 10) = 'JSON' THEN 'json' " + << " WHEN substring(type, 10, length(type) - 10) = 'UUID' THEN 'binary' " + << " WHEN substring(type, 10, length(type) - 10) LIKE 'Enum%' THEN 'enum' " + << " WHEN substring(type, 10, length(type) - 10) LIKE 'LowCardinality%' THEN 'text' " + << " WHEN substring(type, 10, length(type) - 10) LIKE 'Array%' THEN 'json' " + << " WHEN substring(type, 10, length(type) - 10) LIKE 'Map%' THEN 'json' " + << " WHEN substring(type, 10, length(type) - 10) IN ('SimpleAggregateFunction', 'AggregateFunction') THEN 'text' " + << " WHEN substring(type, 10, length(type) - 10) = 'Nested' THEN 'json' " + << " WHEN substring(type, 10, length(type) - 10) LIKE 'Tuple%' THEN 'json' " + << " WHEN substring(type, 10, length(type) - 10) LIKE 'IPv%' THEN 'text' " + << " WHEN substring(type, 10, length(type) - 10) IN ('Expression', 'Set', 'Nothing', 'Interval') THEN 'text' " + << " ELSE substring(type, 10, length(type) - 10) " + << " END " + << " ELSE " + << " CASE " + << " WHEN type IN ('UInt8', 'Int8') THEN 'tinyint' " + << " WHEN type IN ('UInt16', 'Int16') THEN 'smallint' " + << " WHEN type IN ('UInt32', 'Int32') THEN 'int' " + << " WHEN type IN ('UInt64', 'Int64', 'UInt128', 'Int128', 'UInt256', 'Int256') THEN 'bigint' " + << " WHEN type = 'Float32' THEN 'float' " + << " WHEN type = 'Float64' THEN 'double' " + << " WHEN type LIKE 'Decimal%' THEN 'decimal' " + << " WHEN type = 'Boolean' THEN 'tinyint' " + << " WHEN type = 'String' THEN 'text' " + << " WHEN type LIKE 'FixedString%' THEN 'text' " + << " WHEN type LIKE 'Date%' THEN 'date' " + << " WHEN type LIKE 'DateTime%' THEN 'datetime' " + << " WHEN type = 'JSON' THEN 'json' " + << " WHEN type = 'UUID' THEN 'binary' " + << " WHEN type LIKE 'Enum%' THEN 'enum' " + << " WHEN type LIKE 'LowCardinality%' THEN 'text' " + << " WHEN type LIKE 'Array%' THEN 'json' " + << " WHEN type LIKE 'Map%' THEN 'json' " + << " WHEN type IN ('SimpleAggregateFunction', 'AggregateFunction') THEN 'text' " + << " WHEN type = 'Nested' THEN 'json' " + << " WHEN type LIKE 'Tuple%' THEN 'json' " + << " WHEN type LIKE 'IPv%' THEN 'text' " + << " WHEN type IN ('Expression', 'Set', 'Nothing', 'Interval') THEN 'text' " + << " ELSE type " + << " END " + << "END AS type, " + << "startsWith(type, 'Nullable') AS null, " + << "trim(concatWithSeparator(' ', if(is_in_primary_key, 'PRI', ''), if (is_in_sorting_key, 'SOR', ''))) AS key, " + << "if(default_kind IN ('ALIAS', 'DEFAULT', 'MATERIALIZED'), default_expression, NULL) AS default, " + << "'' AS extra "; + + return mysql_specific_query.str(); +} BlockIO InterpreterShowColumnsQuery::execute() { diff --git a/src/Interpreters/InterpreterShowColumnsQuery.h b/src/Interpreters/InterpreterShowColumnsQuery.h index ee6dcabd97b..b843a163978 100644 --- a/src/Interpreters/InterpreterShowColumnsQuery.h +++ b/src/Interpreters/InterpreterShowColumnsQuery.h @@ -26,6 +26,7 @@ private: ASTPtr query_ptr; String getRewrittenQuery(); + String getMySQLQuery(); }; diff --git a/tests/queries/0_stateless/02726_show_columns_mysql_compatibility.reference b/tests/queries/0_stateless/02726_show_columns_mysql_compatibility.reference new file mode 100644 index 00000000000..c9ad94a34c4 --- /dev/null +++ b/tests/queries/0_stateless/02726_show_columns_mysql_compatibility.reference @@ -0,0 +1,213 @@ +Drop tables if they exist +Create tab table +Create pseudo-random database name +Create tab duplicate table +Run MySQL test +field type null key default extra +array_value json 0 NULL +boolean_value tinyint 0 NULL +date32_value date 0 NULL +date_value date 0 NULL +datetime64_value date 0 NULL +datetime_value date 0 NULL +decimal_value decimal 0 NULL +enum_value enum 0 NULL +fixed_string_value text 0 NULL +float32 float 0 NULL +float64 double 0 NULL +int32 int 0 NULL +ipv4_value text 0 NULL +ipv6_value text 0 NULL +json_value text 0 NULL +low_cardinality text 0 NULL +map_value json 0 NULL +nested.nested_int json 0 NULL +nested.nested_string json 0 NULL +nullable_value int 0 NULL +string_value text 0 NULL +tuple_value json 0 NULL +uint64 bigint 0 PRI SOR NULL +uuid_value binary 0 NULL +field type null key default extra +array_value json 0 NULL +boolean_value tinyint 0 NULL +date32_value date 0 NULL +date_value date 0 NULL +datetime64_value date 0 NULL +datetime_value date 0 NULL +decimal_value decimal 0 NULL +enum_value enum 0 NULL +fixed_string_value text 0 NULL +float32 float 0 NULL +float64 double 0 NULL +int32 int 0 NULL +ipv4_value text 0 NULL +ipv6_value text 0 NULL +json_value text 0 NULL +low_cardinality text 0 NULL +map_value json 0 NULL +nested.nested_int json 0 NULL +nested.nested_string json 0 NULL +nullable_value int 0 NULL +string_value text 0 NULL +tuple_value json 0 NULL +uint64 bigint 0 PRI SOR NULL +uuid_value binary 0 NULL +field type null key default extra collation comment privileges +array_value json 0 NULL NULL +boolean_value tinyint 0 NULL NULL +date32_value date 0 NULL NULL +date_value date 0 NULL NULL +datetime64_value date 0 NULL NULL +datetime_value date 0 NULL NULL +decimal_value decimal 0 NULL NULL +enum_value enum 0 NULL NULL +fixed_string_value text 0 NULL NULL +float32 float 0 NULL NULL +float64 double 0 NULL NULL +int32 int 0 NULL NULL +ipv4_value text 0 NULL NULL +ipv6_value text 0 NULL NULL +json_value text 0 NULL NULL +low_cardinality text 0 NULL NULL +map_value json 0 NULL NULL +nested.nested_int json 0 NULL NULL +nested.nested_string json 0 NULL NULL +nullable_value int 0 NULL NULL +string_value text 0 NULL NULL +tuple_value json 0 NULL NULL +uint64 bigint 0 PRI SOR NULL NULL +uuid_value binary 0 NULL NULL +field type null key default extra +int32 int 0 NULL +nested.nested_int json 0 NULL +uint64 bigint 0 PRI SOR NULL +field type null key default extra +array_value json 0 NULL +boolean_value tinyint 0 NULL +date32_value date 0 NULL +date_value date 0 NULL +datetime64_value date 0 NULL +datetime_value date 0 NULL +decimal_value decimal 0 NULL +enum_value enum 0 NULL +fixed_string_value text 0 NULL +float32 float 0 NULL +float64 double 0 NULL +ipv4_value text 0 NULL +ipv6_value text 0 NULL +json_value text 0 NULL +low_cardinality text 0 NULL +map_value json 0 NULL +nested.nested_string json 0 NULL +nullable_value int 0 NULL +string_value text 0 NULL +tuple_value json 0 NULL +uuid_value binary 0 NULL +field type null key default extra +int32 int 0 NULL +nested.nested_int json 0 NULL +uint64 bigint 0 PRI SOR NULL +field type null key default extra +array_value json 0 NULL +boolean_value tinyint 0 NULL +date32_value date 0 NULL +date_value date 0 NULL +datetime64_value date 0 NULL +datetime_value date 0 NULL +decimal_value decimal 0 NULL +enum_value enum 0 NULL +fixed_string_value text 0 NULL +float32 float 0 NULL +float64 double 0 NULL +ipv4_value text 0 NULL +ipv6_value text 0 NULL +json_value text 0 NULL +low_cardinality text 0 NULL +map_value json 0 NULL +nested.nested_string json 0 NULL +nullable_value int 0 NULL +string_value text 0 NULL +tuple_value json 0 NULL +uuid_value binary 0 NULL +field type null key default extra +int32 int 0 NULL +nested.nested_int json 0 NULL +uint64 bigint 0 PRI SOR NULL +field type null key default extra +array_value json 0 NULL +field type null key default extra +array_value json 0 NULL +boolean_value tinyint 0 NULL +date32_value date 0 NULL +date_value date 0 NULL +datetime64_value date 0 NULL +datetime_value date 0 NULL +decimal_value decimal 0 NULL +enum_value enum 0 NULL +fixed_string_value text 0 NULL +float32 float 0 NULL +float64 double 0 NULL +int32 int 0 NULL +ipv4_value text 0 NULL +ipv6_value text 0 NULL +json_value text 0 NULL +low_cardinality text 0 NULL +map_value json 0 NULL +nested.nested_int json 0 NULL +nested.nested_string json 0 NULL +nullable_value int 0 NULL +string_value text 0 NULL +tuple_value json 0 NULL +uint64 bigint 0 PRI SOR NULL +uuid_value binary 0 NULL +field type null key default extra +array_value json 0 NULL +boolean_value tinyint 0 NULL +date32_value date 0 NULL +date_value date 0 NULL +datetime64_value date 0 NULL +datetime_value date 0 NULL +decimal_value decimal 0 NULL +enum_value enum 0 NULL +fixed_string_value text 0 NULL +float32 float 0 NULL +float64 double 0 NULL +int32 int 0 NULL +ipv4_value text 0 NULL +ipv6_value text 0 NULL +json_value text 0 NULL +low_cardinality text 0 NULL +map_value json 0 NULL +nested.nested_int json 0 NULL +nested.nested_string json 0 NULL +nullable_value int 0 NULL +string_value text 0 NULL +tuple_value json 0 NULL +uint64 bigint 0 PRI SOR NULL +uuid_value binary 0 NULL +field type null key default extra +array_value json 0 NULL +boolean_value tinyint 0 NULL +date32_value date 0 NULL +date_value date 0 NULL +datetime64_value date 0 NULL +datetime_value date 0 NULL +decimal_value decimal 0 NULL +enum_value enum 0 NULL +fixed_string_value text 0 NULL +float32 float 0 NULL +float64 double 0 NULL +int32 int 0 NULL +ipv4_value text 0 NULL +ipv6_value text 0 NULL +json_value text 0 NULL +low_cardinality text 0 NULL +map_value json 0 NULL +nested.nested_int json 0 NULL +nested.nested_string json 0 NULL +nullable_value int 0 NULL +string_value text 0 NULL +tuple_value json 0 NULL +uint64 bigint 0 PRI SOR NULL +uuid_value binary 0 NULL diff --git a/tests/queries/0_stateless/02726_show_columns_mysql_compatibility.sh b/tests/queries/0_stateless/02726_show_columns_mysql_compatibility.sh new file mode 100755 index 00000000000..5324496edd3 --- /dev/null +++ b/tests/queries/0_stateless/02726_show_columns_mysql_compatibility.sh @@ -0,0 +1,115 @@ +#!/bin/bash + +# This script tests the MySQL compatibility of the SHOW COLUMNS command in ClickHouse +USER="default" +PASSWORD="" +HOST="127.0.0.1" +PORT=9004 + +# First run the clickhouse test to create the ClickHouse Tables + +echo "Drop tables if they exist" +${CLICKHOUSE_LOCAL} --query "DROP TABLE IF EXISTS tab" +${CLICKHOUSE_LOCAL} --query "DROP TABLE IF EXISTS database_123456789abcde" +${CLICKHOUSE_LOCAL} --query "DROP TABLE IF EXISTS database_123456789abcde.tab" + +echo "Create tab table " +${CLICKHOUSE_LOCAL} --query " + CREATE TABLE tab + ( + uint64 UInt64, + int32 Nullable(Int32), + float32 Float32, + float64 Float64, + decimal_value Decimal(10, 2), + boolean_value UInt8, -- Use 0 for false, 1 for true + string_value String, + fixed_string_value FixedString(10), + date_value Date, + date32_value Date32, + datetime_value DateTime, + datetime64_value DateTime64(3), + json_value String, -- Store JSON as a string + uuid_value UUID, + enum_value Enum8('apple' = 1, 'banana' = 2, 'orange' = 3), + low_cardinality LowCardinality(String), + array_value Array(Int32), + map_value Map(String, Int32), + tuple_value Tuple(Int32, String), + nullable_value Nullable(Int32), + ipv4_value IPv4, + ipv6_value IPv6, + nested Nested + ( + nested_int Int32, + nested_string String + ) + ) ENGINE = MergeTree + ORDER BY uint64; + " + + +echo "Create pseudo-random database name" +${CLICKHOUSE_LOCAL} --query "CREATE DATABASE database_123456789abcde;" + +echo "Create tab duplicate table" +${CLICKHOUSE_LOCAL} --query " + CREATE TABLE database_123456789abcde.tab + ( + uint64 UInt64, + int32 Nullable(Int32), + float32 Float32, + float64 Float64, + decimal_value Decimal(10, 2), + boolean_value UInt8, -- Use 0 for false, 1 for true + string_value String, + fixed_string_value FixedString(10), + date_value Date, + date32_value Date32, + datetime_value DateTime, + datetime64_value DateTime64(3), + json_value String, -- Store JSON as a string + uuid_value UUID, + enum_value Enum8('apple' = 1, 'banana' = 2, 'orange' = 3), + low_cardinality LowCardinality(String), + array_value Array(Int32), + map_value Map(String, Int32), + tuple_value Tuple(Int32, String), + nullable_value Nullable(Int32), + ipv4_value IPv4, + ipv6_value IPv6, + nested Nested + ( + nested_int Int32, + nested_string String + ) + ) ENGINE = MergeTree + ORDER BY uint64; + " + +# Write sql to temp file +TEMP_FILE=$(mktemp) + +cat < $TEMP_FILE +SHOW COLUMNS FROM tab; +SHOW EXTENDED COLUMNS FROM tab; +SHOW FULL COLUMNS FROM tab; +SHOW COLUMNS FROM tab LIKE '%int%'; +SHOW COLUMNS FROM tab NOT LIKE '%int%'; +SHOW COLUMNS FROM tab ILIKE '%INT%'; +SHOW COLUMNS FROM tab NOT ILIKE '%INT%'; +SHOW COLUMNS FROM tab WHERE field LIKE '%int%'; +SHOW COLUMNS FROM tab LIMIT 1; +SHOW COLUMNS FROM tab; +SHOW COLUMNS FROM tab FROM database_123456789abcde; +SHOW COLUMNS FROM database_123456789abcde.tab; +DROP DATABASE database_123456789abcde; +DROP TABLE tab; +EOT + +# Now run the MySQL test script on the ClickHouse DB +echo "Run MySQL test" +mysql --user="$USER" --password="$PASSWORD" --host="$HOST" --port="$PORT" < $TEMP_FILE + +# Clean up the temp file +rm $TEMP_FILE From ddbad79c5e67518acebbacaad5be0cad3967ac67 Mon Sep 17 00:00:00 2001 From: tpanetti Date: Fri, 5 May 2023 12:19:35 -0700 Subject: [PATCH 0062/1072] Change SHOW COLUMNS query to display MySQL types in MySQL Compatibility mode This updates the SHOW COLUMN SQL query to display MySQL types when this query is issued by a client connected via MySQL Compatibility port --- .../InterpreterShowColumnsQuery.cpp | 76 +++++++ .../InterpreterShowColumnsQuery.h | 1 + ...show_columns_mysql_compatibility.reference | 213 ++++++++++++++++++ .../02726_show_columns_mysql_compatibility.sh | 115 ++++++++++ 4 files changed, 405 insertions(+) create mode 100644 tests/queries/0_stateless/02726_show_columns_mysql_compatibility.reference create mode 100755 tests/queries/0_stateless/02726_show_columns_mysql_compatibility.sh diff --git a/src/Interpreters/InterpreterShowColumnsQuery.cpp b/src/Interpreters/InterpreterShowColumnsQuery.cpp index c86d3c753c4..c545c621abb 100644 --- a/src/Interpreters/InterpreterShowColumnsQuery.cpp +++ b/src/Interpreters/InterpreterShowColumnsQuery.cpp @@ -45,6 +45,14 @@ SELECT // TODO Interpret query.extended. It is supposed to show internal/virtual columns. Need to fetch virtual column names, see // IStorage::getVirtuals(). We can't easily do that via SQL. + // If connected via MySQL Compatibility mode, convert ClickHouse types to MySQL + if (getContext()->getClientInfo().interface == DB::ClientInfo::Interface::MYSQL) + { + rewritten_query += getMySQLQuery(); + } + else { + rewritten_query += "SELECT name AS field, type AS type, startsWith(type, 'Nullable') AS null, trim(concatWithSeparator(' ', if(is_in_primary_key, 'PRI', ''), if (is_in_sorting_key, 'SOR', ''))) AS key, if(default_kind IN ('ALIAS', 'DEFAULT', 'MATERIALIZED'), default_expression, NULL) AS default, '' AS extra "; + } if (query.full) { /// "Full" mode is mostly for MySQL compat @@ -88,6 +96,74 @@ WHERE return rewritten_query; } +String InterpreterShowColumnsQuery::getMySQLQuery() +{ + String mysql_specific_query; + + mysql_specific_query = R"(SELECT name AS field, + CASE + WHEN startsWith(type, 'Nullable') THEN + CASE + WHEN substring(type, 10, length(type) - 10) IN ('UInt8', 'Int8') THEN 'tinyint' + WHEN substring(type, 10, length(type) - 10) IN ('UInt16', 'Int16') THEN 'smallint' + WHEN substring(type, 10, length(type) - 10) IN ('UInt32', 'Int32') THEN 'int' + WHEN substring(type, 10, length(type) - 10) IN ('UInt64', 'Int64', 'UInt128', 'Int128', 'UInt256', 'Int256') THEN 'bigint' + WHEN substring(type, 10, length(type) - 10) = 'Float32' THEN 'float' + WHEN substring(type, 10, length(type) - 10) = 'Float64' THEN 'double' + WHEN substring(type, 10, length(type) - 10) LIKE 'Decimal%' THEN 'decimal' + WHEN substring(type, 10, length(type) - 10) = 'Boolean' THEN 'tinyint' + WHEN substring(type, 10, length(type) - 10) = 'String' THEN 'text' + WHEN substring(type, 10, length(type) - 10) LIKE 'FixedString%' THEN 'text' + WHEN substring(type, 10, length(type) - 10) LIKE 'Date%' THEN 'date' + WHEN substring(type, 10, length(type) - 10) LIKE 'DateTime%' THEN 'datetime' + WHEN substring(type, 10, length(type) - 10) = 'JSON' THEN 'json' + WHEN substring(type, 10, length(type) - 10) = 'UUID' THEN 'binary' + WHEN substring(type, 10, length(type) - 10) LIKE 'Enum%' THEN 'enum' + WHEN substring(type, 10, length(type) - 10) LIKE 'LowCardinality%' THEN 'text' + WHEN substring(type, 10, length(type) - 10) LIKE 'Array%' THEN 'json' + WHEN substring(type, 10, length(type) - 10) LIKE 'Map%' THEN 'json' + WHEN substring(type, 10, length(type) - 10) IN ('SimpleAggregateFunction', 'AggregateFunction') THEN 'text' + WHEN substring(type, 10, length(type) - 10) = 'Nested' THEN 'json' + WHEN substring(type, 10, length(type) - 10) LIKE 'Tuple%' THEN 'json' + WHEN substring(type, 10, length(type) - 10) LIKE 'IPv%' THEN 'text' + WHEN substring(type, 10, length(type) - 10) IN ('Expression', 'Set', 'Nothing', 'Interval') THEN 'text' + ELSE substring(type, 10, length(type) - 10) + END + ELSE + CASE + WHEN type IN ('UInt8', 'Int8') THEN 'tinyint' + WHEN type IN ('UInt16', 'Int16') THEN 'smallint' + WHEN type IN ('UInt32', 'Int32') THEN 'int' + WHEN type IN ('UInt64', 'Int64', 'UInt128', 'Int128', 'UInt256', 'Int256') THEN 'bigint' + WHEN type = 'Float32' THEN 'float' + WHEN type = 'Float64' THEN 'double' + WHEN type LIKE 'Decimal%' THEN 'decimal' + WHEN type = 'Boolean' THEN 'tinyint' + WHEN type = 'String' THEN 'text' + WHEN type LIKE 'FixedString%' THEN 'text' + WHEN type LIKE 'Date%' THEN 'date' + WHEN type LIKE 'DateTime%' THEN 'datetime' + WHEN type = 'JSON' THEN 'json' + WHEN type = 'UUID' THEN 'binary' + WHEN type LIKE 'Enum%' THEN 'enum' + WHEN type LIKE 'LowCardinality%' THEN 'text' + WHEN type LIKE 'Array%' THEN 'json' + WHEN type LIKE 'Map%' THEN 'json' + WHEN type IN ('SimpleAggregateFunction', 'AggregateFunction') THEN 'text' + WHEN type = 'Nested' THEN 'json' + WHEN type LIKE 'Tuple%' THEN 'json' + WHEN type LIKE 'IPv%' THEN 'text' + WHEN type IN ('Expression', 'Set', 'Nothing', 'Interval') THEN 'text' + ELSE type + END + END AS type, + startsWith(type, 'Nullable') AS null, + trim(concatWithSeparator(' ', if(is_in_primary_key, 'PRI', ''), if (is_in_sorting_key, 'SOR', ''))) AS key, + if(default_kind IN ('ALIAS', 'DEFAULT', 'MATERIALIZED'), default_expression, NULL) AS default, + '' AS extra )"; + + return mysql_specific_query.str(); +} BlockIO InterpreterShowColumnsQuery::execute() { diff --git a/src/Interpreters/InterpreterShowColumnsQuery.h b/src/Interpreters/InterpreterShowColumnsQuery.h index ee6dcabd97b..b843a163978 100644 --- a/src/Interpreters/InterpreterShowColumnsQuery.h +++ b/src/Interpreters/InterpreterShowColumnsQuery.h @@ -26,6 +26,7 @@ private: ASTPtr query_ptr; String getRewrittenQuery(); + String getMySQLQuery(); }; diff --git a/tests/queries/0_stateless/02726_show_columns_mysql_compatibility.reference b/tests/queries/0_stateless/02726_show_columns_mysql_compatibility.reference new file mode 100644 index 00000000000..c9ad94a34c4 --- /dev/null +++ b/tests/queries/0_stateless/02726_show_columns_mysql_compatibility.reference @@ -0,0 +1,213 @@ +Drop tables if they exist +Create tab table +Create pseudo-random database name +Create tab duplicate table +Run MySQL test +field type null key default extra +array_value json 0 NULL +boolean_value tinyint 0 NULL +date32_value date 0 NULL +date_value date 0 NULL +datetime64_value date 0 NULL +datetime_value date 0 NULL +decimal_value decimal 0 NULL +enum_value enum 0 NULL +fixed_string_value text 0 NULL +float32 float 0 NULL +float64 double 0 NULL +int32 int 0 NULL +ipv4_value text 0 NULL +ipv6_value text 0 NULL +json_value text 0 NULL +low_cardinality text 0 NULL +map_value json 0 NULL +nested.nested_int json 0 NULL +nested.nested_string json 0 NULL +nullable_value int 0 NULL +string_value text 0 NULL +tuple_value json 0 NULL +uint64 bigint 0 PRI SOR NULL +uuid_value binary 0 NULL +field type null key default extra +array_value json 0 NULL +boolean_value tinyint 0 NULL +date32_value date 0 NULL +date_value date 0 NULL +datetime64_value date 0 NULL +datetime_value date 0 NULL +decimal_value decimal 0 NULL +enum_value enum 0 NULL +fixed_string_value text 0 NULL +float32 float 0 NULL +float64 double 0 NULL +int32 int 0 NULL +ipv4_value text 0 NULL +ipv6_value text 0 NULL +json_value text 0 NULL +low_cardinality text 0 NULL +map_value json 0 NULL +nested.nested_int json 0 NULL +nested.nested_string json 0 NULL +nullable_value int 0 NULL +string_value text 0 NULL +tuple_value json 0 NULL +uint64 bigint 0 PRI SOR NULL +uuid_value binary 0 NULL +field type null key default extra collation comment privileges +array_value json 0 NULL NULL +boolean_value tinyint 0 NULL NULL +date32_value date 0 NULL NULL +date_value date 0 NULL NULL +datetime64_value date 0 NULL NULL +datetime_value date 0 NULL NULL +decimal_value decimal 0 NULL NULL +enum_value enum 0 NULL NULL +fixed_string_value text 0 NULL NULL +float32 float 0 NULL NULL +float64 double 0 NULL NULL +int32 int 0 NULL NULL +ipv4_value text 0 NULL NULL +ipv6_value text 0 NULL NULL +json_value text 0 NULL NULL +low_cardinality text 0 NULL NULL +map_value json 0 NULL NULL +nested.nested_int json 0 NULL NULL +nested.nested_string json 0 NULL NULL +nullable_value int 0 NULL NULL +string_value text 0 NULL NULL +tuple_value json 0 NULL NULL +uint64 bigint 0 PRI SOR NULL NULL +uuid_value binary 0 NULL NULL +field type null key default extra +int32 int 0 NULL +nested.nested_int json 0 NULL +uint64 bigint 0 PRI SOR NULL +field type null key default extra +array_value json 0 NULL +boolean_value tinyint 0 NULL +date32_value date 0 NULL +date_value date 0 NULL +datetime64_value date 0 NULL +datetime_value date 0 NULL +decimal_value decimal 0 NULL +enum_value enum 0 NULL +fixed_string_value text 0 NULL +float32 float 0 NULL +float64 double 0 NULL +ipv4_value text 0 NULL +ipv6_value text 0 NULL +json_value text 0 NULL +low_cardinality text 0 NULL +map_value json 0 NULL +nested.nested_string json 0 NULL +nullable_value int 0 NULL +string_value text 0 NULL +tuple_value json 0 NULL +uuid_value binary 0 NULL +field type null key default extra +int32 int 0 NULL +nested.nested_int json 0 NULL +uint64 bigint 0 PRI SOR NULL +field type null key default extra +array_value json 0 NULL +boolean_value tinyint 0 NULL +date32_value date 0 NULL +date_value date 0 NULL +datetime64_value date 0 NULL +datetime_value date 0 NULL +decimal_value decimal 0 NULL +enum_value enum 0 NULL +fixed_string_value text 0 NULL +float32 float 0 NULL +float64 double 0 NULL +ipv4_value text 0 NULL +ipv6_value text 0 NULL +json_value text 0 NULL +low_cardinality text 0 NULL +map_value json 0 NULL +nested.nested_string json 0 NULL +nullable_value int 0 NULL +string_value text 0 NULL +tuple_value json 0 NULL +uuid_value binary 0 NULL +field type null key default extra +int32 int 0 NULL +nested.nested_int json 0 NULL +uint64 bigint 0 PRI SOR NULL +field type null key default extra +array_value json 0 NULL +field type null key default extra +array_value json 0 NULL +boolean_value tinyint 0 NULL +date32_value date 0 NULL +date_value date 0 NULL +datetime64_value date 0 NULL +datetime_value date 0 NULL +decimal_value decimal 0 NULL +enum_value enum 0 NULL +fixed_string_value text 0 NULL +float32 float 0 NULL +float64 double 0 NULL +int32 int 0 NULL +ipv4_value text 0 NULL +ipv6_value text 0 NULL +json_value text 0 NULL +low_cardinality text 0 NULL +map_value json 0 NULL +nested.nested_int json 0 NULL +nested.nested_string json 0 NULL +nullable_value int 0 NULL +string_value text 0 NULL +tuple_value json 0 NULL +uint64 bigint 0 PRI SOR NULL +uuid_value binary 0 NULL +field type null key default extra +array_value json 0 NULL +boolean_value tinyint 0 NULL +date32_value date 0 NULL +date_value date 0 NULL +datetime64_value date 0 NULL +datetime_value date 0 NULL +decimal_value decimal 0 NULL +enum_value enum 0 NULL +fixed_string_value text 0 NULL +float32 float 0 NULL +float64 double 0 NULL +int32 int 0 NULL +ipv4_value text 0 NULL +ipv6_value text 0 NULL +json_value text 0 NULL +low_cardinality text 0 NULL +map_value json 0 NULL +nested.nested_int json 0 NULL +nested.nested_string json 0 NULL +nullable_value int 0 NULL +string_value text 0 NULL +tuple_value json 0 NULL +uint64 bigint 0 PRI SOR NULL +uuid_value binary 0 NULL +field type null key default extra +array_value json 0 NULL +boolean_value tinyint 0 NULL +date32_value date 0 NULL +date_value date 0 NULL +datetime64_value date 0 NULL +datetime_value date 0 NULL +decimal_value decimal 0 NULL +enum_value enum 0 NULL +fixed_string_value text 0 NULL +float32 float 0 NULL +float64 double 0 NULL +int32 int 0 NULL +ipv4_value text 0 NULL +ipv6_value text 0 NULL +json_value text 0 NULL +low_cardinality text 0 NULL +map_value json 0 NULL +nested.nested_int json 0 NULL +nested.nested_string json 0 NULL +nullable_value int 0 NULL +string_value text 0 NULL +tuple_value json 0 NULL +uint64 bigint 0 PRI SOR NULL +uuid_value binary 0 NULL diff --git a/tests/queries/0_stateless/02726_show_columns_mysql_compatibility.sh b/tests/queries/0_stateless/02726_show_columns_mysql_compatibility.sh new file mode 100755 index 00000000000..5324496edd3 --- /dev/null +++ b/tests/queries/0_stateless/02726_show_columns_mysql_compatibility.sh @@ -0,0 +1,115 @@ +#!/bin/bash + +# This script tests the MySQL compatibility of the SHOW COLUMNS command in ClickHouse +USER="default" +PASSWORD="" +HOST="127.0.0.1" +PORT=9004 + +# First run the clickhouse test to create the ClickHouse Tables + +echo "Drop tables if they exist" +${CLICKHOUSE_LOCAL} --query "DROP TABLE IF EXISTS tab" +${CLICKHOUSE_LOCAL} --query "DROP TABLE IF EXISTS database_123456789abcde" +${CLICKHOUSE_LOCAL} --query "DROP TABLE IF EXISTS database_123456789abcde.tab" + +echo "Create tab table " +${CLICKHOUSE_LOCAL} --query " + CREATE TABLE tab + ( + uint64 UInt64, + int32 Nullable(Int32), + float32 Float32, + float64 Float64, + decimal_value Decimal(10, 2), + boolean_value UInt8, -- Use 0 for false, 1 for true + string_value String, + fixed_string_value FixedString(10), + date_value Date, + date32_value Date32, + datetime_value DateTime, + datetime64_value DateTime64(3), + json_value String, -- Store JSON as a string + uuid_value UUID, + enum_value Enum8('apple' = 1, 'banana' = 2, 'orange' = 3), + low_cardinality LowCardinality(String), + array_value Array(Int32), + map_value Map(String, Int32), + tuple_value Tuple(Int32, String), + nullable_value Nullable(Int32), + ipv4_value IPv4, + ipv6_value IPv6, + nested Nested + ( + nested_int Int32, + nested_string String + ) + ) ENGINE = MergeTree + ORDER BY uint64; + " + + +echo "Create pseudo-random database name" +${CLICKHOUSE_LOCAL} --query "CREATE DATABASE database_123456789abcde;" + +echo "Create tab duplicate table" +${CLICKHOUSE_LOCAL} --query " + CREATE TABLE database_123456789abcde.tab + ( + uint64 UInt64, + int32 Nullable(Int32), + float32 Float32, + float64 Float64, + decimal_value Decimal(10, 2), + boolean_value UInt8, -- Use 0 for false, 1 for true + string_value String, + fixed_string_value FixedString(10), + date_value Date, + date32_value Date32, + datetime_value DateTime, + datetime64_value DateTime64(3), + json_value String, -- Store JSON as a string + uuid_value UUID, + enum_value Enum8('apple' = 1, 'banana' = 2, 'orange' = 3), + low_cardinality LowCardinality(String), + array_value Array(Int32), + map_value Map(String, Int32), + tuple_value Tuple(Int32, String), + nullable_value Nullable(Int32), + ipv4_value IPv4, + ipv6_value IPv6, + nested Nested + ( + nested_int Int32, + nested_string String + ) + ) ENGINE = MergeTree + ORDER BY uint64; + " + +# Write sql to temp file +TEMP_FILE=$(mktemp) + +cat < $TEMP_FILE +SHOW COLUMNS FROM tab; +SHOW EXTENDED COLUMNS FROM tab; +SHOW FULL COLUMNS FROM tab; +SHOW COLUMNS FROM tab LIKE '%int%'; +SHOW COLUMNS FROM tab NOT LIKE '%int%'; +SHOW COLUMNS FROM tab ILIKE '%INT%'; +SHOW COLUMNS FROM tab NOT ILIKE '%INT%'; +SHOW COLUMNS FROM tab WHERE field LIKE '%int%'; +SHOW COLUMNS FROM tab LIMIT 1; +SHOW COLUMNS FROM tab; +SHOW COLUMNS FROM tab FROM database_123456789abcde; +SHOW COLUMNS FROM database_123456789abcde.tab; +DROP DATABASE database_123456789abcde; +DROP TABLE tab; +EOT + +# Now run the MySQL test script on the ClickHouse DB +echo "Run MySQL test" +mysql --user="$USER" --password="$PASSWORD" --host="$HOST" --port="$PORT" < $TEMP_FILE + +# Clean up the temp file +rm $TEMP_FILE From 1cc2499c70db4908331a02d1bbf71d89a27f8875 Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Wed, 10 May 2023 16:10:43 +0000 Subject: [PATCH 0063/1072] fix build --- src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp b/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp index fb25770ed8d..d956dac9fe0 100644 --- a/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp +++ b/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp @@ -49,7 +49,7 @@ CachedOnDiskReadBufferFromFile::CachedOnDiskReadBufferFromFile( bool allow_seeks_after_first_read_, bool use_external_buffer_, std::optional read_until_position_, - std::shared_ptr cache_log_)) + std::shared_ptr cache_log_) : ReadBufferFromFileBase(use_external_buffer_ ? 0 : settings_.remote_fs_buffer_size, nullptr, 0, file_size_) #ifndef NDEBUG , log(&Poco::Logger::get("CachedOnDiskReadBufferFromFile(" + source_file_path_ + ")")) From ac7c54a4d3636c8616799fa8272f6da1ce6683fc Mon Sep 17 00:00:00 2001 From: avogar Date: Wed, 10 May 2023 18:39:38 +0000 Subject: [PATCH 0064/1072] Refactor CapnProto format to improve input/output performance --- src/Core/Settings.h | 2 +- src/Core/SettingsEnums.cpp | 8 +- src/Core/SettingsEnums.h | 2 +- src/Formats/CapnProtoSchema.cpp | 298 ++++ .../{CapnProtoUtils.h => CapnProtoSchema.h} | 13 +- src/Formats/CapnProtoSerializer.cpp | 1218 +++++++++++++++++ src/Formats/CapnProtoSerializer.h | 25 + src/Formats/CapnProtoUtils.cpp | 734 ---------- src/Formats/FormatSettings.h | 6 +- .../Formats/Impl/CapnProtoRowInputFormat.cpp | 253 +--- .../Formats/Impl/CapnProtoRowInputFormat.h | 9 +- .../Formats/Impl/CapnProtoRowOutputFormat.cpp | 266 +--- .../Formats/Impl/CapnProtoRowOutputFormat.h | 17 +- .../Formats/Impl/ProtobufListInputFormat.cpp | 9 +- .../Formats/Impl/ProtobufRowInputFormat.cpp | 9 +- .../queries/0_stateless/02030_capnp_format.sh | 4 +- ...p_case_insensitive_names_matcing.reference | 1 + ...35_capnp_case_insensitive_names_matcing.sh | 10 + ...ing_and_writing_structure_fields.reference | 3 + ...36_reading_and_writing_structure_fields.sh | 24 + ...2735_case_insensitive_names_matching.capnp | 13 + .../02736_nested_structures.capnp | 21 + 22 files changed, 1686 insertions(+), 1259 deletions(-) create mode 100644 src/Formats/CapnProtoSchema.cpp rename src/Formats/{CapnProtoUtils.h => CapnProtoSchema.h} (59%) create mode 100644 src/Formats/CapnProtoSerializer.cpp create mode 100644 src/Formats/CapnProtoSerializer.h delete mode 100644 src/Formats/CapnProtoUtils.cpp create mode 100644 tests/queries/0_stateless/02735_capnp_case_insensitive_names_matcing.reference create mode 100755 tests/queries/0_stateless/02735_capnp_case_insensitive_names_matcing.sh create mode 100644 tests/queries/0_stateless/02736_reading_and_writing_structure_fields.reference create mode 100755 tests/queries/0_stateless/02736_reading_and_writing_structure_fields.sh create mode 100644 tests/queries/0_stateless/format_schemas/02735_case_insensitive_names_matching.capnp create mode 100644 tests/queries/0_stateless/format_schemas/02736_nested_structures.capnp diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 1bea2c26392..269fa832f45 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -958,7 +958,7 @@ class IColumn; M(Bool, output_format_orc_string_as_string, false, "Use ORC String type instead of Binary for String columns", 0) \ M(ORCCompression, output_format_orc_compression_method, "lz4", "Compression method for ORC output format. Supported codecs: lz4, snappy, zlib, zstd, none (uncompressed)", 0) \ \ - M(EnumComparingMode, format_capn_proto_enum_comparising_mode, FormatSettings::EnumComparingMode::BY_VALUES, "How to map ClickHouse Enum and CapnProto Enum", 0) \ + M(CapnProtoEnumComparingMode, format_capn_proto_enum_comparising_mode, FormatSettings::CapnProtoEnumComparingMode::BY_VALUES, "How to map ClickHouse Enum and CapnProto Enum", 0) \ \ M(String, input_format_mysql_dump_table_name, "", "Name of the table in MySQL dump from which to read data", 0) \ M(Bool, input_format_mysql_dump_map_column_names, true, "Match columns from table in MySQL dump and columns from ClickHouse table by names", 0) \ diff --git a/src/Core/SettingsEnums.cpp b/src/Core/SettingsEnums.cpp index e0f16ea00db..a291a23c140 100644 --- a/src/Core/SettingsEnums.cpp +++ b/src/Core/SettingsEnums.cpp @@ -144,10 +144,10 @@ IMPLEMENT_SETTING_ENUM(TransactionsWaitCSNMode, ErrorCodes::BAD_ARGUMENTS, {"wait", TransactionsWaitCSNMode::WAIT}, {"wait_unknown", TransactionsWaitCSNMode::WAIT_UNKNOWN}}) -IMPLEMENT_SETTING_ENUM(EnumComparingMode, ErrorCodes::BAD_ARGUMENTS, - {{"by_names", FormatSettings::EnumComparingMode::BY_NAMES}, - {"by_values", FormatSettings::EnumComparingMode::BY_VALUES}, - {"by_names_case_insensitive", FormatSettings::EnumComparingMode::BY_NAMES_CASE_INSENSITIVE}}) +IMPLEMENT_SETTING_ENUM(CapnProtoEnumComparingMode, ErrorCodes::BAD_ARGUMENTS, + {{"by_names", FormatSettings::CapnProtoEnumComparingMode::BY_NAMES}, + {"by_values", FormatSettings::CapnProtoEnumComparingMode::BY_VALUES}, + {"by_names_case_insensitive", FormatSettings::CapnProtoEnumComparingMode::BY_NAMES_CASE_INSENSITIVE}}) IMPLEMENT_SETTING_ENUM(EscapingRule, ErrorCodes::BAD_ARGUMENTS, {{"None", FormatSettings::EscapingRule::None}, diff --git a/src/Core/SettingsEnums.h b/src/Core/SettingsEnums.h index 3ae7bfaa673..1c5be910ef7 100644 --- a/src/Core/SettingsEnums.h +++ b/src/Core/SettingsEnums.h @@ -188,7 +188,7 @@ enum class TransactionsWaitCSNMode DECLARE_SETTING_ENUM(TransactionsWaitCSNMode) -DECLARE_SETTING_ENUM_WITH_RENAME(EnumComparingMode, FormatSettings::EnumComparingMode) +DECLARE_SETTING_ENUM_WITH_RENAME(CapnProtoEnumComparingMode, FormatSettings::CapnProtoEnumComparingMode) DECLARE_SETTING_ENUM_WITH_RENAME(EscapingRule, FormatSettings::EscapingRule) diff --git a/src/Formats/CapnProtoSchema.cpp b/src/Formats/CapnProtoSchema.cpp new file mode 100644 index 00000000000..22518d5061a --- /dev/null +++ b/src/Formats/CapnProtoSchema.cpp @@ -0,0 +1,298 @@ +#include + +#if USE_CAPNP + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int CANNOT_PARSE_CAPN_PROTO_SCHEMA; + extern const int BAD_TYPE_OF_FIELD; + extern const int FILE_DOESNT_EXIST; + extern const int UNKNOWN_EXCEPTION; + extern const int CAPN_PROTO_BAD_TYPE; + extern const int BAD_ARGUMENTS; +} + +capnp::StructSchema CapnProtoSchemaParser::getMessageSchema(const FormatSchemaInfo & schema_info) +{ + capnp::ParsedSchema schema; + try + { + int fd; + KJ_SYSCALL(fd = open(schema_info.schemaDirectory().data(), O_RDONLY)); // NOLINT(bugprone-suspicious-semicolon) + auto schema_dir = kj::newDiskDirectory(kj::OsFileHandle(fd)); + schema = impl.parseFromDirectory(*schema_dir, kj::Path::parse(schema_info.schemaPath()), {}); + } + catch (const kj::Exception & e) + { + /// That's not good to determine the type of error by its description, but + /// this is the only way to do it here, because kj doesn't specify the type of error. + auto description = std::string_view(e.getDescription().cStr()); + if (description.find("No such file or directory") != String::npos || description.find("no such directory") != String::npos) + throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "Cannot open CapnProto schema, file {} doesn't exists", schema_info.absoluteSchemaPath()); + + if (description.find("Parse error") != String::npos) + throw Exception(ErrorCodes::CANNOT_PARSE_CAPN_PROTO_SCHEMA, "Cannot parse CapnProto schema {}:{}", schema_info.schemaPath(), e.getLine()); + + throw Exception(ErrorCodes::UNKNOWN_EXCEPTION, + "Unknown exception while parsing CapnProto schema: {}, schema dir and file: {}, {}", + description, schema_info.schemaDirectory(), schema_info.schemaPath()); + } + + auto message_maybe = schema.findNested(schema_info.messageName()); + auto * message_schema = kj::_::readMaybe(message_maybe); + if (!message_schema) + throw Exception(ErrorCodes::CANNOT_PARSE_CAPN_PROTO_SCHEMA, + "CapnProto schema doesn't contain message with name {}", schema_info.messageName()); + return message_schema->asStruct(); +} + +bool checkIfStructContainsUnnamedUnion(const capnp::StructSchema & struct_schema) +{ + return struct_schema.getFields().size() != struct_schema.getNonUnionFields().size(); +} + +bool checkIfStructIsNamedUnion(const capnp::StructSchema & struct_schema) +{ + return struct_schema.getFields().size() == struct_schema.getUnionFields().size(); +} + +/// Get full name of type for better exception messages. +String getCapnProtoFullTypeName(const capnp::Type & type) +{ + static const std::map capnp_simple_type_names = + { + {capnp::schema::Type::Which::BOOL, "Bool"}, + {capnp::schema::Type::Which::VOID, "Void"}, + {capnp::schema::Type::Which::INT8, "Int8"}, + {capnp::schema::Type::Which::INT16, "Int16"}, + {capnp::schema::Type::Which::INT32, "Int32"}, + {capnp::schema::Type::Which::INT64, "Int64"}, + {capnp::schema::Type::Which::UINT8, "UInt8"}, + {capnp::schema::Type::Which::UINT16, "UInt16"}, + {capnp::schema::Type::Which::UINT32, "UInt32"}, + {capnp::schema::Type::Which::UINT64, "UInt64"}, + {capnp::schema::Type::Which::FLOAT32, "Float32"}, + {capnp::schema::Type::Which::FLOAT64, "Float64"}, + {capnp::schema::Type::Which::TEXT, "Text"}, + {capnp::schema::Type::Which::DATA, "Data"}, + {capnp::schema::Type::Which::INTERFACE, "Interface"}, + {capnp::schema::Type::Which::ANY_POINTER, "AnyPointer"}, + }; + + switch (type.which()) + { + case capnp::schema::Type::Which::STRUCT: + { + auto struct_schema = type.asStruct(); + + auto non_union_fields = struct_schema.getNonUnionFields(); + std::vector non_union_field_names; + for (auto nested_field : non_union_fields) + non_union_field_names.push_back(String(nested_field.getProto().getName()) + " " + getCapnProtoFullTypeName(nested_field.getType())); + + auto union_fields = struct_schema.getUnionFields(); + std::vector union_field_names; + for (auto nested_field : union_fields) + union_field_names.push_back(String(nested_field.getProto().getName()) + " " + getCapnProtoFullTypeName(nested_field.getType())); + + String union_name = "Union(" + boost::algorithm::join(union_field_names, ", ") + ")"; + /// Check if the struct is a named union. + if (non_union_field_names.empty()) + return union_name; + + String type_name = "Struct(" + boost::algorithm::join(non_union_field_names, ", "); + /// Check if the struct contains unnamed union. + if (!union_field_names.empty()) + type_name += ", " + union_name; + type_name += ")"; + return type_name; + } + case capnp::schema::Type::Which::LIST: + return "List(" + getCapnProtoFullTypeName(type.asList().getElementType()) + ")"; + case capnp::schema::Type::Which::ENUM: + { + auto enum_schema = type.asEnum(); + String enum_name = "Enum("; + auto enumerants = enum_schema.getEnumerants(); + for (unsigned i = 0; i != enumerants.size(); ++i) + { + enum_name += String(enumerants[i].getProto().getName()) + " = " + std::to_string(enumerants[i].getOrdinal()); + if (i + 1 != enumerants.size()) + enum_name += ", "; + } + enum_name += ")"; + return enum_name; + } + default: + auto it = capnp_simple_type_names.find(type.which()); + if (it == capnp_simple_type_names.end()) + throw Exception(ErrorCodes::BAD_TYPE_OF_FIELD, "Unknown CapnProto type"); + return it->second; + } +} + +namespace +{ + + template + static DataTypePtr getEnumDataTypeFromEnumerants(const capnp::EnumSchema::EnumerantList & enumerants) + { + std::vector> values; + for (auto enumerant : enumerants) + values.emplace_back(enumerant.getProto().getName(), ValueType(enumerant.getOrdinal())); + return std::make_shared>(std::move(values)); + } + + static DataTypePtr getEnumDataTypeFromEnumSchema(const capnp::EnumSchema & enum_schema) + { + auto enumerants = enum_schema.getEnumerants(); + if (enumerants.size() < 128) + return getEnumDataTypeFromEnumerants(enumerants); + if (enumerants.size() < 32768) + return getEnumDataTypeFromEnumerants(enumerants); + + throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "ClickHouse supports only 8 and 16-bit Enums"); + } + + static DataTypePtr getDataTypeFromCapnProtoType(const capnp::Type & capnp_type, bool skip_unsupported_fields) + { + switch (capnp_type.which()) + { + case capnp::schema::Type::INT8: + return std::make_shared(); + case capnp::schema::Type::INT16: + return std::make_shared(); + case capnp::schema::Type::INT32: + return std::make_shared(); + case capnp::schema::Type::INT64: + return std::make_shared(); + case capnp::schema::Type::BOOL: [[fallthrough]]; + case capnp::schema::Type::UINT8: + return std::make_shared(); + case capnp::schema::Type::UINT16: + return std::make_shared(); + case capnp::schema::Type::UINT32: + return std::make_shared(); + case capnp::schema::Type::UINT64: + return std::make_shared(); + case capnp::schema::Type::FLOAT32: + return std::make_shared(); + case capnp::schema::Type::FLOAT64: + return std::make_shared(); + case capnp::schema::Type::DATA: [[fallthrough]]; + case capnp::schema::Type::TEXT: + return std::make_shared(); + case capnp::schema::Type::ENUM: + return getEnumDataTypeFromEnumSchema(capnp_type.asEnum()); + case capnp::schema::Type::LIST: + { + auto list_schema = capnp_type.asList(); + auto nested_type = getDataTypeFromCapnProtoType(list_schema.getElementType(), skip_unsupported_fields); + if (!nested_type) + return nullptr; + return std::make_shared(nested_type); + } + case capnp::schema::Type::STRUCT: + { + auto struct_schema = capnp_type.asStruct(); + + + if (struct_schema.getFields().size() == 0) + { + if (skip_unsupported_fields) + return nullptr; + throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Empty messages are not supported"); + } + + /// Check if it can be Nullable. + if (checkIfStructIsNamedUnion(struct_schema)) + { + auto fields = struct_schema.getUnionFields(); + if (fields.size() != 2 || (!fields[0].getType().isVoid() && !fields[1].getType().isVoid())) + { + if (skip_unsupported_fields) + return nullptr; + throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Unions are not supported"); + } + auto value_type = fields[0].getType().isVoid() ? fields[1].getType() : fields[0].getType(); + if (value_type.isStruct() || value_type.isList()) + { + if (skip_unsupported_fields) + return nullptr; + throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Tuples and Lists cannot be inside Nullable"); + } + + auto nested_type = getDataTypeFromCapnProtoType(value_type, skip_unsupported_fields); + if (!nested_type) + return nullptr; + return std::make_shared(nested_type); + } + + if (checkIfStructContainsUnnamedUnion(struct_schema)) + throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Unnamed union is not supported"); + + /// Treat Struct as Tuple. + DataTypes nested_types; + Names nested_names; + for (auto field : struct_schema.getNonUnionFields()) + { + auto nested_type = getDataTypeFromCapnProtoType(field.getType(), skip_unsupported_fields); + if (!nested_type) + continue; + nested_names.push_back(field.getProto().getName()); + nested_types.push_back(nested_type); + } + if (nested_types.empty()) + return nullptr; + return std::make_shared(std::move(nested_types), std::move(nested_names)); + } + default: + { + if (skip_unsupported_fields) + return nullptr; + throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Unsupported CapnProtoType: {}", getCapnProtoFullTypeName(capnp_type)); + } + } +} + +} + +NamesAndTypesList capnProtoSchemaToCHSchema(const capnp::StructSchema & schema, bool skip_unsupported_fields) +{ + if (checkIfStructContainsUnnamedUnion(schema)) + throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Unnamed union is not supported"); + + NamesAndTypesList names_and_types; + for (auto field : schema.getNonUnionFields()) + { + auto name = field.getProto().getName(); + auto type = getDataTypeFromCapnProtoType(field.getType(), skip_unsupported_fields); + if (type) + names_and_types.emplace_back(name, type); + } + if (names_and_types.empty()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Cannot convert CapnProto schema to ClickHouse table schema, all fields have unsupported types"); + + return names_and_types; +} + +} + +#endif diff --git a/src/Formats/CapnProtoUtils.h b/src/Formats/CapnProtoSchema.h similarity index 59% rename from src/Formats/CapnProtoUtils.h rename to src/Formats/CapnProtoSchema.h index 2d8cdb418d7..225f6f56207 100644 --- a/src/Formats/CapnProtoUtils.h +++ b/src/Formats/CapnProtoSchema.h @@ -30,17 +30,14 @@ public: capnp::StructSchema getMessageSchema(const FormatSchemaInfo & schema_info); }; -std::pair splitCapnProtoFieldName(const String & name); +bool checkIfStructContainsUnnamedUnion(const capnp::StructSchema & struct_schema); +bool checkIfStructIsNamedUnion(const capnp::StructSchema & struct_schema); -bool compareEnumNames(const String & first, const String & second, FormatSettings::EnumComparingMode mode); - -std::pair getStructBuilderAndFieldByColumnName(capnp::DynamicStruct::Builder struct_builder, const String & name); - -capnp::DynamicValue::Reader getReaderByColumnName(const capnp::DynamicStruct::Reader & struct_reader, const String & name); - -void checkCapnProtoSchemaStructure(const capnp::StructSchema & schema, const Block & header, FormatSettings::EnumComparingMode mode); +/// Get full name of type for better exception messages. +String getCapnProtoFullTypeName(const capnp::Type & type); NamesAndTypesList capnProtoSchemaToCHSchema(const capnp::StructSchema & schema, bool skip_unsupported_fields); + } #endif diff --git a/src/Formats/CapnProtoSerializer.cpp b/src/Formats/CapnProtoSerializer.cpp new file mode 100644 index 00000000000..e0c8ae2a79a --- /dev/null +++ b/src/Formats/CapnProtoSerializer.cpp @@ -0,0 +1,1218 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int THERE_IS_NO_COLUMN; + extern const int BAD_TYPE_OF_FIELD; + extern const int CAPN_PROTO_BAD_CAST; + extern const int INCORRECT_DATA; + extern const int ILLEGAL_COLUMN; +} + +namespace +{ + std::pair splitFieldName(const String & name) + { + const auto * begin = name.data(); + const auto * end = name.data() + name.size(); + const auto * it = find_first_symbols<'_', '.'>(begin, end); + String first = String(begin, it); + String second = it == end ? "" : String(it + 1, end); + return {first, second}; + } + + std::optional findFieldByName(const capnp::StructSchema & struct_schema, const String & name) + { + const auto & fields = struct_schema.getFields(); + for (auto field : fields) + { + auto field_name = String(field.getProto().getName()); + if (boost::to_lower_copy(name) == boost::to_lower_copy(field_name)) + return field; + } + return std::nullopt; + } + + [[noreturn]] void throwCannotConvert(const DataTypePtr & type, const String & name, const capnp::Type & capnp_type) + { + throw Exception( + ErrorCodes::CAPN_PROTO_BAD_CAST, + "Cannot convert ClickHouse column \"{}\" with type {} to CapnProto type {}", + name, + type->getName(), + getCapnProtoFullTypeName(capnp_type)); + } + + struct FieldBuilder + { + virtual ~FieldBuilder() = default; + }; + + struct ListBuilder : public FieldBuilder + { + explicit ListBuilder(capnp::DynamicValue::Builder builder) : impl(builder.as()) + { + } + + capnp::DynamicList::Builder impl; + std::vector> nested_builders; + }; + + struct StructBuilder : public FieldBuilder + { + explicit StructBuilder(capnp::DynamicValue::Builder builder, size_t fields_size) : impl(builder.as()), field_builders(fields_size) + { + } + + explicit StructBuilder(capnp::DynamicStruct::Builder struct_builder, size_t fields_size) : impl(std::move(struct_builder)), field_builders(fields_size) + { + } + + capnp::DynamicStruct::Builder impl; + std::vector> field_builders; + }; + + std::unique_ptr initStructFieldBuilderIfNeeded(const ColumnPtr & column, size_t row_num, capnp::DynamicStruct::Builder & struct_builder, const capnp::StructSchema::Field & field, const capnp::Type & capnp_type, size_t nested_fields_size) + { + switch (capnp_type.which()) + { + case capnp::schema::Type::LIST: + { + const auto * array_column = assert_cast(column.get()); + size_t size = array_column->getOffsets()[row_num] - array_column->getOffsets()[row_num - 1]; + return std::make_unique(struct_builder.init(field, static_cast(size))); + } + case capnp::schema::Type::STRUCT: + { + return std::make_unique(struct_builder.init(field), nested_fields_size); + } + default: + return nullptr; + } + } + + class ICapnProtoSerializer + { + public: + virtual std::optional writeRow(const ColumnPtr & column, FieldBuilder * builder, size_t row_num) = 0; + virtual void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) = 0; + + virtual ~ICapnProtoSerializer() = default; + }; + + template + class CapnProtoIntegerSerializer : public ICapnProtoSerializer + { + public: + std::optional writeRow(const ColumnPtr & column, FieldBuilder *, size_t row_num) override + { + if constexpr (capnp_dynamic_type == capnp::DynamicValue::Type::INT) + return capnp::DynamicValue::Reader(column->getInt(row_num)); + if constexpr (capnp_dynamic_type == capnp::DynamicValue::Type::UINT) + return capnp::DynamicValue::Reader(column->getUInt(row_num)); + return capnp::DynamicValue::Reader(column->getBool(row_num)); + } + + void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + { + NumericType value; + if constexpr (capnp_dynamic_type == capnp::DynamicValue::Type::INT) + value = static_cast(reader.as()); + else if constexpr (capnp_dynamic_type == capnp::DynamicValue::Type::UINT) + value = static_cast(reader.as()); + else if constexpr (capnp_dynamic_type == capnp::DynamicValue::Type::BOOL) + value = static_cast(reader.as()); + + if constexpr (is_bool_data_type) + assert_cast(column).insertValue(static_cast(value)); + else + assert_cast &>(column).insertValue(value); + } + }; + + template + static std::unique_ptr createIntegerSerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type) + { + switch (capnp_type.which()) + { + case capnp::schema::Type::INT8: [[fallthrough]]; + case capnp::schema::Type::INT16: [[fallthrough]]; + case capnp::schema::Type::INT32: [[fallthrough]]; + case capnp::schema::Type::INT64: + return std::make_unique>(); + case capnp::schema::Type::UINT8: [[fallthrough]]; + case capnp::schema::Type::UINT16: [[fallthrough]]; + case capnp::schema::Type::UINT32: [[fallthrough]]; + case capnp::schema::Type::UINT64: + return std::make_unique>(); + case capnp::schema::Type::BOOL: + return std::make_unique>(); + default: + throwCannotConvert(data_type, column_name, capnp_type); + } + } + + template + class CapnProtoBigIntegerSerializer : public ICapnProtoSerializer + { + public: + CapnProtoBigIntegerSerializer(const DataTypePtr & data_type_, const String & column_name, const capnp::Type & capnp_type) : data_type(data_type_) + { + if (!capnp_type.isData()) + throwCannotConvert(data_type, column_name, capnp_type); + } + + std::optional writeRow(const ColumnPtr & column, FieldBuilder *, size_t row_num) override + { + auto data = column->getDataAt(row_num); + return capnp::DynamicValue::Reader(capnp::Data::Reader(reinterpret_cast(data.data), data.size)); + } + + void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + { + auto value = reader.as(); + if (value.size() != sizeof(NumericType)) + throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected size of {} value: {}", data_type->getName(), value.size()); + + column.insertData(reinterpret_cast(value.begin()), value.size()); + } + + private: + DataTypePtr data_type; + }; + + template + class CapnProtoFloatSerializer : public ICapnProtoSerializer + { + public: + CapnProtoFloatSerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type) + { + if (!capnp_type.isFloat32() && !capnp_type.isFloat64()) + throwCannotConvert(data_type, column_name, capnp_type); + } + + std::optional writeRow(const ColumnPtr & column, FieldBuilder *, size_t row_num) override + { + return capnp::DynamicValue::Reader(column->getFloat64(row_num)); + } + + void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + { + assert_cast &>(column).insertValue(reader.as()); + } + }; + + template + class CapnProtoEnumSerializer : public ICapnProtoSerializer + { + public: + CapnProtoEnumSerializer( + const DataTypePtr & data_type_, + const String & column_name, + const capnp::Type & capnp_type, + const FormatSettings::CapnProtoEnumComparingMode enum_comparing_mode_) : data_type(data_type_), enum_comparing_mode(enum_comparing_mode_) + { + if (!capnp_type.isEnum()) + throwCannotConvert(data_type, column_name, capnp_type); + + bool to_lower = enum_comparing_mode == FormatSettings::CapnProtoEnumComparingMode::BY_NAMES_CASE_INSENSITIVE; + const auto * enum_type = assert_cast *>(data_type.get()); + const auto & enum_values = dynamic_cast &>(*enum_type); + + enum_schema = capnp_type.asEnum(); + auto enumerants = enum_schema.getEnumerants(); + constexpr auto max_value = std::is_same_v ? INT8_MAX : INT16_MAX; + if (enum_comparing_mode == FormatSettings::CapnProtoEnumComparingMode::BY_VALUES) + { + /// In CapnProto Enum fields are numbered sequentially starting from zero. + if (enumerants.size() > max_value) + throw Exception( + ErrorCodes::CAPN_PROTO_BAD_CAST, + "Enum from CapnProto schema contains values that are out of range for Clickhouse enum type {}", + data_type->getName()); + + auto values = enum_values.getSetOfAllValues(); + std::unordered_set capn_enum_values; + for (auto enumerant : enumerants) + capn_enum_values.insert(EnumType(enumerant.getOrdinal())); + if (values != capn_enum_values) + throw Exception(ErrorCodes::CAPN_PROTO_BAD_CAST, "The set of values in Enum from CapnProto schema is different from the set of values in ClickHouse Enum"); + } + else + { + auto names = enum_values.getSetOfAllNames(to_lower); + std::unordered_set capn_enum_names; + + for (auto enumerant : enumerants) + { + String name = enumerant.getProto().getName(); + capn_enum_names.insert(to_lower ? boost::algorithm::to_lower_copy(name) : name); + } + + if (names != capn_enum_names) + throw Exception( + ErrorCodes::CAPN_PROTO_BAD_CAST, + "The set of names in Enum from CapnProto schema is different from the set of names in ClickHouse Enum"); + } + } + + std::optional writeRow(const ColumnPtr & column, FieldBuilder *, size_t row_num) override + { + const auto * enum_data_type = assert_cast *>(data_type.get()); + EnumType enum_value = assert_cast &>(*column).getElement(row_num); + if (enum_comparing_mode == FormatSettings::CapnProtoEnumComparingMode::BY_VALUES) + return capnp::DynamicValue::Reader(capnp::DynamicEnum(enum_schema, enum_value)); + + auto enum_name = enum_data_type->getNameForValue(enum_value); + for (const auto enumerant : enum_schema.getEnumerants()) + { + if (compareEnumNames(String(enum_name), enumerant.getProto().getName(), enum_comparing_mode)) + return capnp::DynamicValue::Reader(capnp::DynamicEnum(enumerant)); + } + + throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot convert CLickHouse Enum value to CapnProto Enum"); + } + + void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + { + auto enum_value = reader.as(); + auto enumerant = *kj::_::readMaybe(enum_value.getEnumerant()); + auto enum_type = assert_cast *>(data_type.get()); + DataTypePtr nested_type = std::make_shared>(); + switch (enum_comparing_mode) + { + case FormatSettings::CapnProtoEnumComparingMode::BY_VALUES: + { + assert_cast &>(column).insertValue(static_cast(enumerant.getOrdinal())); + return; + } + case FormatSettings::CapnProtoEnumComparingMode::BY_NAMES: + { + auto value = enum_type->getValue(String(enumerant.getProto().getName())); + assert_cast &>(column).insertValue(value); + return; + } + case FormatSettings::CapnProtoEnumComparingMode::BY_NAMES_CASE_INSENSITIVE: + { + /// Find the same enum name case insensitive. + String enum_name = enumerant.getProto().getName(); + for (auto & name : enum_type->getAllRegisteredNames()) + { + if (compareEnumNames(name, enum_name, enum_comparing_mode)) + { + assert_cast &>(column).insertValue(enum_type->getValue(name)); + break; + } + } + return; + } + } + } + + private: + bool compareEnumNames(const String & first, const String & second, const FormatSettings::CapnProtoEnumComparingMode mode) + { + if (mode == FormatSettings::CapnProtoEnumComparingMode::BY_NAMES_CASE_INSENSITIVE) + return boost::algorithm::to_lower_copy(first) == boost::algorithm::to_lower_copy(second); + return first == second; + } + + DataTypePtr data_type; + capnp::EnumSchema enum_schema; + const FormatSettings::CapnProtoEnumComparingMode enum_comparing_mode; + }; + + class CapnProtoDateSerializer : public ICapnProtoSerializer + { + public: + CapnProtoDateSerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type) + { + if (!capnp_type.isUInt16()) + throwCannotConvert(data_type, column_name, capnp_type); + } + + std::optional writeRow(const ColumnPtr & column, FieldBuilder *, size_t row_num) override + { + return capnp::DynamicValue::Reader(column->getUInt(row_num)); + } + + void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + { + assert_cast(column).insertValue(reader.as()); + } + }; + + class CapnProtoDate32Serializer : public ICapnProtoSerializer + { + public: + CapnProtoDate32Serializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type) + { + if (!capnp_type.isInt32()) + throwCannotConvert(data_type, column_name, capnp_type); + } + + std::optional writeRow(const ColumnPtr & column, FieldBuilder *, size_t row_num) override + { + return capnp::DynamicValue::Reader(column->getInt(row_num)); + } + + void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + { + assert_cast(column).insertValue(reader.as()); + } + }; + + class CapnProtoDateTimeSerializer : public ICapnProtoSerializer + { + public: + CapnProtoDateTimeSerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type) + { + if (!capnp_type.isUInt32()) + throwCannotConvert(data_type, column_name, capnp_type); + } + + std::optional writeRow(const ColumnPtr & column, FieldBuilder *, size_t row_num) override + { + return capnp::DynamicValue::Reader(column->getInt(row_num)); + } + + void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + { + assert_cast(column).insertValue(reader.as()); + } + }; + + class CapnProtoDateTime64Serializer : public ICapnProtoSerializer + { + public: + CapnProtoDateTime64Serializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type) + { + if (!capnp_type.isInt64()) + throwCannotConvert(data_type, column_name, capnp_type); + } + + std::optional writeRow(const ColumnPtr & column, FieldBuilder *, size_t row_num) override + { + return capnp::DynamicValue::Reader(column->getInt(row_num)); + } + + void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + { + assert_cast(column).insertValue(reader.as()); + } + }; + + template + class CapnProtoDecimalSerializer : public ICapnProtoSerializer + { + public: + CapnProtoDecimalSerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type) + { + auto which = WhichDataType(data_type); + if ((!capnp_type.isInt32() && which.isDecimal32()) || (!capnp_type.isInt64() && which.isDecimal64())) + throwCannotConvert(data_type, column_name, capnp_type); + } + + std::optional writeRow(const ColumnPtr & column, FieldBuilder *, size_t row_num) override + { + return capnp::DynamicValue::Reader(column->getInt(row_num)); + } + + void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + { + assert_cast &>(column).insertValue(reader.as()); + } + }; + + template + class CapnProtoBigDecimalSerializer : public ICapnProtoSerializer + { + public: + CapnProtoBigDecimalSerializer(const DataTypePtr & data_type_, const String & column_name, const capnp::Type & capnp_type) : data_type(data_type_) + { + if (!capnp_type.isData()) + throwCannotConvert(data_type, column_name, capnp_type); + } + + std::optional writeRow(const ColumnPtr & column, FieldBuilder *, size_t row_num) override + { + auto data = column->getDataAt(row_num); + return capnp::DynamicValue::Reader(capnp::Data::Reader(reinterpret_cast(data.data), data.size)); + } + + void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + { + auto value = reader.as(); + if (value.size() != sizeof(DecimalType)) + throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected size of {} value: {}", data_type->getName(), value.size()); + + column.insertData(reinterpret_cast(value.begin()), value.size()); + } + + private: + DataTypePtr data_type; + }; + + template + class CapnProtoStringSerializer : public ICapnProtoSerializer + { + public: + CapnProtoStringSerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type_) : capnp_type(capnp_type_) + { + if (!capnp_type.isData() && !capnp_type.isText()) + throwCannotConvert(data_type, column_name, capnp_type); + } + + std::optional writeRow(const ColumnPtr & column, FieldBuilder *, size_t row_num) override + { + auto data = column->getDataAt(row_num); + + if constexpr (is_binary) + return capnp::DynamicValue::Reader(capnp::Data::Reader(reinterpret_cast(data.data), data.size)); + + /// For type TEXT data must be null-terminated, but in String column we always have 0 byte at the end of each value. + return capnp::DynamicValue::Reader(capnp::Text::Reader(data.data, data.size)); + } + + void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + { + if constexpr (is_binary) + { + auto value = reader.as(); + column.insertData(reinterpret_cast(value.begin()), value.size()); + } + else + { + auto value = reader.as(); + column.insertData(reinterpret_cast(value.begin()), value.size()); + } + } + + private: + capnp::Type capnp_type; + }; + + template + class CapnProtoFixedStringSerializer : public ICapnProtoSerializer + { + public: + CapnProtoFixedStringSerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type_) : capnp_type(capnp_type_) + { + if (!capnp_type.isData() && !capnp_type.isText()) + throwCannotConvert(data_type, column_name, capnp_type); + } + + std::optional writeRow(const ColumnPtr & column, FieldBuilder *, size_t row_num) override + { + auto data = column->getDataAt(row_num); + if constexpr (is_binary) + return capnp::DynamicValue::Reader(capnp::Data::Reader(reinterpret_cast(data.data), data.size)); + + if (data.data[data.size - 1] == 0) + return capnp::DynamicValue::Reader(capnp::Text::Reader(reinterpret_cast(data.data), data.size)); + + /// In TEXT type data should be null-terminated, but ClickHouse FixedString data could not be. + /// To make data null-terminated we should copy it to temporary String object and use it in capnp::Text::Reader. + /// Note that capnp::Text::Reader works only with pointer to the data and it's size, so we should + /// guarantee that new String object life time is longer than capnp::Text::Reader life time. + tmp_string = data.toString(); + return capnp::DynamicValue::Reader(capnp::Text::Reader(reinterpret_cast(tmp_string.data()), tmp_string.size())); + } + + void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + { + auto & fixed_string_column = assert_cast(column); + if constexpr (is_binary) + { + auto value = reader.as(); + if (value.size() > fixed_string_column.getN()) + throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot read data with size {} to FixedString with size {}", value.size(), fixed_string_column.getN()); + + fixed_string_column.insertData(reinterpret_cast(value.begin()), value.size()); + } + else + { + auto value = reader.as(); + if (value.size() > fixed_string_column.getN()) + throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot read data with size {} to FixedString with size {}", value.size(), fixed_string_column.getN()); + + fixed_string_column.insertData(reinterpret_cast(value.begin()), value.size()); + } + } + + private: + String tmp_string; + capnp::Type capnp_type; + }; + + class CapnProtoIPv4Serializer : public ICapnProtoSerializer + { + public: + CapnProtoIPv4Serializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type) + { + if (!capnp_type.isUInt32()) + throwCannotConvert(data_type, column_name, capnp_type); + } + + std::optional writeRow(const ColumnPtr & column, FieldBuilder *, size_t row_num) override + { + return capnp::DynamicValue::Reader(assert_cast(*column).getElement(row_num).toUnderType()); + } + + void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + { + assert_cast(column).insertValue(IPv4(reader.as())); + } + }; + + class CapnProtoIPv6Serializer : public ICapnProtoSerializer + { + public: + CapnProtoIPv6Serializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type) + { + if (!capnp_type.isData()) + throwCannotConvert(data_type, column_name, capnp_type); + } + + std::optional writeRow(const ColumnPtr & column, FieldBuilder *, size_t row_num) override + { + auto data = column->getDataAt(row_num); + return capnp::DynamicValue::Reader(capnp::Data::Reader(reinterpret_cast(data.data), data.size)); + } + + void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + { + auto value = reader.as(); + if (value.size() != sizeof(IPv6)) + throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected size of IPv6 value: {}", value.size()); + + column.insertData(reinterpret_cast(value.begin()), value.size()); + } + }; + + class CapnProtoUUIDSerializer : public ICapnProtoSerializer + { + public: + CapnProtoUUIDSerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type) + { + if (!capnp_type.isData()) + throwCannotConvert(data_type, column_name, capnp_type); + } + + std::optional writeRow(const ColumnPtr & column, FieldBuilder *, size_t row_num) override + { + auto data = column->getDataAt(row_num); + return capnp::DynamicValue::Reader(capnp::Data::Reader(reinterpret_cast(data.data), data.size)); + } + + void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + { + auto value = reader.as(); + if (value.size() != sizeof(UUID)) + throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected size of UUID value: {}", value.size()); + + column.insertData(reinterpret_cast(value.begin()), value.size()); + } + }; + + std::unique_ptr createSerializer(const DataTypePtr & type, const String & name, const capnp::Type & capnp_type, const FormatSettings::CapnProto & settings); + + class CapnProtoLowCardinalitySerializer : public ICapnProtoSerializer + { + public: + CapnProtoLowCardinalitySerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type, const FormatSettings::CapnProto & settings) + { + nested_serializer = createSerializer(assert_cast(*data_type).getDictionaryType(), column_name, capnp_type, settings); + } + + std::optional writeRow(const ColumnPtr & column, FieldBuilder * field_builder, size_t row_num) override + { + const auto & low_cardinality_column = assert_cast(*column); + size_t index = low_cardinality_column.getIndexAt(row_num); + const auto & dict_column = low_cardinality_column.getDictionary().getNestedColumn(); + return nested_serializer->writeRow(dict_column, field_builder, index); + } + + void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + { + auto & low_cardinality_column = assert_cast(column); + auto tmp_column = low_cardinality_column.getDictionary().getNestedColumn()->cloneEmpty(); + nested_serializer->readRow(*tmp_column, reader); + low_cardinality_column.insertFromFullColumn(*tmp_column, 0); + } + + private: + std::unique_ptr nested_serializer; + }; + + class CapnProtoNullableSerializer : public ICapnProtoSerializer + { + public: + CapnProtoNullableSerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type, const FormatSettings::CapnProto & settings) + { + if (!capnp_type.isStruct()) + throw Exception( + ErrorCodes::CAPN_PROTO_BAD_CAST, + "Cannot convert column \"{}\": Nullable can be represented only as a named union of type Void and nested type, got CapnProto type {}", + column_name, + getCapnProtoFullTypeName(capnp_type)); + + /// Check that struct is a named union of type VOID and one arbitrary type. + auto struct_schema = capnp_type.asStruct(); + if (!checkIfStructIsNamedUnion(struct_schema)) + throw Exception( + ErrorCodes::CAPN_PROTO_BAD_CAST, + "Cannot convert column \"{}\": Nullable can be represented only as a named union of type Void and nested type." + "Given CapnProto struct is not a named union: {}", + column_name, + getCapnProtoFullTypeName(capnp_type)); + + auto union_fields = struct_schema.getUnionFields(); + if (union_fields.size() != 2) + throw Exception( + ErrorCodes::CAPN_PROTO_BAD_CAST, + "Cannot convert column \"{}\": Nullable can be represented only as a named union of type Void and nested type." + "Given CapnProto union have more than 2 fields: {}", + column_name, + getCapnProtoFullTypeName(capnp_type)); + + auto first = union_fields[0]; + auto second = union_fields[1]; + auto nested_type = assert_cast(data_type.get())->getNestedType(); + if (first.getType().isVoid()) + { + null_field = first; + nested_field = second; + nested_capnp_type = second.getType(); + if (nested_capnp_type.isStruct()) + nested_fields_size = nested_capnp_type.asStruct().getFields().size(); + nested_serializer = createSerializer(nested_type, column_name, nested_capnp_type, settings); + } + else if (second.getType().isVoid()) + { + null_field = second; + nested_field = first; + nested_capnp_type = first.getType(); + if (nested_capnp_type.isStruct()) + nested_fields_size = nested_capnp_type.asStruct().getFields().size(); + nested_serializer = createSerializer(nested_type, column_name, nested_capnp_type, settings); + } + else + throw Exception( + ErrorCodes::CAPN_PROTO_BAD_CAST, + "Cannot convert column \"{}\": Nullable can be represented only as a named union of type Void and nested type." + "Given CapnProto union doesn't have field with type Void: {}", + column_name, + getCapnProtoFullTypeName(capnp_type)); + } + + std::optional writeRow(const ColumnPtr & column, FieldBuilder * field_builder, size_t row_num) override + { + assert(field_builder); + auto & struct_builder = assert_cast(*field_builder); + const auto & nullable_column = assert_cast(*column); + if (nullable_column.isNullAt(row_num)) + { + struct_builder.impl.set(null_field, capnp::Void()); + } + else + { + struct_builder.impl.clear(nested_field); + const auto & nested_column = nullable_column.getNestedColumnPtr(); + auto nested_field_builder = initStructFieldBuilderIfNeeded(nested_column, row_num, struct_builder.impl, nested_field, nested_capnp_type, nested_fields_size); + auto value = nested_serializer->writeRow(nested_column, nested_field_builder.get(), row_num); + if (value) + struct_builder.impl.set(nested_field, *value); + } + + return std::nullopt; + } + + void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + { + auto struct_reader = reader.as(); + auto & nullable_column = assert_cast(column); + auto field = *kj::_::readMaybe(struct_reader.which()); + if (field.getType().isVoid()) + nullable_column.insertDefault(); + else + { + auto & nested_column = nullable_column.getNestedColumn(); + auto nested_reader = struct_reader.get(field); + nested_serializer->readRow(nested_column, nested_reader); + nullable_column.getNullMapData().push_back(0); + } + } + + private: + std::unique_ptr nested_serializer; + capnp::StructSchema::Field null_field; + capnp::StructSchema::Field nested_field; + size_t nested_fields_size = 0; + capnp::Type nested_capnp_type; + }; + + class CapnProtoArraySerializer : public ICapnProtoSerializer + { + public: + CapnProtoArraySerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type, const FormatSettings::CapnProto & settings) + { + if (!capnp_type.isList()) + throwCannotConvert(data_type, column_name, capnp_type); + + auto nested_type = assert_cast(data_type.get())->getNestedType(); + element_type = capnp_type.asList().getElementType(); + if (element_type.isStruct()) + element_struct_fields = element_type.asStruct().getFields().size(); + nested_serializer = createSerializer(nested_type, column_name, capnp_type.asList().getElementType(), settings); + } + + std::optional writeRow(const ColumnPtr & column, FieldBuilder * field_builder, size_t row_num) override + { + assert(field_builder); + auto & list_builder = assert_cast(*field_builder); + const auto * array_column = assert_cast(column.get()); + const auto & nested_column = array_column->getDataPtr(); + const auto & offsets = array_column->getOffsets(); + auto offset = offsets[row_num - 1]; + size_t size = offsets[row_num] - offset; + bool need_nested_builders = list_builder.nested_builders.empty(); + for (unsigned i = 0; i != static_cast(size); ++i) + { + if (need_nested_builders) + { + /// For nested lists we need to initialize nested list builder. + if (element_type.isList()) + { + const auto & nested_offset = checkAndGetColumn(*nested_column)->getOffsets(); + size_t nested_array_size = nested_offset[offset + i] - nested_offset[offset + i - 1]; + list_builder.nested_builders.emplace_back(std::make_unique(list_builder.impl.init(i, static_cast(nested_array_size)))); + } + else if (element_type.isStruct()) + { + list_builder.nested_builders.emplace_back(std::make_unique(list_builder.impl[i], element_struct_fields)); + } + else + { + list_builder.nested_builders.emplace_back(); + } + } + + auto value = nested_serializer->writeRow(nested_column, list_builder.nested_builders[i].get(), offset + i); + if (value) + list_builder.impl.set(i, *value); + } + + return std::nullopt; + } + + void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + { + auto list_reader = reader.as(); + auto & column_array = assert_cast(column); + auto & offsets = column_array.getOffsets(); + offsets.push_back(offsets.back() + list_reader.size()); + + auto & nested_column = column_array.getData(); + for (const auto & nested_reader : list_reader) + nested_serializer->readRow(nested_column, nested_reader); + } + + private: + std::unique_ptr nested_serializer; + capnp::Type element_type; + size_t element_struct_fields; + }; + + class CapnProtoMapSerializer : public ICapnProtoSerializer + { + public: + CapnProtoMapSerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type, const FormatSettings::CapnProto & settings) + { + /// We output/input Map type as follow CapnProto schema + /// + /// struct Map { + /// struct Entry { + /// key @0: Key; + /// value @1: Value; + /// } + /// entries @0 :List(Entry); + /// } + + if (!capnp_type.isStruct()) + throwCannotConvert(data_type, column_name, capnp_type); + + auto struct_schema = capnp_type.asStruct(); + + if (checkIfStructContainsUnnamedUnion(struct_schema)) + throw Exception( + ErrorCodes::CAPN_PROTO_BAD_CAST, + "Cannot convert ClickHouse column \"{}\" with type {} to CapnProto Struct with unnamed union {}", + column_name, + data_type->getName(), + getCapnProtoFullTypeName(capnp_type)); + + if (struct_schema.getFields().size() != 1) + throw Exception( + ErrorCodes::CAPN_PROTO_BAD_CAST, + "Cannot convert ClickHouse column \"{}\": Map type can be represented as a Struct with one list field, got struct: {}", + column_name, + getCapnProtoFullTypeName(capnp_type)); + + const auto & field_type = struct_schema.getFields()[0].getType(); + if (!field_type.isList()) + throw Exception( + ErrorCodes::CAPN_PROTO_BAD_CAST, + "Cannot convert ClickHouse column \"{}\": Map type can be represented as a Struct with one list field, got field: {}", + column_name, + getCapnProtoFullTypeName(field_type)); + + auto list_element_type = field_type.asList().getElementType(); + if (!list_element_type.isStruct()) + throw Exception( + ErrorCodes::CAPN_PROTO_BAD_CAST, + "Cannot convert ClickHouse column \"{}\": Field of struct that represents Map should be a list of structs, got list of {}", + column_name, + getCapnProtoFullTypeName(list_element_type)); + + auto key_value_struct = list_element_type.asStruct(); + if (checkIfStructContainsUnnamedUnion(key_value_struct)) + throw Exception( + ErrorCodes::CAPN_PROTO_BAD_CAST, + "Cannot convert ClickHouse column \"{}\": struct that represents Map entries is unnamed union: {}", + column_name, + getCapnProtoFullTypeName(list_element_type)); + + if (key_value_struct.getFields().size() != 2) + throw Exception( + ErrorCodes::CAPN_PROTO_BAD_CAST, + "Cannot convert ClickHouse column \"{}\": struct that represents Map entries should contain only 2 fields, got struct {}", + column_name, + getCapnProtoFullTypeName(list_element_type)); + + const auto & map_type = assert_cast(*data_type); + DataTypes types = {map_type.getKeyType(), map_type.getValueType()}; + Names names = {"key", "value"}; + auto entries_type = std::make_shared(std::make_shared(types, names)); + entries_field = struct_schema.getFields()[0]; + entries_capnp_type = entries_field.getType(); + nested_serializer = createSerializer(entries_type, column_name, field_type, settings); + } + + std::optional writeRow(const ColumnPtr & column, FieldBuilder * field_builder, size_t row_num) override + { + assert(field_builder); + auto & struct_builder = assert_cast(*field_builder); + const auto & entries_column = assert_cast(column.get())->getNestedColumnPtr(); + auto entries_builder = initStructFieldBuilderIfNeeded(entries_column, row_num, struct_builder.impl, entries_field, entries_capnp_type, 0); + nested_serializer->writeRow(entries_column, entries_builder.get(), row_num); + return std::nullopt; + } + + void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + { + auto struct_reader = reader.as(); + auto & entries_column = assert_cast(column).getNestedColumn(); + nested_serializer->readRow(entries_column, struct_reader.get(entries_field)); + } + + private: + std::unique_ptr nested_serializer; + capnp::StructSchema::Field entries_field; + capnp::Type entries_capnp_type; + }; + + class CapnProtoStructureSerializer : public ICapnProtoSerializer + { + public: + CapnProtoStructureSerializer(const DataTypes & data_types, const Names & names, const capnp::StructSchema & schema, const FormatSettings::CapnProto & settings) + { + if (checkIfStructIsNamedUnion(schema) || checkIfStructContainsUnnamedUnion(schema)) + throw Exception(ErrorCodes::CAPN_PROTO_BAD_CAST, "Root CapnProto Struct cannot be named union/struct with unnamed union"); + + initialize(data_types, names, schema, settings); + } + + CapnProtoStructureSerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type, const FormatSettings::CapnProto & settings) + { + if (!capnp_type.isStruct()) + throwCannotConvert(data_type, column_name, capnp_type); + + auto struct_schema = capnp_type.asStruct(); + + if (checkIfStructIsNamedUnion(struct_schema) || checkIfStructContainsUnnamedUnion(struct_schema)) + throw Exception( + ErrorCodes::CAPN_PROTO_BAD_CAST, + "Cannot convert ClickHouse column \"{}\" with type {} to CapnProto named union/struct with unnamed union {}", + column_name, + data_type->getName(), + getCapnProtoFullTypeName(capnp_type)); + + const auto * tuple_data_type = assert_cast(data_type.get()); + auto nested_types = tuple_data_type->getElements(); + Names nested_names; + bool have_explicit_names = tuple_data_type->haveExplicitNames(); + auto structure_fields = struct_schema.getFields(); + if (!have_explicit_names) + { + if (nested_types.size() != structure_fields.size()) + throw Exception( + ErrorCodes::CAPN_PROTO_BAD_CAST, + "Cannot convert ClickHouse column \"{}\" with type {} to CapnProto type {}: Tuple and Struct have different sizes {} != {}", + column_name, + data_type->getName(), + getCapnProtoFullTypeName(capnp_type), + nested_types.size(), + structure_fields.size()); + nested_names.reserve(structure_fields.size()); + for (auto field : structure_fields) + nested_names.push_back(field.getProto().getName()); + } + else + { + nested_names = tuple_data_type->getElementNames(); + } + + try + { + initialize(nested_types, nested_names, struct_schema, settings); + } + catch (Exception & e) + { + e.addMessage("(while converting column {})", column_name); + throw e; + } + } + + std::optional writeRow(const ColumnPtr & column, FieldBuilder * builder, size_t row_num) override + { + assert(builder); + auto & struct_builder = assert_cast(*builder); + if (auto tuple_column = typeid_cast(column.get())) + writeRow(tuple_column->getColumnsCopy(), struct_builder, row_num); + else + writeRow(Columns{column}, struct_builder, row_num); + return std::nullopt; + } + + void writeRow(const Columns & columns, StructBuilder & struct_builder, size_t row_num) + { + for (size_t i = 0; i != columns.size(); ++i) + { + const auto & field = fields[i]; + size_t field_index = field.getIndex(); + if (likely(!struct_builder.field_builders[field_index])) + struct_builder.field_builders[field_index] = initStructFieldBuilderIfNeeded( + columns[i], row_num, struct_builder.impl, field, fields_types[i], nested_field_sizes[i]); + + auto value = field_serializers[i]->writeRow(columns[i], struct_builder.field_builders[field_index].get(), row_num); + if (value) + struct_builder.impl.set(field, *value); + } + } + + void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + { + auto struct_reader = reader.as(); + if (auto * tuple_column = typeid_cast(&column)) + { + for (size_t i = 0; i != tuple_column->tupleSize(); ++i) + field_serializers[i]->readRow(tuple_column->getColumn(i), struct_reader.get(fields[i])); + } + else + field_serializers[0]->readRow(column, struct_reader.get(fields[0])); + } + + void readRow(MutableColumns & columns, const capnp::DynamicStruct::Reader & reader) + { + for (size_t i = 0; i != columns.size(); ++i) + field_serializers[i]->readRow(*columns[i], reader.get(fields[i])); + } + + private: + void initialize(const DataTypes & data_types, const Names & names, const capnp::StructSchema & schema, const FormatSettings::CapnProto & settings) + { + field_serializers.reserve(data_types.size()); + fields.reserve(data_types.size()); + fields_types.reserve(data_types.size()); + nested_field_sizes.reserve(data_types.size()); + for (size_t i = 0; i != data_types.size(); ++i) + { + auto [field_name, _] = splitFieldName(names[i]); + auto field = findFieldByName(schema, field_name); + if (!field) + throw Exception(ErrorCodes::THERE_IS_NO_COLUMN, "Capnproto schema doesn't contain field with name {}", field_name); + + fields.push_back(*field); + auto capnp_type = field->getType(); + fields_types.push_back(capnp_type); + nested_field_sizes.push_back(capnp_type.isStruct() ? capnp_type.asStruct().getFields().size() : 0); + field_serializers.push_back(createSerializer(data_types[i], names[i], capnp_type, settings)); + } + } + + std::vector> field_serializers; + std::vector fields; + std::vector nested_field_sizes; + std::vector fields_types; + }; + + std::unique_ptr createSerializer(const DataTypePtr & type, const String & name, const capnp::Type & capnp_type, const FormatSettings::CapnProto & settings) + { + auto [field_name, nested_name] = splitFieldName(name); + if (!nested_name.empty() && !capnp_type.isList()) + { + if (!capnp_type.isStruct()) + throw Exception(ErrorCodes::CAPN_PROTO_BAD_CAST, "Field {} is not a struct", field_name); + + return std::make_unique(DataTypes{type}, Names{nested_name}, capnp_type.asStruct(), settings); + } + + switch (type->getTypeId()) + { + case TypeIndex::Int8: + return createIntegerSerializer(type, name, capnp_type); + case TypeIndex::UInt8: + if (isBool(type)) + return createIntegerSerializer(type, name, capnp_type); + return createIntegerSerializer(type, name, capnp_type); + case TypeIndex::Int16: + return createIntegerSerializer(type, name, capnp_type); + case TypeIndex::UInt16: + return createIntegerSerializer(type, name, capnp_type); + case TypeIndex::Int32: + return createIntegerSerializer(type, name, capnp_type); + case TypeIndex::UInt32: + return createIntegerSerializer(type, name, capnp_type); + case TypeIndex::Int64: + return createIntegerSerializer(type, name, capnp_type); + case TypeIndex::UInt64: + return createIntegerSerializer(type, name, capnp_type); + case TypeIndex::Int128: + return std::make_unique>(type, name, capnp_type); + case TypeIndex::UInt128: + return std::make_unique>(type, name, capnp_type); + case TypeIndex::Int256: + return std::make_unique>(type, name, capnp_type); + case TypeIndex::UInt256: + return std::make_unique>(type, name, capnp_type); + case TypeIndex::Float32: + return std::make_unique>(type, name, capnp_type); + case TypeIndex::Float64: + return std::make_unique>(type, name, capnp_type); + case TypeIndex::Date: + return std::make_unique(type, name, capnp_type); + case TypeIndex::Date32: + return std::make_unique(type, name, capnp_type); + case TypeIndex::DateTime: + return std::make_unique(type, name, capnp_type); + case TypeIndex::DateTime64: + return std::make_unique(type, name, capnp_type); + case TypeIndex::Decimal32: + return std::make_unique>(type, name, capnp_type); + case TypeIndex::Decimal64: + return std::make_unique>(type, name, capnp_type); + case TypeIndex::Decimal128: + return std::make_unique>(type, name, capnp_type); + case TypeIndex::Decimal256: + return std::make_unique>(type, name, capnp_type); + case TypeIndex::IPv4: + return std::make_unique(type, name, capnp_type); + case TypeIndex::IPv6: + return std::make_unique(type, name, capnp_type); + case TypeIndex::UUID: + return std::make_unique(type, name, capnp_type); + case TypeIndex::Enum8: + return std::make_unique>(type, name, capnp_type, settings.enum_comparing_mode); + case TypeIndex::Enum16: + return std::make_unique>(type, name, capnp_type, settings.enum_comparing_mode); + case TypeIndex::String: + if (capnp_type.isData()) + return std::make_unique>(type, name, capnp_type); + return std::make_unique>(type, name, capnp_type); + case TypeIndex::FixedString: + if (capnp_type.isData()) + return std::make_unique>(type, name, capnp_type); + return std::make_unique>(type, name, capnp_type); + case TypeIndex::LowCardinality: + return std::make_unique(type, name, capnp_type, settings); + case TypeIndex::Nullable: + return std::make_unique(type, name, capnp_type, settings); + case TypeIndex::Array: + return std::make_unique(type, name, capnp_type, settings); + case TypeIndex::Map: + return std::make_unique(type, name, capnp_type, settings); + case TypeIndex::Tuple: + return std::make_unique(type, name, capnp_type, settings); + default: + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Type {} is not supported in CapnProto format", type->getName()); + } + } +} + +class CapnProtoSerializer::Impl +{ +public: + Impl(const DataTypes & data_types, const Names & names, const capnp::StructSchema & schema, const FormatSettings::CapnProto & settings) + : struct_serializer(std::make_unique(data_types, names, schema, settings)) + , fields_size(schema.getFields().size()) + { + } + + void writeRow(const Columns & columns, capnp::DynamicStruct::Builder builder, size_t row_num) + { + StructBuilder struct_builder(std::move(builder), fields_size); + struct_serializer->writeRow(columns, struct_builder, row_num); + } + + void readRow(MutableColumns & columns, capnp::DynamicStruct::Reader & reader) + { + struct_serializer->readRow(columns, reader); + } + +private: + std::unique_ptr struct_serializer; + size_t fields_size; +}; + +CapnProtoSerializer::CapnProtoSerializer(const DataTypes & data_types, const Names & names, const capnp::StructSchema & schema, const FormatSettings::CapnProto & settings) + : serializer_impl(std::make_unique(data_types, names, schema, settings)) +{ +} + +void CapnProtoSerializer::writeRow(const Columns & columns, capnp::DynamicStruct::Builder builder, size_t row_num) +{ + serializer_impl->writeRow(columns, std::move(builder), row_num); +} + +void CapnProtoSerializer::readRow(MutableColumns & columns, capnp::DynamicStruct::Reader & reader) +{ + serializer_impl->readRow(columns, reader); +} + +CapnProtoSerializer::~CapnProtoSerializer() = default; + +} diff --git a/src/Formats/CapnProtoSerializer.h b/src/Formats/CapnProtoSerializer.h new file mode 100644 index 00000000000..efae797875b --- /dev/null +++ b/src/Formats/CapnProtoSerializer.h @@ -0,0 +1,25 @@ +#pragma once + +#include +#include + +namespace DB +{ + +class CapnProtoSerializer +{ +public: + CapnProtoSerializer(const DataTypes & data_types, const Names & names, const capnp::StructSchema & schema, const FormatSettings::CapnProto & settings); + + void writeRow(const Columns & columns, capnp::DynamicStruct::Builder builder, size_t row_num); + + void readRow(MutableColumns & columns, capnp::DynamicStruct::Reader & reader); + + ~CapnProtoSerializer(); + +private: + class Impl; + std::unique_ptr serializer_impl; +}; + +} diff --git a/src/Formats/CapnProtoUtils.cpp b/src/Formats/CapnProtoUtils.cpp deleted file mode 100644 index d6c032408bb..00000000000 --- a/src/Formats/CapnProtoUtils.cpp +++ /dev/null @@ -1,734 +0,0 @@ -#include - -#if USE_CAPNP - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int CANNOT_PARSE_CAPN_PROTO_SCHEMA; - extern const int THERE_IS_NO_COLUMN; - extern const int BAD_TYPE_OF_FIELD; - extern const int CAPN_PROTO_BAD_CAST; - extern const int FILE_DOESNT_EXIST; - extern const int UNKNOWN_EXCEPTION; - extern const int INCORRECT_DATA; - extern const int CAPN_PROTO_BAD_TYPE; - extern const int BAD_ARGUMENTS; -} - -std::pair splitCapnProtoFieldName(const String & name) -{ - const auto * begin = name.data(); - const auto * end = name.data() + name.size(); - const auto * it = find_first_symbols<'_', '.'>(begin, end); - String first = String(begin, it); - String second = it == end ? "" : String(it + 1, end); - return {first, second}; -} - -capnp::StructSchema CapnProtoSchemaParser::getMessageSchema(const FormatSchemaInfo & schema_info) -{ - capnp::ParsedSchema schema; - try - { - int fd; - KJ_SYSCALL(fd = open(schema_info.schemaDirectory().data(), O_RDONLY)); // NOLINT(bugprone-suspicious-semicolon) - auto schema_dir = kj::newDiskDirectory(kj::OsFileHandle(fd)); - schema = impl.parseFromDirectory(*schema_dir, kj::Path::parse(schema_info.schemaPath()), {}); - } - catch (const kj::Exception & e) - { - /// That's not good to determine the type of error by its description, but - /// this is the only way to do it here, because kj doesn't specify the type of error. - auto description = std::string_view(e.getDescription().cStr()); - if (description.find("No such file or directory") != String::npos || description.find("no such directory") != String::npos) - throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "Cannot open CapnProto schema, file {} doesn't exists", schema_info.absoluteSchemaPath()); - - if (description.find("Parse error") != String::npos) - throw Exception(ErrorCodes::CANNOT_PARSE_CAPN_PROTO_SCHEMA, "Cannot parse CapnProto schema {}:{}", schema_info.schemaPath(), e.getLine()); - - throw Exception(ErrorCodes::UNKNOWN_EXCEPTION, - "Unknown exception while parsing CapnProto schema: {}, schema dir and file: {}, {}", - description, schema_info.schemaDirectory(), schema_info.schemaPath()); - } - - auto message_maybe = schema.findNested(schema_info.messageName()); - auto * message_schema = kj::_::readMaybe(message_maybe); - if (!message_schema) - throw Exception(ErrorCodes::CANNOT_PARSE_CAPN_PROTO_SCHEMA, - "CapnProto schema doesn't contain message with name {}", schema_info.messageName()); - return message_schema->asStruct(); -} - -bool compareEnumNames(const String & first, const String & second, FormatSettings::EnumComparingMode mode) -{ - if (mode == FormatSettings::EnumComparingMode::BY_NAMES_CASE_INSENSITIVE) - return boost::algorithm::to_lower_copy(first) == boost::algorithm::to_lower_copy(second); - return first == second; -} - -static const std::map capnp_simple_type_names = -{ - {capnp::schema::Type::Which::BOOL, "Bool"}, - {capnp::schema::Type::Which::VOID, "Void"}, - {capnp::schema::Type::Which::INT8, "Int8"}, - {capnp::schema::Type::Which::INT16, "Int16"}, - {capnp::schema::Type::Which::INT32, "Int32"}, - {capnp::schema::Type::Which::INT64, "Int64"}, - {capnp::schema::Type::Which::UINT8, "UInt8"}, - {capnp::schema::Type::Which::UINT16, "UInt16"}, - {capnp::schema::Type::Which::UINT32, "UInt32"}, - {capnp::schema::Type::Which::UINT64, "UInt64"}, - {capnp::schema::Type::Which::FLOAT32, "Float32"}, - {capnp::schema::Type::Which::FLOAT64, "Float64"}, - {capnp::schema::Type::Which::TEXT, "Text"}, - {capnp::schema::Type::Which::DATA, "Data"}, - {capnp::schema::Type::Which::INTERFACE, "Interface"}, - {capnp::schema::Type::Which::ANY_POINTER, "AnyPointer"}, -}; - -static bool checkIfStructContainsUnnamedUnion(const capnp::StructSchema & struct_schema) -{ - return struct_schema.getFields().size() != struct_schema.getNonUnionFields().size(); -} - -static bool checkIfStructIsNamedUnion(const capnp::StructSchema & struct_schema) -{ - return struct_schema.getFields().size() == struct_schema.getUnionFields().size(); -} - -/// Get full name of type for better exception messages. -static String getCapnProtoFullTypeName(const capnp::Type & type) -{ - switch (type.which()) - { - case capnp::schema::Type::Which::STRUCT: - { - auto struct_schema = type.asStruct(); - - auto non_union_fields = struct_schema.getNonUnionFields(); - std::vector non_union_field_names; - for (auto nested_field : non_union_fields) - non_union_field_names.push_back(String(nested_field.getProto().getName()) + " " + getCapnProtoFullTypeName(nested_field.getType())); - - auto union_fields = struct_schema.getUnionFields(); - std::vector union_field_names; - for (auto nested_field : union_fields) - union_field_names.push_back(String(nested_field.getProto().getName()) + " " + getCapnProtoFullTypeName(nested_field.getType())); - - String union_name = "Union(" + boost::algorithm::join(union_field_names, ", ") + ")"; - /// Check if the struct is a named union. - if (non_union_field_names.empty()) - return union_name; - - String type_name = "Struct(" + boost::algorithm::join(non_union_field_names, ", "); - /// Check if the struct contains unnamed union. - if (!union_field_names.empty()) - type_name += ", " + union_name; - type_name += ")"; - return type_name; - } - case capnp::schema::Type::Which::LIST: - return "List(" + getCapnProtoFullTypeName(type.asList().getElementType()) + ")"; - case capnp::schema::Type::Which::ENUM: - { - auto enum_schema = type.asEnum(); - String enum_name = "Enum("; - auto enumerants = enum_schema.getEnumerants(); - for (unsigned i = 0; i != enumerants.size(); ++i) - { - enum_name += String(enumerants[i].getProto().getName()) + " = " + std::to_string(enumerants[i].getOrdinal()); - if (i + 1 != enumerants.size()) - enum_name += ", "; - } - enum_name += ")"; - return enum_name; - } - default: - auto it = capnp_simple_type_names.find(type.which()); - if (it == capnp_simple_type_names.end()) - throw Exception(ErrorCodes::BAD_TYPE_OF_FIELD, "Unknown CapnProto type"); - return it->second; - } -} - -template -static bool checkEnums(const capnp::Type & capnp_type, const DataTypePtr column_type, FormatSettings::EnumComparingMode mode, UInt64 max_value, String & error_message) -{ - if (!capnp_type.isEnum()) - return false; - - auto enum_schema = capnp_type.asEnum(); - bool to_lower = mode == FormatSettings::EnumComparingMode::BY_NAMES_CASE_INSENSITIVE; - const auto * enum_type = assert_cast *>(column_type.get()); - const auto & enum_values = dynamic_cast &>(*enum_type); - - auto enumerants = enum_schema.getEnumerants(); - if (mode == FormatSettings::EnumComparingMode::BY_VALUES) - { - /// In CapnProto Enum fields are numbered sequentially starting from zero. - if (enumerants.size() > max_value) - { - error_message += "Enum from CapnProto schema contains values that is out of range for Clickhouse Enum"; - return false; - } - - auto values = enum_values.getSetOfAllValues(); - std::unordered_set capn_enum_values; - for (auto enumerant : enumerants) - capn_enum_values.insert(Type(enumerant.getOrdinal())); - auto result = values == capn_enum_values; - if (!result) - error_message += "The set of values in Enum from CapnProto schema is different from the set of values in ClickHouse Enum"; - return result; - } - - auto names = enum_values.getSetOfAllNames(to_lower); - std::unordered_set capn_enum_names; - - for (auto enumerant : enumerants) - { - String name = enumerant.getProto().getName(); - capn_enum_names.insert(to_lower ? boost::algorithm::to_lower_copy(name) : name); - } - - auto result = names == capn_enum_names; - if (!result) - error_message += "The set of names in Enum from CapnProto schema is different from the set of names in ClickHouse Enum"; - return result; -} - -static bool checkCapnProtoType(const capnp::Type & capnp_type, const DataTypePtr & data_type, FormatSettings::EnumComparingMode mode, String & error_message, const String & column_name); - -static bool checkNullableType(const capnp::Type & capnp_type, const DataTypePtr & data_type, FormatSettings::EnumComparingMode mode, String & error_message, const String & column_name) -{ - if (!capnp_type.isStruct()) - return false; - - /// Check that struct is a named union of type VOID and one arbitrary type. - auto struct_schema = capnp_type.asStruct(); - if (!checkIfStructIsNamedUnion(struct_schema)) - return false; - - auto union_fields = struct_schema.getUnionFields(); - if (union_fields.size() != 2) - return false; - - auto first = union_fields[0]; - auto second = union_fields[1]; - - auto nested_type = assert_cast(data_type.get())->getNestedType(); - if (first.getType().isVoid()) - return checkCapnProtoType(second.getType(), nested_type, mode, error_message, column_name); - if (second.getType().isVoid()) - return checkCapnProtoType(first.getType(), nested_type, mode, error_message, column_name); - return false; -} - -static bool checkTupleType(const capnp::Type & capnp_type, const DataTypePtr & data_type, FormatSettings::EnumComparingMode mode, String & error_message) -{ - if (!capnp_type.isStruct()) - return false; - auto struct_schema = capnp_type.asStruct(); - - if (checkIfStructIsNamedUnion(struct_schema)) - return false; - - if (checkIfStructContainsUnnamedUnion(struct_schema)) - { - error_message += "CapnProto struct contains unnamed union"; - return false; - } - - const auto * tuple_data_type = assert_cast(data_type.get()); - auto nested_types = tuple_data_type->getElements(); - if (nested_types.size() != struct_schema.getFields().size()) - { - error_message += "Tuple and Struct types have different sizes"; - return false; - } - - bool have_explicit_names = tuple_data_type->haveExplicitNames(); - const auto & nested_names = tuple_data_type->getElementNames(); - for (uint32_t i = 0; i != nested_names.size(); ++i) - { - if (have_explicit_names) - { - KJ_IF_MAYBE (field, struct_schema.findFieldByName(nested_names[i])) - { - if (!checkCapnProtoType(field->getType(), nested_types[tuple_data_type->getPositionByName(nested_names[i])], mode, error_message, nested_names[i])) - return false; - } - else - { - error_message += "CapnProto struct doesn't contain a field with name " + nested_names[i]; - return false; - } - } - else if (!checkCapnProtoType(struct_schema.getFields()[i].getType(), nested_types[tuple_data_type->getPositionByName(nested_names[i])], mode, error_message, nested_names[i])) - return false; - } - - return true; -} - -static bool checkArrayType(const capnp::Type & capnp_type, const DataTypePtr & data_type, FormatSettings::EnumComparingMode mode, String & error_message, const String & column_name) -{ - if (!capnp_type.isList()) - return false; - auto list_schema = capnp_type.asList(); - auto nested_type = assert_cast(data_type.get())->getNestedType(); - - auto [field_name, nested_name] = splitCapnProtoFieldName(column_name); - if (!nested_name.empty() && list_schema.getElementType().isStruct()) - { - auto struct_schema = list_schema.getElementType().asStruct(); - KJ_IF_MAYBE(field, struct_schema.findFieldByName(nested_name)) - return checkCapnProtoType(field->getType(), nested_type, mode, error_message, nested_name); - - error_message += "Element type of List {} doesn't contain field with name " + nested_name; - return false; - } - - return checkCapnProtoType(list_schema.getElementType(), nested_type, mode, error_message, column_name); -} - -static bool checkMapType(const capnp::Type & capnp_type, const DataTypePtr & data_type, FormatSettings::EnumComparingMode mode, String & error_message) -{ - /// We output/input Map type as follow CapnProto schema - /// - /// struct Map { - /// struct Entry { - /// key @0: Key; - /// value @1: Value; - /// } - /// entries @0 :List(Entry); - /// } - - if (!capnp_type.isStruct()) - return false; - auto struct_schema = capnp_type.asStruct(); - - if (checkIfStructContainsUnnamedUnion(struct_schema)) - { - error_message += "CapnProto struct contains unnamed union"; - return false; - } - - if (struct_schema.getFields().size() != 1) - { - error_message += "CapnProto struct that represents Map type can contain only one field"; - return false; - } - - const auto & field_type = struct_schema.getFields()[0].getType(); - if (!field_type.isList()) - { - error_message += "Field of CapnProto struct that represents Map is not a list"; - return false; - } - - auto list_element_type = field_type.asList().getElementType(); - if (!list_element_type.isStruct()) - { - error_message += "Field of CapnProto struct that represents Map is not a list of structs"; - return false; - } - - auto key_value_struct = list_element_type.asStruct(); - if (checkIfStructContainsUnnamedUnion(key_value_struct)) - { - error_message += "CapnProto struct contains unnamed union"; - return false; - } - - if (key_value_struct.getFields().size() != 2) - { - error_message += "Key-value structure for Map struct should have exactly 2 fields"; - return false; - } - - const auto & map_type = assert_cast(*data_type); - DataTypes types = {map_type.getKeyType(), map_type.getValueType()}; - Names names = {"key", "value"}; - - for (size_t i = 0; i != types.size(); ++i) - { - KJ_IF_MAYBE(field, key_value_struct.findFieldByName(names[i])) - { - if (!checkCapnProtoType(field->getType(), types[i], mode, error_message, names[i])) - return false; - } - else - { - error_message += R"(Key-value structure for Map struct should have exactly 2 fields with names "key" and "value")"; - return false; - } - } - - return true; -} - -static bool isCapnInteger(const capnp::Type & capnp_type) -{ - return capnp_type.isInt8() || capnp_type.isUInt8() || capnp_type.isInt16() || capnp_type.isUInt16() || capnp_type.isInt32() - || capnp_type.isUInt32() || capnp_type.isInt64() || capnp_type.isUInt64(); -} - -static bool checkCapnProtoType(const capnp::Type & capnp_type, const DataTypePtr & data_type, FormatSettings::EnumComparingMode mode, String & error_message, const String & column_name) -{ - switch (data_type->getTypeId()) - { - case TypeIndex::UInt8: - return capnp_type.isBool() || isCapnInteger(capnp_type); - case TypeIndex::Int8: [[fallthrough]]; - case TypeIndex::Int16: [[fallthrough]]; - case TypeIndex::UInt16: [[fallthrough]]; - case TypeIndex::Int32: [[fallthrough]]; - case TypeIndex::UInt32: [[fallthrough]]; - case TypeIndex::Int64: [[fallthrough]]; - case TypeIndex::UInt64: - /// Allow integer conversions durin input/output. - return isCapnInteger(capnp_type); - case TypeIndex::Date: - return capnp_type.isUInt16(); - case TypeIndex::DateTime: [[fallthrough]]; - case TypeIndex::IPv4: - return capnp_type.isUInt32(); - case TypeIndex::Date32: [[fallthrough]]; - case TypeIndex::Decimal32: - return capnp_type.isInt32() || capnp_type.isUInt32(); - case TypeIndex::DateTime64: [[fallthrough]]; - case TypeIndex::Decimal64: - return capnp_type.isInt64() || capnp_type.isUInt64(); - case TypeIndex::Float32:[[fallthrough]]; - case TypeIndex::Float64: - /// Allow converting between Float32 and isFloat64 - return capnp_type.isFloat32() || capnp_type.isFloat64(); - case TypeIndex::Enum8: - return checkEnums(capnp_type, data_type, mode, INT8_MAX, error_message); - case TypeIndex::Enum16: - return checkEnums(capnp_type, data_type, mode, INT16_MAX, error_message); - case TypeIndex::Int128: [[fallthrough]]; - case TypeIndex::UInt128: [[fallthrough]]; - case TypeIndex::Int256: [[fallthrough]]; - case TypeIndex::UInt256: [[fallthrough]]; - case TypeIndex::Decimal128: [[fallthrough]]; - case TypeIndex::Decimal256: - return capnp_type.isData(); - case TypeIndex::Tuple: - return checkTupleType(capnp_type, data_type, mode, error_message); - case TypeIndex::Nullable: - { - auto result = checkNullableType(capnp_type, data_type, mode, error_message, column_name); - if (!result) - error_message += "Nullable can be represented only as a named union of type Void and nested type"; - return result; - } - case TypeIndex::Array: - return checkArrayType(capnp_type, data_type, mode, error_message, column_name); - case TypeIndex::LowCardinality: - return checkCapnProtoType(capnp_type, assert_cast(data_type.get())->getDictionaryType(), mode, error_message, column_name); - case TypeIndex::FixedString: [[fallthrough]]; - case TypeIndex::IPv6: [[fallthrough]]; - case TypeIndex::String: - return capnp_type.isText() || capnp_type.isData(); - case TypeIndex::Map: - return checkMapType(capnp_type, data_type, mode, error_message); - default: - return false; - } -} - -capnp::DynamicValue::Reader getReaderByColumnName(const capnp::DynamicStruct::Reader & struct_reader, const String & name) -{ - auto [field_name, nested_name] = splitCapnProtoFieldName(name); - KJ_IF_MAYBE(field, struct_reader.getSchema().findFieldByName(field_name)) - { - capnp::DynamicValue::Reader field_reader; - try - { - field_reader = struct_reader.get(*field); - } - catch (const kj::Exception & e) - { - throw Exception(ErrorCodes::INCORRECT_DATA, - "Cannot extract field value from struct by provided schema, error: " - "{} Perhaps the data was generated by another schema", String(e.getDescription().cStr())); - } - - if (nested_name.empty()) - return field_reader; - - /// Support reading Nested as List of Structs. - if (field_reader.getType() == capnp::DynamicValue::LIST) - { - auto list_schema = field->getType().asList(); - if (!list_schema.getElementType().isStruct()) - throw Exception(ErrorCodes::CAPN_PROTO_BAD_CAST, "Element type of List {} is not a struct", field_name); - - auto struct_schema = list_schema.getElementType().asStruct(); - KJ_IF_MAYBE(nested_field, struct_schema.findFieldByName(nested_name)) - return field_reader; - - throw Exception(ErrorCodes::THERE_IS_NO_COLUMN, "Element type of List {} doesn't contain field with name \"{}\"", field_name, nested_name); - } - - if (field_reader.getType() != capnp::DynamicValue::STRUCT) - throw Exception(ErrorCodes::CAPN_PROTO_BAD_CAST, "Field {} is not a struct", field_name); - - return getReaderByColumnName(field_reader.as(), nested_name); - } - - throw Exception(ErrorCodes::THERE_IS_NO_COLUMN, "Capnproto struct doesn't contain field with name {}", field_name); -} - -std::pair getStructBuilderAndFieldByColumnName(capnp::DynamicStruct::Builder struct_builder, const String & name) -{ - auto [field_name, nested_name] = splitCapnProtoFieldName(name); - KJ_IF_MAYBE(field, struct_builder.getSchema().findFieldByName(field_name)) - { - if (nested_name.empty()) - return {struct_builder, *field}; - - auto field_builder = struct_builder.get(*field); - - /// Support reading Nested as List of Structs. - if (field_builder.getType() == capnp::DynamicValue::LIST) - { - auto list_schema = field->getType().asList(); - if (!list_schema.getElementType().isStruct()) - throw Exception(ErrorCodes::CAPN_PROTO_BAD_CAST, "Element type of List {} is not a struct", field_name); - - auto struct_schema = list_schema.getElementType().asStruct(); - KJ_IF_MAYBE(nested_field, struct_schema.findFieldByName(nested_name)) - return {struct_builder, *field}; - - throw Exception(ErrorCodes::THERE_IS_NO_COLUMN, "Element type of List {} doesn't contain field with name \"{}\"", field_name, nested_name); - } - - if (field_builder.getType() != capnp::DynamicValue::STRUCT) - throw Exception(ErrorCodes::CAPN_PROTO_BAD_CAST, "Field {} is not a struct", field_name); - - return getStructBuilderAndFieldByColumnName(field_builder.as(), nested_name); - } - - throw Exception(ErrorCodes::THERE_IS_NO_COLUMN, "Capnproto struct doesn't contain field with name {}", field_name); -} - -static std::pair getFieldByName(const capnp::StructSchema & schema, const String & name) -{ - auto [field_name, nested_name] = splitCapnProtoFieldName(name); - KJ_IF_MAYBE(field, schema.findFieldByName(field_name)) - { - if (nested_name.empty()) - return {*field, name}; - - /// Support reading Nested as List of Structs. - if (field->getType().isList()) - { - auto list_schema = field->getType().asList(); - if (!list_schema.getElementType().isStruct()) - throw Exception(ErrorCodes::CAPN_PROTO_BAD_CAST, "Element type of List {} is not a struct", field_name); - - auto struct_schema = list_schema.getElementType().asStruct(); - KJ_IF_MAYBE(nested_field, struct_schema.findFieldByName(nested_name)) - return {*field, name}; - - throw Exception(ErrorCodes::THERE_IS_NO_COLUMN, "Element type of List {} doesn't contain field with name \"{}\"", field_name, nested_name); - } - - if (!field->getType().isStruct()) - throw Exception(ErrorCodes::CAPN_PROTO_BAD_CAST, "Field {} is not a struct", field_name); - - return getFieldByName(field->getType().asStruct(), nested_name); - } - - throw Exception(ErrorCodes::THERE_IS_NO_COLUMN, "Capnproto schema doesn't contain field with name {}", field_name); -} - -void checkCapnProtoSchemaStructure(const capnp::StructSchema & schema, const Block & header, FormatSettings::EnumComparingMode mode) -{ - /// Firstly check that struct doesn't contain unnamed union, because we don't support it. - if (checkIfStructContainsUnnamedUnion(schema)) - throw Exception(ErrorCodes::CAPN_PROTO_BAD_CAST, "Schema contains unnamed union that is not supported"); - auto names_and_types = header.getNamesAndTypesList(); - String additional_error_message; - for (auto & [name, type] : names_and_types) - { - auto [field, field_name] = getFieldByName(schema, name); - if (!checkCapnProtoType(field.getType(), type, mode, additional_error_message, field_name)) - { - auto e = Exception( - ErrorCodes::CAPN_PROTO_BAD_CAST, - "Cannot convert ClickHouse type {} to CapnProto type {}", - type->getName(), - getCapnProtoFullTypeName(field.getType())); - if (!additional_error_message.empty()) - e.addMessage(additional_error_message); - throw std::move(e); - } - } -} - -template -static DataTypePtr getEnumDataTypeFromEnumerants(const capnp::EnumSchema::EnumerantList & enumerants) -{ - std::vector> values; - for (auto enumerant : enumerants) - values.emplace_back(enumerant.getProto().getName(), ValueType(enumerant.getOrdinal())); - return std::make_shared>(std::move(values)); -} - -static DataTypePtr getEnumDataTypeFromEnumSchema(const capnp::EnumSchema & enum_schema) -{ - auto enumerants = enum_schema.getEnumerants(); - if (enumerants.size() < 128) - return getEnumDataTypeFromEnumerants(enumerants); - if (enumerants.size() < 32768) - return getEnumDataTypeFromEnumerants(enumerants); - - throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "ClickHouse supports only 8 and 16-bit Enums"); -} - -static DataTypePtr getDataTypeFromCapnProtoType(const capnp::Type & capnp_type, bool skip_unsupported_fields) -{ - switch (capnp_type.which()) - { - case capnp::schema::Type::INT8: - return std::make_shared(); - case capnp::schema::Type::INT16: - return std::make_shared(); - case capnp::schema::Type::INT32: - return std::make_shared(); - case capnp::schema::Type::INT64: - return std::make_shared(); - case capnp::schema::Type::BOOL: [[fallthrough]]; - case capnp::schema::Type::UINT8: - return std::make_shared(); - case capnp::schema::Type::UINT16: - return std::make_shared(); - case capnp::schema::Type::UINT32: - return std::make_shared(); - case capnp::schema::Type::UINT64: - return std::make_shared(); - case capnp::schema::Type::FLOAT32: - return std::make_shared(); - case capnp::schema::Type::FLOAT64: - return std::make_shared(); - case capnp::schema::Type::DATA: [[fallthrough]]; - case capnp::schema::Type::TEXT: - return std::make_shared(); - case capnp::schema::Type::ENUM: - return getEnumDataTypeFromEnumSchema(capnp_type.asEnum()); - case capnp::schema::Type::LIST: - { - auto list_schema = capnp_type.asList(); - auto nested_type = getDataTypeFromCapnProtoType(list_schema.getElementType(), skip_unsupported_fields); - if (!nested_type) - return nullptr; - return std::make_shared(nested_type); - } - case capnp::schema::Type::STRUCT: - { - auto struct_schema = capnp_type.asStruct(); - - - if (struct_schema.getFields().size() == 0) - { - if (skip_unsupported_fields) - return nullptr; - throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Empty messages are not supported"); - } - - /// Check if it can be Nullable. - if (checkIfStructIsNamedUnion(struct_schema)) - { - auto fields = struct_schema.getUnionFields(); - if (fields.size() != 2 || (!fields[0].getType().isVoid() && !fields[1].getType().isVoid())) - { - if (skip_unsupported_fields) - return nullptr; - throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Unions are not supported"); - } - auto value_type = fields[0].getType().isVoid() ? fields[1].getType() : fields[0].getType(); - if (value_type.isStruct() || value_type.isList()) - { - if (skip_unsupported_fields) - return nullptr; - throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Tuples and Lists cannot be inside Nullable"); - } - - auto nested_type = getDataTypeFromCapnProtoType(value_type, skip_unsupported_fields); - if (!nested_type) - return nullptr; - return std::make_shared(nested_type); - } - - if (checkIfStructContainsUnnamedUnion(struct_schema)) - throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Unnamed union is not supported"); - - /// Treat Struct as Tuple. - DataTypes nested_types; - Names nested_names; - for (auto field : struct_schema.getNonUnionFields()) - { - auto nested_type = getDataTypeFromCapnProtoType(field.getType(), skip_unsupported_fields); - if (!nested_type) - continue; - nested_names.push_back(field.getProto().getName()); - nested_types.push_back(nested_type); - } - if (nested_types.empty()) - return nullptr; - return std::make_shared(std::move(nested_types), std::move(nested_names)); - } - default: - { - if (skip_unsupported_fields) - return nullptr; - throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Unsupported CapnProtoType: {}", getCapnProtoFullTypeName(capnp_type)); - } - } -} - -NamesAndTypesList capnProtoSchemaToCHSchema(const capnp::StructSchema & schema, bool skip_unsupported_fields) -{ - if (checkIfStructContainsUnnamedUnion(schema)) - throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Unnamed union is not supported"); - - NamesAndTypesList names_and_types; - for (auto field : schema.getNonUnionFields()) - { - auto name = field.getProto().getName(); - auto type = getDataTypeFromCapnProtoType(field.getType(), skip_unsupported_fields); - if (type) - names_and_types.emplace_back(name, type); - } - if (names_and_types.empty()) - throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Cannot convert CapnProto schema to ClickHouse table schema, all fields have unsupported types"); - - return names_and_types; -} - -} - -#endif diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index 3a2e818d540..384c6a725dc 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -325,16 +325,16 @@ struct FormatSettings /// For capnProto format we should determine how to /// compare ClickHouse Enum and Enum from schema. - enum class EnumComparingMode + enum class CapnProtoEnumComparingMode { BY_NAMES, // Names in enums should be the same, values can be different. BY_NAMES_CASE_INSENSITIVE, // Case-insensitive name comparison. BY_VALUES, // Values should be the same, names can be different. }; - struct + struct CapnProto { - EnumComparingMode enum_comparing_mode = EnumComparingMode::BY_VALUES; + CapnProtoEnumComparingMode enum_comparing_mode = CapnProtoEnumComparingMode::BY_VALUES; bool skip_fields_with_unsupported_types_in_schema_inference = false; } capn_proto; diff --git a/src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp b/src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp index 2f84e9bde3c..e686ae86997 100644 --- a/src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp @@ -9,23 +9,6 @@ #include #include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - namespace DB { @@ -35,16 +18,14 @@ namespace ErrorCodes extern const int INCORRECT_DATA; } -CapnProtoRowInputFormat::CapnProtoRowInputFormat(ReadBuffer & in_, Block header, Params params_, const FormatSchemaInfo & info, const FormatSettings & format_settings_) - : IRowInputFormat(std::move(header), in_, std::move(params_)) +CapnProtoRowInputFormat::CapnProtoRowInputFormat(ReadBuffer & in_, Block header_, Params params_, const FormatSchemaInfo & info, const FormatSettings & format_settings) + : IRowInputFormat(std::move(header_), in_, std::move(params_)) , parser(std::make_shared()) - , format_settings(format_settings_) - , column_types(getPort().getHeader().getDataTypes()) - , column_names(getPort().getHeader().getNames()) { // Parse the schema and fetch the root object - root = parser->getMessageSchema(info); - checkCapnProtoSchemaStructure(root, getPort().getHeader(), format_settings.capn_proto.enum_comparing_mode); + schema = parser->getMessageSchema(info); + const auto & header = getPort().getHeader(); + serializer = std::make_unique(header.getDataTypes(), header.getNames(), schema, format_settings.capn_proto); } kj::Array CapnProtoRowInputFormat::readMessage() @@ -82,213 +63,6 @@ kj::Array CapnProtoRowInputFormat::readMessage() return msg; } -static void insertInteger(IColumn & column, const DataTypePtr & column_type, UInt64 value) -{ - switch (column_type->getTypeId()) - { - case TypeIndex::Int8: - assert_cast(column).insertValue(value); - break; - case TypeIndex::UInt8: - assert_cast(column).insertValue(value); - break; - case TypeIndex::Int16: - assert_cast(column).insertValue(value); - break; - case TypeIndex::Date: [[fallthrough]]; - case TypeIndex::UInt16: - assert_cast(column).insertValue(value); - break; - case TypeIndex::Int32: - assert_cast(column).insertValue(static_cast(value)); - break; - case TypeIndex::DateTime: [[fallthrough]]; - case TypeIndex::UInt32: - assert_cast(column).insertValue(static_cast(value)); - break; - case TypeIndex::IPv4: - assert_cast(column).insertValue(IPv4(static_cast(value))); - break; - case TypeIndex::Int64: - assert_cast(column).insertValue(value); - break; - case TypeIndex::UInt64: - assert_cast(column).insertValue(value); - break; - case TypeIndex::DateTime64: - assert_cast &>(column).insertValue(value); - break; - case TypeIndex::Decimal32: - assert_cast &>(column).insertValue(static_cast(value)); - break; - case TypeIndex::Decimal64: - assert_cast &>(column).insertValue(value); - break; - default: - throw Exception(ErrorCodes::LOGICAL_ERROR, "Column type {} cannot be parsed from integer", column_type->getName()); - } -} - -static void insertFloat(IColumn & column, const DataTypePtr & column_type, Float64 value) -{ - switch (column_type->getTypeId()) - { - case TypeIndex::Float32: - assert_cast(column).insertValue(static_cast(value)); - break; - case TypeIndex::Float64: - assert_cast(column).insertValue(value); - break; - default: - throw Exception(ErrorCodes::LOGICAL_ERROR, "Column type is not a float."); - } -} - -template -static void insertData(IColumn & column, const DataTypePtr & column_type, Value value) -{ - if (column_type->haveMaximumSizeOfValue() && value.size() != column_type->getSizeOfValueInMemory()) - throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected size of {} value: {}", column_type->getName(), value.size()); - - column.insertData(reinterpret_cast(value.begin()), value.size()); -} - -template -static void insertEnum(IColumn & column, const DataTypePtr & column_type, const capnp::DynamicEnum & enum_value, FormatSettings::EnumComparingMode enum_comparing_mode) -{ - auto enumerant = *kj::_::readMaybe(enum_value.getEnumerant()); - auto enum_type = assert_cast *>(column_type.get()); - DataTypePtr nested_type = std::make_shared>(); - switch (enum_comparing_mode) - { - case FormatSettings::EnumComparingMode::BY_VALUES: - insertInteger(column, nested_type, Int64(enumerant.getOrdinal())); - return; - case FormatSettings::EnumComparingMode::BY_NAMES: - insertInteger(column, nested_type, Int64(enum_type->getValue(String(enumerant.getProto().getName())))); - return; - case FormatSettings::EnumComparingMode::BY_NAMES_CASE_INSENSITIVE: - { - /// Find the same enum name case insensitive. - String enum_name = enumerant.getProto().getName(); - for (auto & name : enum_type->getAllRegisteredNames()) - { - if (compareEnumNames(name, enum_name, enum_comparing_mode)) - { - insertInteger(column, nested_type, Int64(enum_type->getValue(name))); - break; - } - } - } - } -} - -static void insertValue(IColumn & column, const DataTypePtr & column_type, const String & column_name, const capnp::DynamicValue::Reader & value, FormatSettings::EnumComparingMode enum_comparing_mode) -{ - if (column_type->lowCardinality()) - { - auto & lc_column = assert_cast(column); - auto tmp_column = lc_column.getDictionary().getNestedColumn()->cloneEmpty(); - auto dict_type = assert_cast(column_type.get())->getDictionaryType(); - insertValue(*tmp_column, dict_type, column_name, value, enum_comparing_mode); - lc_column.insertFromFullColumn(*tmp_column, 0); - return; - } - - switch (value.getType()) - { - case capnp::DynamicValue::Type::INT: - insertInteger(column, column_type, value.as()); - break; - case capnp::DynamicValue::Type::UINT: - insertInteger(column, column_type, value.as()); - break; - case capnp::DynamicValue::Type::FLOAT: - insertFloat(column, column_type, value.as()); - break; - case capnp::DynamicValue::Type::BOOL: - insertInteger(column, column_type, UInt64(value.as())); - break; - case capnp::DynamicValue::Type::DATA: - insertData(column, column_type, value.as()); - break; - case capnp::DynamicValue::Type::TEXT: - insertData(column, column_type, value.as()); - break; - case capnp::DynamicValue::Type::ENUM: - if (column_type->getTypeId() == TypeIndex::Enum8) - insertEnum(column, column_type, value.as(), enum_comparing_mode); - else - insertEnum(column, column_type, value.as(), enum_comparing_mode); - break; - case capnp::DynamicValue::LIST: - { - auto list_value = value.as(); - auto & column_array = assert_cast(column); - auto & offsets = column_array.getOffsets(); - offsets.push_back(offsets.back() + list_value.size()); - - auto & nested_column = column_array.getData(); - auto nested_type = assert_cast(column_type.get())->getNestedType(); - for (const auto & nested_value : list_value) - insertValue(nested_column, nested_type, column_name, nested_value, enum_comparing_mode); - break; - } - case capnp::DynamicValue::Type::STRUCT: - { - auto struct_value = value.as(); - if (column_type->isNullable()) - { - auto & nullable_column = assert_cast(column); - auto field = *kj::_::readMaybe(struct_value.which()); - if (field.getType().isVoid()) - nullable_column.insertDefault(); - else - { - auto & nested_column = nullable_column.getNestedColumn(); - auto nested_type = assert_cast(column_type.get())->getNestedType(); - auto nested_value = struct_value.get(field); - insertValue(nested_column, nested_type, column_name, nested_value, enum_comparing_mode); - nullable_column.getNullMapData().push_back(0); - } - } - else if (isTuple(column_type)) - { - auto & tuple_column = assert_cast(column); - const auto * tuple_type = assert_cast(column_type.get()); - bool have_explicit_names = tuple_type->haveExplicitNames(); - auto struct_schema = struct_value.getSchema(); - for (uint32_t i = 0; i != tuple_column.tupleSize(); ++i) - insertValue( - tuple_column.getColumn(i), - tuple_type->getElements()[i], - tuple_type->getElementNames()[i], - struct_value.get(have_explicit_names ? struct_schema.getFieldByName(tuple_type->getElementNames()[i]) : struct_schema.getFields()[i]), - enum_comparing_mode); - } - else if (isMap(column_type)) - { - const auto & map_type = assert_cast(*column_type); - DataTypes key_value_types = {map_type.getKeyType(), map_type.getValueType()}; - Names key_value_names = {"key", "value"}; - auto entries_type = std::make_shared(std::make_shared(key_value_types, key_value_names)); - auto & entries_column = assert_cast(column).getNestedColumn(); - auto entries_field = struct_value.getSchema().getFields()[0]; - insertValue(entries_column, entries_type, column_name, struct_value.get(entries_field), enum_comparing_mode); - } - else - { - /// It can be nested column from Nested type. - auto [field_name, nested_name] = splitCapnProtoFieldName(column_name); - insertValue(column, column_type, nested_name, struct_value.get(nested_name), enum_comparing_mode); - } - break; - } - default: - throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected CapnProto value type."); - } -} - bool CapnProtoRowInputFormat::readRow(MutableColumns & columns, RowReadExtension &) { if (in->eof()) @@ -298,12 +72,8 @@ bool CapnProtoRowInputFormat::readRow(MutableColumns & columns, RowReadExtension { auto array = readMessage(); capnp::FlatArrayMessageReader msg(array); - auto root_reader = msg.getRoot(root); - for (size_t i = 0; i != columns.size(); ++i) - { - auto value = getReaderByColumnName(root_reader, column_names[i]); - insertValue(*columns[i], column_types[i], column_names[i], value, format_settings.capn_proto.enum_comparing_mode); - } + auto root_reader = msg.getRoot(schema); + serializer->readRow(columns, root_reader); } catch (const kj::Exception & e) { @@ -343,7 +113,14 @@ void registerInputFormatCapnProto(FormatFactory & factory) factory.markFormatSupportsSubsetOfColumns("CapnProto"); factory.registerFileExtension("capnp", "CapnProto"); factory.registerAdditionalInfoForSchemaCacheGetter( - "CapnProto", [](const FormatSettings & settings) { return fmt::format("format_schema={}", settings.schema.format_schema); }); + "CapnProto", + [](const FormatSettings & settings) + { + return fmt::format( + "format_schema={}, skip_fields_with_unsupported_types_in_schema_inference={}", + settings.schema.format_schema, + settings.capn_proto.skip_fields_with_unsupported_types_in_schema_inference); + }); } void registerCapnProtoSchemaReader(FormatFactory & factory) diff --git a/src/Processors/Formats/Impl/CapnProtoRowInputFormat.h b/src/Processors/Formats/Impl/CapnProtoRowInputFormat.h index cf23f22b643..06e94da123f 100644 --- a/src/Processors/Formats/Impl/CapnProtoRowInputFormat.h +++ b/src/Processors/Formats/Impl/CapnProtoRowInputFormat.h @@ -4,7 +4,8 @@ #if USE_CAPNP #include -#include +#include +#include #include #include @@ -33,10 +34,8 @@ private: kj::Array readMessage(); std::shared_ptr parser; - capnp::StructSchema root; - const FormatSettings format_settings; - DataTypes column_types; - Names column_names; + capnp::StructSchema schema; + std::unique_ptr serializer; }; class CapnProtoSchemaReader : public IExternalSchemaReader diff --git a/src/Processors/Formats/Impl/CapnProtoRowOutputFormat.cpp b/src/Processors/Formats/Impl/CapnProtoRowOutputFormat.cpp index 0225680b396..7dd18be27f4 100644 --- a/src/Processors/Formats/Impl/CapnProtoRowOutputFormat.cpp +++ b/src/Processors/Formats/Impl/CapnProtoRowOutputFormat.cpp @@ -1,28 +1,13 @@ #include #if USE_CAPNP -#include +#include #include +#include #include #include #include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - namespace DB { @@ -45,252 +30,25 @@ CapnProtoRowOutputFormat::CapnProtoRowOutputFormat( WriteBuffer & out_, const Block & header_, const FormatSchemaInfo & info, - const FormatSettings & format_settings_) - : IRowOutputFormat(header_, out_), column_names(header_.getNames()), column_types(header_.getDataTypes()), output_stream(std::make_unique(out_)), format_settings(format_settings_) + const FormatSettings & format_settings) + : IRowOutputFormat(header_, out_) + , column_names(header_.getNames()) + , column_types(header_.getDataTypes()) + , output_stream(std::make_unique(out_)) { schema = schema_parser.getMessageSchema(info); - checkCapnProtoSchemaStructure(schema, getPort(PortKind::Main).getHeader(), format_settings.capn_proto.enum_comparing_mode); -} - -template -static capnp::DynamicEnum getDynamicEnum( - const ColumnPtr & column, - const DataTypePtr & data_type, - size_t row_num, - const capnp::EnumSchema & enum_schema, - FormatSettings::EnumComparingMode mode) -{ - const auto * enum_data_type = assert_cast *>(data_type.get()); - EnumValue enum_value = column->getInt(row_num); - if (mode == FormatSettings::EnumComparingMode::BY_VALUES) - return capnp::DynamicEnum(enum_schema, enum_value); - - auto enum_name = enum_data_type->getNameForValue(enum_value); - for (const auto enumerant : enum_schema.getEnumerants()) - { - if (compareEnumNames(String(enum_name), enumerant.getProto().getName(), mode)) - return capnp::DynamicEnum(enumerant); - } - - throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot convert CLickHouse Enum value to CapnProto Enum"); -} - -static capnp::DynamicValue::Builder initStructFieldBuilder(const ColumnPtr & column, size_t row_num, capnp::DynamicStruct::Builder & struct_builder, capnp::StructSchema::Field field) -{ - if (const auto * array_column = checkAndGetColumn(*column)) - { - size_t size = array_column->getOffsets()[row_num] - array_column->getOffsets()[row_num - 1]; - return struct_builder.init(field, static_cast(size)); - } - - if (field.getType().isStruct()) - return struct_builder.init(field); - - return struct_builder.get(field); -} - -static std::optional convertToDynamicValue( - const ColumnPtr & column, - const DataTypePtr & data_type, - size_t row_num, - const String & column_name, - capnp::DynamicValue::Builder builder, - FormatSettings::EnumComparingMode enum_comparing_mode, - std::vector> & temporary_text_data_storage) -{ - /// Here we don't do any types validation, because we did it in CapnProtoRowOutputFormat constructor. - - if (data_type->lowCardinality()) - { - const auto * lc_column = assert_cast(column.get()); - const auto & dict_type = assert_cast(data_type.get())->getDictionaryType(); - size_t index = lc_column->getIndexAt(row_num); - return convertToDynamicValue(lc_column->getDictionary().getNestedColumn(), dict_type, index, column_name, builder, enum_comparing_mode, temporary_text_data_storage); - } - - switch (builder.getType()) - { - case capnp::DynamicValue::Type::INT: - return capnp::DynamicValue::Reader(column->getInt(row_num)); - case capnp::DynamicValue::Type::UINT: - { - /// IPv4 column doesn't support getUInt method. - if (isIPv4(data_type)) - return capnp::DynamicValue::Reader(assert_cast(column.get())->getElement(row_num)); - return capnp::DynamicValue::Reader(column->getUInt(row_num)); - } - case capnp::DynamicValue::Type::BOOL: - return capnp::DynamicValue::Reader(column->getBool(row_num)); - case capnp::DynamicValue::Type::FLOAT: - return capnp::DynamicValue::Reader(column->getFloat64(row_num)); - case capnp::DynamicValue::Type::ENUM: - { - auto enum_schema = builder.as().getSchema(); - if (data_type->getTypeId() == TypeIndex::Enum8) - return capnp::DynamicValue::Reader( - getDynamicEnum(column, data_type, row_num, enum_schema, enum_comparing_mode)); - return capnp::DynamicValue::Reader( - getDynamicEnum(column, data_type, row_num, enum_schema, enum_comparing_mode)); - } - case capnp::DynamicValue::Type::DATA: - { - auto data = column->getDataAt(row_num); - return capnp::DynamicValue::Reader(capnp::Data::Reader(reinterpret_cast(data.data), data.size)); - } - case capnp::DynamicValue::Type::TEXT: - { - /// In TEXT type data should be null-terminated, but ClickHouse String data could not be. - /// To make data null-terminated we should copy it to temporary String object, but - /// capnp::Text::Reader works only with pointer to the data and it's size, so we should - /// guarantee that new String object life time is longer than capnp::Text::Reader life time. - /// To do this we store new String object in a temporary storage, passed in this function - /// by reference. We use unique_ptr instead of just String to avoid pointers - /// invalidation on vector reallocation. - temporary_text_data_storage.push_back(std::make_unique(column->getDataAt(row_num))); - auto & data = temporary_text_data_storage.back(); - return capnp::DynamicValue::Reader(capnp::Text::Reader(data->data(), data->size())); - } - case capnp::DynamicValue::Type::STRUCT: - { - auto struct_builder = builder.as(); - auto nested_struct_schema = struct_builder.getSchema(); - /// Struct can represent Tuple, Nullable (named union with two fields) or single column when it contains one nested column. - if (data_type->isNullable()) - { - const auto * nullable_type = assert_cast(data_type.get()); - const auto * nullable_column = assert_cast(column.get()); - auto fields = nested_struct_schema.getUnionFields(); - if (nullable_column->isNullAt(row_num)) - { - auto null_field = fields[0].getType().isVoid() ? fields[0] : fields[1]; - struct_builder.set(null_field, capnp::Void()); - } - else - { - auto value_field = fields[0].getType().isVoid() ? fields[1] : fields[0]; - struct_builder.clear(value_field); - const auto & nested_column = nullable_column->getNestedColumnPtr(); - auto value_builder = initStructFieldBuilder(nested_column, row_num, struct_builder, value_field); - auto value = convertToDynamicValue(nested_column, nullable_type->getNestedType(), row_num, column_name, value_builder, enum_comparing_mode, temporary_text_data_storage); - if (value) - struct_builder.set(value_field, *value); - } - } - else if (isTuple(data_type)) - { - const auto * tuple_data_type = assert_cast(data_type.get()); - const auto & nested_types = tuple_data_type->getElements(); - const auto & nested_names = tuple_data_type->getElementNames(); - const auto & nested_columns = assert_cast(column.get())->getColumns(); - bool have_explicit_names = tuple_data_type->haveExplicitNames(); - for (uint32_t i = 0; i != nested_names.size(); ++i) - { - capnp::StructSchema::Field nested_field = have_explicit_names ? nested_struct_schema.getFieldByName(nested_names[i]) : nested_struct_schema.getFields()[i]; - auto field_builder = initStructFieldBuilder(nested_columns[i], row_num, struct_builder, nested_field); - auto value = convertToDynamicValue(nested_columns[i], nested_types[i], row_num, nested_names[i], field_builder, enum_comparing_mode, temporary_text_data_storage); - if (value) - struct_builder.set(nested_field, *value); - } - } - else if (isMap(data_type)) - { - /// We output Map type as follow CapnProto schema - /// - /// struct Map { - /// struct Entry { - /// key @0: Key; - /// value @1: Value; - /// } - /// entries @0 :List(Entry); - /// } - /// - /// And we don't need to check that struct have this form here because we checked it before. - const auto & map_type = assert_cast(*data_type); - DataTypes key_value_types = {map_type.getKeyType(), map_type.getValueType()}; - Names key_value_names = {"key", "value"}; - auto entries_type = std::make_shared(std::make_shared(key_value_types, key_value_names)); - - /// Nested column in Map is actually Array(Tuple), so we can output it according to "entries" field schema. - const auto & entries_column = assert_cast(column.get())->getNestedColumnPtr(); - - auto entries_field = nested_struct_schema.getFields()[0]; - auto field_builder = initStructFieldBuilder(entries_column, row_num, struct_builder, entries_field); - auto entries_value = convertToDynamicValue(entries_column, entries_type, row_num, column_name, field_builder, enum_comparing_mode, temporary_text_data_storage); - if (entries_value) - struct_builder.set(entries_field, *entries_value); - } - else - { - /// It can be nested column from Nested type. - auto [field_name, nested_name] = splitCapnProtoFieldName(column_name); - auto nested_field = nested_struct_schema.getFieldByName(nested_name); - auto field_builder = initStructFieldBuilder(column, row_num, struct_builder, nested_field); - auto value = convertToDynamicValue(column, data_type, row_num, nested_name, field_builder, enum_comparing_mode, temporary_text_data_storage); - if (value) - struct_builder.set(nested_field, *value); - } - return std::nullopt; - } - case capnp::DynamicValue::Type::LIST: - { - auto list_builder = builder.as(); - const auto * array_column = assert_cast(column.get()); - const auto & nested_column = array_column->getDataPtr(); - const auto & nested_type = assert_cast(data_type.get())->getNestedType(); - const auto & offsets = array_column->getOffsets(); - auto offset = offsets[row_num - 1]; - size_t size = offsets[row_num] - offset; - - const auto * nested_array_column = checkAndGetColumn(*nested_column); - for (unsigned i = 0; i != static_cast(size); ++i) - { - capnp::DynamicValue::Builder value_builder; - /// For nested arrays we need to initialize nested list builder. - if (nested_array_column) - { - const auto & nested_offset = nested_array_column->getOffsets(); - size_t nested_array_size = nested_offset[offset + i] - nested_offset[offset + i - 1]; - value_builder = list_builder.init(i, static_cast(nested_array_size)); - } - else - value_builder = list_builder[i]; - - auto value = convertToDynamicValue(nested_column, nested_type, offset + i, column_name, value_builder, enum_comparing_mode, temporary_text_data_storage); - if (value) - list_builder.set(i, *value); - } - return std::nullopt; - } - default: - throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected CapnProto type."); - } + const auto & header = getPort(PortKind::Main).getHeader(); + serializer = std::make_unique(header.getDataTypes(), header.getNames(), schema, format_settings.capn_proto); + capnp::MallocMessageBuilder message; } void CapnProtoRowOutputFormat::write(const Columns & columns, size_t row_num) { capnp::MallocMessageBuilder message; - /// Temporary storage for data that will be outputted in fields with CapnProto type TEXT. - /// See comment in convertToDynamicValue() for more details. - std::vector> temporary_text_data_storage; capnp::DynamicStruct::Builder root = message.initRoot(schema); - - /// Some columns can share same field builder. For example when we have - /// column with Nested type that was flattened into several columns. - std::unordered_map field_builders; - for (size_t i = 0; i != columns.size(); ++i) - { - auto [struct_builder, field] = getStructBuilderAndFieldByColumnName(root, column_names[i]); - if (!field_builders.contains(field.getIndex())) - { - auto field_builder = initStructFieldBuilder(columns[i], row_num, struct_builder, field); - field_builders[field.getIndex()] = field_builder; - } - auto value = convertToDynamicValue(columns[i], column_types[i], row_num, column_names[i], field_builders[field.getIndex()], format_settings.capn_proto.enum_comparing_mode, temporary_text_data_storage); - if (value) - struct_builder.set(field, *value); - } - + serializer->writeRow(columns, std::move(root), row_num); capnp::writeMessage(*output_stream, message); + } void registerOutputFormatCapnProto(FormatFactory & factory) diff --git a/src/Processors/Formats/Impl/CapnProtoRowOutputFormat.h b/src/Processors/Formats/Impl/CapnProtoRowOutputFormat.h index 5cc7099d4c7..dd9dcc6b340 100644 --- a/src/Processors/Formats/Impl/CapnProtoRowOutputFormat.h +++ b/src/Processors/Formats/Impl/CapnProtoRowOutputFormat.h @@ -3,15 +3,17 @@ #include "config.h" #if USE_CAPNP -#include -#include -#include -#include -#include -#include +# include +# include +# include +# include +# include +# include +# include namespace DB { + class CapnProtoOutputStream : public kj::OutputStream { public: @@ -43,8 +45,9 @@ private: DataTypes column_types; capnp::StructSchema schema; std::unique_ptr output_stream; - const FormatSettings format_settings; CapnProtoSchemaParser schema_parser; + std::unique_ptr serializer; + }; } diff --git a/src/Processors/Formats/Impl/ProtobufListInputFormat.cpp b/src/Processors/Formats/Impl/ProtobufListInputFormat.cpp index 9777f2361a2..6098923a195 100644 --- a/src/Processors/Formats/Impl/ProtobufListInputFormat.cpp +++ b/src/Processors/Formats/Impl/ProtobufListInputFormat.cpp @@ -88,7 +88,14 @@ void registerInputFormatProtobufList(FormatFactory & factory) }); factory.markFormatSupportsSubsetOfColumns("ProtobufList"); factory.registerAdditionalInfoForSchemaCacheGetter( - "ProtobufList", [](const FormatSettings & settings) { return fmt::format("format_schema={}", settings.schema.format_schema); }); + "ProtobufList", + [](const FormatSettings & settings) + { + return fmt::format( + "format_schema={}, skip_fields_with_unsupported_types_in_schema_inference={}", + settings.schema.format_schema, + settings.protobuf.skip_fields_with_unsupported_types_in_schema_inference); + }); } void registerProtobufListSchemaReader(FormatFactory & factory) diff --git a/src/Processors/Formats/Impl/ProtobufRowInputFormat.cpp b/src/Processors/Formats/Impl/ProtobufRowInputFormat.cpp index ee60501dba5..126f3673571 100644 --- a/src/Processors/Formats/Impl/ProtobufRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/ProtobufRowInputFormat.cpp @@ -128,7 +128,14 @@ void registerProtobufSchemaReader(FormatFactory & factory) for (const auto & name : {"Protobuf", "ProtobufSingle"}) factory.registerAdditionalInfoForSchemaCacheGetter( - name, [](const FormatSettings & settings) { return fmt::format("format_schema={}", settings.schema.format_schema); }); + name, + [](const FormatSettings & settings) + { + return fmt::format( + "format_schema={}, skip_fields_with_unsupported_types_in_schema_inference={}", + settings.schema.format_schema, + settings.protobuf.skip_fields_with_unsupported_types_in_schema_inference); + }); } } diff --git a/tests/queries/0_stateless/02030_capnp_format.sh b/tests/queries/0_stateless/02030_capnp_format.sh index c15d6fe442e..625104fb590 100755 --- a/tests/queries/0_stateless/02030_capnp_format.sh +++ b/tests/queries/0_stateless/02030_capnp_format.sh @@ -96,8 +96,8 @@ $CLICKHOUSE_CLIENT --query="SELECT * FROM file('data.capnp', 'CapnProto', 'a_b U $CLICKHOUSE_CLIENT --query="SELECT number AS a_b, number + 1 AS a_c_d, number + 2 AS a_c_e_f FROM numbers(5) FORMAT CapnProto SETTINGS format_schema='$CLIENT_SCHEMADIR/02030_capnp_nested_tuples:Message'" > $CAPN_PROTO_FILE $CLICKHOUSE_CLIENT --query="SELECT * FROM file('data.capnp', 'CapnProto', 'a Tuple(b UInt64, c Tuple(d UInt64, e Tuple(f UInt64)))') SETTINGS format_schema='$SERVER_SCHEMADIR/02030_capnp_nested_tuples:Message'" -$CLICKHOUSE_CLIENT --query="SELECT * FROM file('data.capnp', 'CapnProto', 'a Tuple(bb UInt64, c Tuple(d UInt64, e Tuple(f UInt64)))') SETTINGS format_schema='$SERVER_SCHEMADIR/02030_capnp_nested_tuples:Message'" 2>&1 | grep -F -q "CAPN_PROTO_BAD_CAST" && echo 'OK' || echo 'FAIL'; -$CLICKHOUSE_CLIENT --query="SELECT * FROM file('data.capnp', 'CapnProto', 'a Tuple(b UInt64, c Tuple(d UInt64, e Tuple(ff UInt64)))') SETTINGS format_schema='$SERVER_SCHEMADIR/02030_capnp_nested_tuples:Message'" 2>&1 | grep -F -q "CAPN_PROTO_BAD_CAST" && echo 'OK' || echo 'FAIL'; +$CLICKHOUSE_CLIENT --query="SELECT * FROM file('data.capnp', 'CapnProto', 'a Tuple(bb UInt64, c Tuple(d UInt64, e Tuple(f UInt64)))') SETTINGS format_schema='$SERVER_SCHEMADIR/02030_capnp_nested_tuples:Message'" 2>&1 | grep -F -q "THERE_IS_NO_COLUMN" && echo 'OK' || echo 'FAIL'; +$CLICKHOUSE_CLIENT --query="SELECT * FROM file('data.capnp', 'CapnProto', 'a Tuple(b UInt64, c Tuple(d UInt64, e Tuple(ff UInt64)))') SETTINGS format_schema='$SERVER_SCHEMADIR/02030_capnp_nested_tuples:Message'" 2>&1 | grep -F -q "THERE_IS_NO_COLUMN" && echo 'OK' || echo 'FAIL'; $CLICKHOUSE_CLIENT --query="SELECT * FROM file('data.capnp', 'CapnProto', 'string String') SETTINGS format_schema='$SERVER_SCHEMADIR/02030_capnp_simple_types:Message'" 2>&1 | grep -F -q "INCORRECT_DATA" && echo 'OK' || echo 'FAIL'; diff --git a/tests/queries/0_stateless/02735_capnp_case_insensitive_names_matcing.reference b/tests/queries/0_stateless/02735_capnp_case_insensitive_names_matcing.reference new file mode 100644 index 00000000000..f34c857e2f6 --- /dev/null +++ b/tests/queries/0_stateless/02735_capnp_case_insensitive_names_matcing.reference @@ -0,0 +1 @@ +42 (42,42) diff --git a/tests/queries/0_stateless/02735_capnp_case_insensitive_names_matcing.sh b/tests/queries/0_stateless/02735_capnp_case_insensitive_names_matcing.sh new file mode 100755 index 00000000000..c3835948437 --- /dev/null +++ b/tests/queries/0_stateless/02735_capnp_case_insensitive_names_matcing.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash +# Tags: no-fasttest, no-parallel, no-replicated-database + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +SCHEMADIR=$CURDIR/format_schemas +$CLICKHOUSE_LOCAL -q "select 42 as Field1, (42, 42)::Tuple(Field1 UInt32, Field2 UInt32) as Nested format CapnProto settings format_schema='$SCHEMADIR/02735_case_insensitive_names_matching:Message'" | $CLICKHOUSE_LOCAL --input-format CapnProto --structure "Field1 UInt32, Nested Tuple(Field1 UInt32, Field2 UInt32)" -q "select * from table" --format_schema="$SCHEMADIR/02735_case_insensitive_names_matching:Message" + diff --git a/tests/queries/0_stateless/02736_reading_and_writing_structure_fields.reference b/tests/queries/0_stateless/02736_reading_and_writing_structure_fields.reference new file mode 100644 index 00000000000..b6e6d485929 --- /dev/null +++ b/tests/queries/0_stateless/02736_reading_and_writing_structure_fields.reference @@ -0,0 +1,3 @@ +(42,(42,42),[(42,42),(24,24)]) [(42,(42,42),[(42,42),(24,24)]),(24,(24,24),[(24,24),(42,42)])] +42 42 42 +[42,24] [42,24] [42,24] [[42,24],[24,42]] [[42,24],[24,42]] diff --git a/tests/queries/0_stateless/02736_reading_and_writing_structure_fields.sh b/tests/queries/0_stateless/02736_reading_and_writing_structure_fields.sh new file mode 100755 index 00000000000..c669be2ed33 --- /dev/null +++ b/tests/queries/0_stateless/02736_reading_and_writing_structure_fields.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash +# Tags: no-fasttest, no-parallel, no-replicated-database + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +SCHEMADIR=$CURDIR/format_schemas +DATA_FILE=02736_$CLICKHOUSE_TEST_UNIQUE_NAME.bin + +$CLICKHOUSE_LOCAL -q "select tuple(42, tuple(42, 42), [tuple(42, 42), tuple(24, 24)]) as nested, [tuple(42, tuple(42, 42), [tuple(42, 42), tuple(24, 24)]), tuple(24, tuple(24, 24), [tuple(24, 24), tuple(42, 42)])] as nestedList format CapnProto settings format_schema='$SCHEMADIR/02736_nested_structures:Message'" > $DATA_FILE + +$CLICKHOUSE_LOCAL -q "select * from file($DATA_FILE, CapnProto) settings format_schema='$SCHEMADIR/02736_nested_structures:Message'" + +$CLICKHOUSE_LOCAL -q "select 42 as nested_field1, 42 as nested_nested_field1, 42 as nested_nested_field2 format CapnProto settings format_schema='$SCHEMADIR/02736_nested_structures:Message'" > $DATA_FILE + +$CLICKHOUSE_LOCAL -q "select * from file($DATA_FILE, CapnProto, 'nested_field1 UInt32, nested_nested_field1 UInt32, nested_nested_field2 UInt32') settings format_schema='$SCHEMADIR/02736_nested_structures:Message'" + +$CLICKHOUSE_LOCAL -q "select [42, 24] as nestedList_field1, [42, 24] as nestedList_nested_field1, [42, 24] as nestedList_nested_field2, [[42, 24], [24, 42]] as nestedList_nestedList_field1, [[42, 24], [24, 42]] as nestedList_nestedList_field2 format CapnProto settings format_schema='$SCHEMADIR/02736_nested_structures:Message'" > $DATA_FILE + +$CLICKHOUSE_LOCAL -q "select * from file($DATA_FILE, CapnProto, 'nestedList_field1 Array(UInt32), nestedList_nested_field1 Array(UInt32), nestedList_nested_field2 Array(UInt32), nestedList_nestedList_field1 Array(Array(UInt32)), nestedList_nestedList_field2 Array(Array(UInt32))') settings format_schema='$SCHEMADIR/02736_nested_structures:Message'" + +rm $DATA_FILE + diff --git a/tests/queries/0_stateless/format_schemas/02735_case_insensitive_names_matching.capnp b/tests/queries/0_stateless/format_schemas/02735_case_insensitive_names_matching.capnp new file mode 100644 index 00000000000..6b12aab081a --- /dev/null +++ b/tests/queries/0_stateless/format_schemas/02735_case_insensitive_names_matching.capnp @@ -0,0 +1,13 @@ +@0x9ef128e10a8010b8; + +struct Nested +{ + field1 @0 : UInt32; + field2 @1 : UInt32; +} + +struct Message +{ + field1 @0 : UInt32; + nested @1 : Nested; +} diff --git a/tests/queries/0_stateless/format_schemas/02736_nested_structures.capnp b/tests/queries/0_stateless/format_schemas/02736_nested_structures.capnp new file mode 100644 index 00000000000..a03eb27f383 --- /dev/null +++ b/tests/queries/0_stateless/format_schemas/02736_nested_structures.capnp @@ -0,0 +1,21 @@ +@0x9ef128e10a8010b8; + +struct Nested2 +{ + field1 @0 : UInt32; + field2 @1 : UInt32; +} + +struct Nested +{ + field1 @0 : UInt32; + nested @1 : Nested2; + nestedList @2 : List(Nested2); +} + +struct Message +{ + nested @0 : Nested; + nestedList @1 : List(Nested); +} + From 167516b6b088dea2e6b44a6e81caf2c088f2481e Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Wed, 10 May 2023 21:07:56 +0200 Subject: [PATCH 0065/1072] Fix style --- src/Formats/CapnProtoSerializer.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Formats/CapnProtoSerializer.cpp b/src/Formats/CapnProtoSerializer.cpp index e0c8ae2a79a..c31623286d0 100644 --- a/src/Formats/CapnProtoSerializer.cpp +++ b/src/Formats/CapnProtoSerializer.cpp @@ -26,7 +26,7 @@ namespace DB namespace ErrorCodes { extern const int THERE_IS_NO_COLUMN; - extern const int BAD_TYPE_OF_FIELD; + extern const int LOGICAL_ERROR; extern const int CAPN_PROTO_BAD_CAST; extern const int INCORRECT_DATA; extern const int ILLEGAL_COLUMN; @@ -293,7 +293,7 @@ namespace return capnp::DynamicValue::Reader(capnp::DynamicEnum(enumerant)); } - throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot convert CLickHouse Enum value to CapnProto Enum"); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot convert ClickHouse Enum value to CapnProto Enum"); } void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override From 344885dd27fbc652a5e93040ead764cf88232bbf Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Wed, 10 May 2023 21:08:12 +0200 Subject: [PATCH 0066/1072] Fix style --- src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp b/src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp index e686ae86997..c056ee2b4a4 100644 --- a/src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp @@ -14,7 +14,6 @@ namespace DB namespace ErrorCodes { - extern const int LOGICAL_ERROR; extern const int INCORRECT_DATA; } From e3cb1e40e4830d9b5499c2c410e1d12add4cc90f Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Wed, 10 May 2023 21:08:31 +0200 Subject: [PATCH 0067/1072] Fix style --- src/Processors/Formats/Impl/CapnProtoRowOutputFormat.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/Processors/Formats/Impl/CapnProtoRowOutputFormat.cpp b/src/Processors/Formats/Impl/CapnProtoRowOutputFormat.cpp index 7dd18be27f4..66a7160dd89 100644 --- a/src/Processors/Formats/Impl/CapnProtoRowOutputFormat.cpp +++ b/src/Processors/Formats/Impl/CapnProtoRowOutputFormat.cpp @@ -11,12 +11,6 @@ namespace DB { -namespace ErrorCodes -{ - extern const int LOGICAL_ERROR; -} - - CapnProtoOutputStream::CapnProtoOutputStream(WriteBuffer & out_) : out(out_) { } From 604bd24995b411cbf405e79a48b555beda604b6e Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 11 May 2023 11:58:08 +0000 Subject: [PATCH 0068/1072] Refactor, remove no more needed arguments --- .../functions/other-functions.md | 40 +- src/Functions/generateRandomStructure.cpp | 480 +++++++----------- .../02586_generate_random_structure.reference | 11 +- .../02586_generate_random_structure.sql | 26 +- 4 files changed, 189 insertions(+), 368 deletions(-) diff --git a/docs/en/sql-reference/functions/other-functions.md b/docs/en/sql-reference/functions/other-functions.md index 43330b75b8f..e235a3db393 100644 --- a/docs/en/sql-reference/functions/other-functions.md +++ b/docs/en/sql-reference/functions/other-functions.md @@ -2617,18 +2617,13 @@ Generates random table structure in a format `column1_name column1_type, column2 **Syntax** ``` sql -generateRandomStructure([number_of_columns, seed, allow_big_numbers, allow_enums, allow_decimals, allow_ip, allow_only_string_map_keys]) +generateRandomStructure([number_of_columns, seed]) ``` **Arguments** - `number_of_columns` — The desired number of columns in the result table structure. If set to 0 or `Null`, the number of columns will be random from 1 to 128. Default value: `Null`. - `seed` - Random seed to produce stable results. If seed is not specified or set to `Null`, it is randomly generated. -- `allow_big_numbers` - Indicates if big number types (`Int128/UInt128/Int256/UInt256/Decimal128/Decinal256`) can be generated. Default value: true. -- `allow_enums` - Indicates if enum types (`Enum8/Enum16`) can be generated. Default - true. -- `allow_decimals` - Indicates if decimal types (`Decimal(P, S)`) can be generated. Default - true. -- `allow_ip` - Indicates if ip types (`IPv4/IPv6`) can be generated. Default - true. -- `allow_only_string_map_keys` - Indicates if Map key type can be only `String/FixedString`. Default - false. All arguments must be constant. @@ -2671,41 +2666,16 @@ Result: Query: ``` sql -SELECT generateRandomStructure(Null, 11) +SELECT generateRandomStructure(NULL, 33) ``` Result: ``` text -┌─generateRandomStructure(0, 11)──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ -│ c1 Date32, c2 String, c3 IPv6, c4 DateTime, c5 UInt16, c6 Tuple(e1 UInt32, e2 Date, e3 Date, e4 IPv6, e5 Nested(e1 DateTime, e2 FixedString(110), e3 Int256, e4 Array(Decimal64(4)), e5 Decimal128(18), e6 Enum16('v0' = 0, 'v1' = 1, 'v2' = 2, 'v3' = 3, 'v4' = 4)), e6 DateTime64(4)), c7 DateTime, c8 DateTime64(6), c9 Bool │ -└─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ +┌─generateRandomStructure(NULL, 33)─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ +│ c1 DateTime, c2 Enum8('c2V0' = 0, 'c2V1' = 1, 'c2V2' = 2, 'c2V3' = 3), c3 LowCardinality(Nullable(FixedString(30))), c4 Int16, c5 Enum8('c5V0' = 0, 'c5V1' = 1, 'c5V2' = 2, 'c5V3' = 3), c6 Nullable(UInt8), c7 String, c8 Nested(e1 IPv4, e2 UInt8, e3 UInt16, e4 UInt16, e5 Int32, e6 Map(Date, Decimal256(70))) │ +└────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ ``` -``` sql -SELECT generateRandomStructure(6, Null, false, false) -``` - -Result: - -``` text -┌─generateRandomStructure(6, NULL, false, false)───────────────────────────────────────────────────────┐ -│ c1 Float32, c2 Tuple(DateTime), c3 UInt8, c4 UInt16, c5 Int64, c6 Array(Map(FixedString(108), Date)) │ -└──────────────────────────────────────────────────────────────────────────────────────────────────────┘ -``` - -``` sql -SELECT generateRandomStructure(6, Null, false, false, false, false, true) -``` - -Result: - -``` text -┌─generateRandomStructure(6, NULL, false, false, false, false, true)─────────────────────────────────────────────────┐ -│ c1 String, c2 UInt32, c3 Int32, c4 Int32, c5 Tuple(LowCardinality(Nullable(FixedString(101))), UInt8), c6 DateTime │ -└────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ -``` - - This function can be used together with [generateRandom](../../sql-reference/table-functions/generate.md) to generate completely random tables. diff --git a/src/Functions/generateRandomStructure.cpp b/src/Functions/generateRandomStructure.cpp index e6766e731b2..9fe321365e5 100644 --- a/src/Functions/generateRandomStructure.cpp +++ b/src/Functions/generateRandomStructure.cpp @@ -1,5 +1,3 @@ -#include "config.h" - #include #include #include @@ -24,130 +22,69 @@ namespace ErrorCodes class FunctionGenerateRandomStructure : public IFunction { private: - enum class Type + static constexpr std::array simple_types { - Int8, - UInt8, - Bool, - Int16, - UInt16, - Int32, - UInt32, - Int64, - UInt64, - Float32, - Float64, - DateTime64, - Decimal32, - Decimal64, - Date, - Date32, - DateTime, - String, - FixedString, - IPv4, - IPv6, - Int128, - UInt128, - Int256, - UInt256, - Decimal128, - Decimal256, - Enum8, - Enum16, - Nullable, - LowCardinality, - Array, - Tuple, - Map, - Nested, + TypeIndex::Int8, + TypeIndex::UInt8, + TypeIndex::Int16, + TypeIndex::UInt16, + TypeIndex::Int32, + TypeIndex::UInt32, + TypeIndex::Int64, + TypeIndex::UInt64, + TypeIndex::Int128, + TypeIndex::UInt128, + TypeIndex::Int256, + TypeIndex::UInt256, + TypeIndex::Float32, + TypeIndex::Float64, + TypeIndex::Decimal32, + TypeIndex::Decimal64, + TypeIndex::Decimal128, + TypeIndex::Decimal256, + TypeIndex::Date, + TypeIndex::Date32, + TypeIndex::DateTime, + TypeIndex::DateTime64, + TypeIndex::String, + TypeIndex::FixedString, + TypeIndex::Enum8, + TypeIndex::Enum16, + TypeIndex::IPv4, + TypeIndex::IPv6, + TypeIndex::UUID, }; - static constexpr std::array simple_types + static constexpr std::array complex_types { - Type::Int8, - Type::UInt8, - Type::Bool, - Type::Int16, - Type::UInt16, - Type::Int32, - Type::UInt32, - Type::Int64, - Type::UInt64, - Type::Float32, - Type::Float64, - Type::Date, - Type::Date32, - Type::DateTime, - Type::String, - Type::FixedString, + TypeIndex::Nullable, + TypeIndex::LowCardinality, + TypeIndex::Array, + TypeIndex::Tuple, + TypeIndex::Map, }; - static constexpr std::array big_integer_types + static constexpr std::array map_key_types { - Type::Int128, - Type::UInt128, - Type::Int256, - Type::UInt256, - }; - - static constexpr std::array decimal_types - { - Type::DateTime64, - Type::Decimal32, - Type::Decimal64, - }; - - static constexpr std::array big_decimal_types - { - Type::Decimal128, - Type::Decimal256, - }; - - static constexpr std::array enum_types - { - Type::Enum8, - Type::Enum16, - }; - - static constexpr std::array ip_types - { - Type::IPv4, - Type::IPv6, - }; - - static constexpr std::array complex_types - { - Type::Nullable, - Type::LowCardinality, - Type::Array, - Type::Tuple, - Type::Map, - Type::Nested, - }; - - static constexpr std::array map_key_types - { - Type::Int8, - Type::UInt8, - Type::Bool, - Type::Int16, - Type::UInt16, - Type::Int32, - Type::UInt32, - Type::Int64, - Type::UInt64, - Type::Date, - Type::Date32, - Type::DateTime, - Type::String, - Type::FixedString, - }; - - static constexpr std::array map_key_string_types - { - Type::String, - Type::FixedString + TypeIndex::Int8, + TypeIndex::UInt8, + TypeIndex::Int16, + TypeIndex::UInt16, + TypeIndex::Int32, + TypeIndex::UInt32, + TypeIndex::Int64, + TypeIndex::UInt64, + TypeIndex::Int128, + TypeIndex::UInt128, + TypeIndex::Int256, + TypeIndex::UInt256, + TypeIndex::Date, + TypeIndex::Date32, + TypeIndex::DateTime, + TypeIndex::String, + TypeIndex::FixedString, + TypeIndex::IPv4, + TypeIndex::UUID, }; static constexpr size_t MAX_NUMBER_OF_COLUMNS = 128; @@ -175,18 +112,19 @@ public: bool isDeterministic() const override { return false; } bool isDeterministicInScopeOfQuery() const override { return false; } bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; } - ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {0, 1, 2, 3, 4, 5, 6}; } + ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {0, 1}; } bool useDefaultImplementationForConstants() const override { return false; } bool useDefaultImplementationForNulls() const override { return false; } DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override { - if (arguments.size() > 7) + if (arguments.size() > 2) throw Exception( ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, - "Number of arguments for function {} doesn't match: passed {}, expected from 0 to 7", + "Number of arguments for function {} doesn't match: passed {}, expected from 0 to 2", getName(), arguments.size()); + for (size_t i = 0; i != 2; ++i) { if (arguments.size() == i) @@ -203,22 +141,6 @@ public: } } - for (size_t i = 2; i != 7; ++i) - { - if (arguments.size() <= i) - break; - - if (!isUInt8(arguments[i])) - { - throw Exception( - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Illegal type {} of the {} argument of function {}, expected UInt8", - i + 1, - arguments[i]->getName(), - getName()); - } - } - return std::make_shared(); } @@ -241,40 +163,27 @@ public: if (arguments.size() > 1 && !arguments[1].column->onlyNull()) seed = arguments[1].column->getUInt(0); - bool allow_big_numbers = true; - if (arguments.size() > 2) - allow_big_numbers = arguments[2].column->getBool(0); - - bool allow_enums = true; - if (arguments.size() > 3) - allow_enums = arguments[3].column->getBool(0); - - bool allow_decimals = true; - if (arguments.size() > 4) - allow_decimals = arguments[4].column->getBool(0); - - bool allow_ip = true; - if (arguments.size() > 5) - allow_ip = arguments[5].column->getBool(0); - - bool only_string_map_key = false; - if (arguments.size() > 6) - only_string_map_key = arguments[6].column->getBool(0); - pcg64 rng(seed); if (number_of_columns == 0) number_of_columns = generateNumberOfColumns(rng); auto col_res = ColumnString::create(); - String generated_structure; + auto & string_column = assert_cast(*col_res); + auto & chars = string_column.getChars(); + WriteBufferFromVector buf(chars); for (size_t i = 0; i != number_of_columns; ++i) { if (i != 0) - generated_structure += ", "; - auto type = generateRandomType(rng, allow_big_numbers, allow_enums, allow_decimals, allow_ip, only_string_map_key); - generated_structure += "c" + std::to_string(i + 1) + " " + type; + writeCString(", ", buf); + String column_name = "c" + std::to_string(i + 1); + writeString(column_name, buf); + writeChar(' ', buf); + writeRandomType(column_name, rng, buf); } - col_res->insert(generated_structure); + + buf.finalize(); + chars.push_back(0); + string_column.getOffsets().push_back(chars.size()); return ColumnConst::create(std::move(col_res), input_rows_count); } @@ -285,181 +194,159 @@ private: return rng() % MAX_NUMBER_OF_COLUMNS + 1; } - /// Helper struct to call generateRandomTypeImpl with lots of bool template arguments without writing big if/else over all bool variables. - template - struct Dispatcher + template + void writeRandomType(const String & column_name, pcg64 & rng, WriteBuffer & buf, size_t depth = 0) const { - static auto call(const FunctionGenerateRandomStructure * f, pcg64 & rng) - { - return f->generateRandomTypeImpl(rng); - } - - template - static auto call(const FunctionGenerateRandomStructure * f, pcg64 & rng, bool b, Args1... ar1) - { - if (b) - return Dispatcher::call(f, rng, ar1...); - else - return Dispatcher::call(f, rng, ar1...); - } - - friend FunctionGenerateRandomStructure; - }; - - String generateRandomType(pcg64 & rng, bool allow_big_numbers, bool allow_enums, bool allow_decimals, bool allow_ip, bool allow_only_string_map_keys) const - { - return Dispatcher<>::call(this, rng, allow_big_numbers, allow_enums, allow_decimals, allow_ip, allow_only_string_map_keys, true); - } - - template - String generateRandomTypeImpl(pcg64 & rng, size_t depth = 0) const - { - constexpr auto all_types = getAllTypes(); + constexpr auto all_types = getAllTypes(); auto type = all_types[rng() % all_types.size()]; switch (type) { - case Type::FixedString: - return "FixedString(" + std::to_string(rng() % MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS + 1) + ")"; - case Type::DateTime64: - return "DateTime64(" + std::to_string(rng() % MAX_DATETIME64_PRECISION) + ")"; - case Type::Decimal32: - return "Decimal32(" + std::to_string(rng() % MAX_DECIMAL32_PRECISION) + ")"; - case Type::Decimal64: - return "Decimal64(" + std::to_string(rng() % MAX_DECIMAL64_PRECISION) + ")"; - case Type::Decimal128: - return "Decimal128(" + std::to_string(rng() % MAX_DECIMAL128_PRECISION) + ")"; - case Type::Decimal256: - return "Decimal256(" + std::to_string(rng() % MAX_DECIMAL256_PRECISION) + ")"; - case Type::Enum8: - return "Enum8(" + generateEnumValues(rng) + ")"; - case Type::Enum16: - return "Enum16(" + generateEnumValues(rng) + ")"; - case Type::LowCardinality: - return "LowCardinality(" + generateLowCardinalityNestedType(rng) + ")"; - case Type::Nullable: + case TypeIndex::UInt8: + if (rng() % 2) + writeCString("UInt8", buf); + else + writeCString("Bool", buf); + return; + case TypeIndex::FixedString: + writeString("FixedString(" + std::to_string(rng() % MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS + 1) + ")", buf); + return; + case TypeIndex::DateTime64: + writeString("DateTime64(" + std::to_string(rng() % MAX_DATETIME64_PRECISION) + ")", buf); + return; + case TypeIndex::Decimal32: + writeString("Decimal32(" + std::to_string(rng() % MAX_DECIMAL32_PRECISION) + ")", buf); + return; + case TypeIndex::Decimal64: + writeString("Decimal64(" + std::to_string(rng() % MAX_DECIMAL64_PRECISION) + ")", buf); + return; + case TypeIndex::Decimal128: + writeString("Decimal128(" + std::to_string(rng() % MAX_DECIMAL128_PRECISION) + ")", buf); + return; + case TypeIndex::Decimal256: + writeString("Decimal256(" + std::to_string(rng() % MAX_DECIMAL256_PRECISION) + ")", buf); + return; + case TypeIndex::Enum8: + writeCString("Enum8(", buf); + writeEnumValues(column_name, rng, buf); + writeChar(')', buf); + return; + case TypeIndex::Enum16: + writeCString("Enum16(", buf); + writeEnumValues(column_name, rng, buf); + writeChar(')', buf); + return; + case TypeIndex::LowCardinality: + writeCString("LowCardinality(", buf); + writeLowCardinalityNestedType(rng, buf); + writeChar(')', buf); + return; + case TypeIndex::Nullable: { - auto nested_type = generateRandomTypeImpl(rng, depth + 1); - return "Nullable(" + nested_type + ")"; + writeCString("Nullable(", buf); + writeRandomType(column_name, rng, buf, depth + 1); + writeChar(')', buf); + return; } - case Type::Array: + case TypeIndex::Array: { - auto nested_type = generateRandomTypeImpl(rng, depth + 1); - return "Array(" + nested_type + ")"; + writeCString("Array(", buf); + writeRandomType(column_name, rng, buf, depth + 1); + writeChar(')', buf); + return; } - case Type::Map: + case TypeIndex::Map: { - auto key_type = generateMapKeyType(rng); - auto value_type = generateRandomTypeImpl(rng, depth + 1); - return "Map(" + key_type + ", " + value_type + ")"; + writeCString("Map(", buf); + writeMapKeyType(rng, buf); + writeCString(", ", buf); + writeRandomType(column_name, rng, buf, depth + 1); + writeChar(')', buf); + return; } - case Type::Tuple: + case TypeIndex::Tuple: { size_t elements = rng() % MAX_TUPLE_ELEMENTS + 1; - bool named_tuple = rng() % 2; - String tuple_type = "Tuple("; + bool generate_nested = rng() % 2; + bool generate_named_tuple = rng() % 2; + if (generate_nested) + writeCString("Nested(", buf); + else + writeCString("Tuple(", buf); + for (size_t i = 0; i != elements; ++i) { if (i != 0) - tuple_type += ", "; - if (named_tuple) - tuple_type += "e" + std::to_string(i + 1) + " "; - tuple_type += generateRandomTypeImpl(rng, depth + 1); + writeCString(", ", buf); + + String element_name = "e" + std::to_string(i + 1); + if (generate_named_tuple || generate_nested) + { + writeString(element_name, buf); + writeChar(' ', buf); + } + writeRandomType(element_name, rng, buf, depth + 1); } - return tuple_type + ")"; - } - case Type::Nested: - { - size_t elements = rng() % MAX_TUPLE_ELEMENTS + 1; - String nested_type = "Nested("; - for (size_t i = 0; i != elements; ++i) - { - if (i != 0) - nested_type += ", "; - auto element_type = generateRandomTypeImpl(rng, depth + 1); - nested_type += "e" + std::to_string(i + 1) + " " + element_type; - } - return nested_type + ")"; + writeChar(')', buf); + return; } default: - return String(magic_enum::enum_name(type)); + writeString(magic_enum::enum_name(type), buf); + return; } } - template - String generateMapKeyType(pcg64 & rng) const + void writeMapKeyType(pcg64 & rng, WriteBuffer & buf) const { - Type type; - if constexpr (allow_only_string_map_keys) - type = map_key_string_types[rng() % map_key_string_types.size()]; + TypeIndex type = map_key_types[rng() % map_key_types.size()]; + if (type == TypeIndex::FixedString) + writeString("FixedString(" + std::to_string(rng() % MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS + 1) + ")", buf); else - type = map_key_types[rng() % map_key_types.size()]; - - if (type == Type::FixedString) - return "FixedString(" + std::to_string(rng() % MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS + 1) + ")"; - return String(magic_enum::enum_name(type)); + writeString(magic_enum::enum_name(type), buf); } - String generateLowCardinalityNestedType(pcg64 & rng) const + void writeLowCardinalityNestedType(pcg64 & rng, WriteBuffer & buf) const { - /// Support only String and FixedString. + /// Support only String and FixedString (maybe Nullable). String nested_type; + bool make_nullable = rng() % 2; + if (make_nullable) + writeCString("Nullable(", buf); + if (rng() % 2) - nested_type = "String"; + writeCString("String", buf); else - nested_type = "FixedString(" + std::to_string(rng() % MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS + 1) + ")"; - return rng() % 2 ? nested_type : "Nullable(" + nested_type + ")"; + writeString("FixedString(" + std::to_string(rng() % MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS + 1) + ")", buf); + + if (make_nullable) + writeChar(')', buf); } - String generateEnumValues(pcg64 & rng) const + void writeEnumValues(const String & column_name, pcg64 & rng, WriteBuffer & buf) const { - /// Don't generate big enums, because it will lead to really big strings + /// Don't generate big enums, because it will lead to really big result /// and slowness of this function, and it can lead to `Max query size exceeded` /// while using this function with generateRandom. ssize_t num_values = rng() % 16 + 1; - String result; for (ssize_t i = 0; i != num_values; ++i) { if (i != 0) - result += ", "; - result += "'v" + std::to_string(i) + "' = " + std::to_string(i); + writeCString(", ", buf); + writeString("'" + column_name + "V" + std::to_string(i) + "' = " + std::to_string(i), buf); } - return result; } - template + template static constexpr auto getAllTypes() { - constexpr size_t big_integer_types_size = big_integer_types.size() * allow_big_numbers; - constexpr size_t enum_types_size = enum_types.size() * allow_enums; - constexpr size_t decimal_types_size = decimal_types.size() * allow_decimals; - constexpr size_t big_decimal_types_size = big_decimal_types.size() * allow_big_numbers * allow_decimals; - constexpr size_t ip_types_size = ip_types.size() * allow_ip; constexpr size_t complex_types_size = complex_types.size() * allow_complex_types; - - constexpr size_t result_size = simple_types.size() + big_integer_types_size + enum_types_size + decimal_types_size - + big_decimal_types_size + ip_types_size + complex_types_size; - std::array result; + constexpr size_t result_size = simple_types.size() + complex_types_size; + std::array result; size_t index = 0; for (size_t i = 0; i != simple_types.size(); ++i, ++index) result[index] = simple_types[i]; - for (size_t i = 0; i != big_integer_types_size; ++i, ++index) - result[index] = big_integer_types[i]; - - for (size_t i = 0; i != enum_types_size; ++i, ++index) - result[index] = enum_types[i]; - - for (size_t i = 0; i != decimal_types_size; ++i, ++index) - result[index] = decimal_types[i]; - - for (size_t i = 0; i != big_decimal_types_size; ++i, ++index) - result[index] = big_decimal_types[i]; - - for (size_t i = 0; i != ip_types_size; ++i, ++index) - result[index] = ip_types[i]; - for (size_t i = 0; i != complex_types_size; ++i, ++index) result[index] = complex_types[i]; @@ -474,26 +361,15 @@ REGISTER_FUNCTION(GenerateRandomStructure) { R"( Generates a random table structure. -This function takes 4 optional constant arguments: -1) the number of column in the result structure (random by default) -2) random seed (random by default) -3) flag that indicates if big number types can be used (true by default) -4) flag that indicates if enum types can be used (true by default) -5) flag that indicates if decimal types can be used (true by default) -6) flag that indicates if ip types (IPv4, IPv6) can be used (true by default) -7) flag that indicates if map keys should be only String or FixedString (false by default) +This function takes 2 optional constant arguments: +the number of columns in the result structure (random by default) and random seed (random by default) The maximum number of columns is 128. The function returns a value of type String. )", Documentation::Examples{ {"random", "SELECT generateRandomStructure()"}, - {"with specified number of arguments", "SELECT generateRandomStructure(10)"}, + {"with specified number of columns", "SELECT generateRandomStructure(10)"}, {"with specified seed", "SELECT generateRandomStructure(10, 42)"}, - {"without big number types", "SELECT generateRandomStructure(10, NULL, false)"}, - {"without enum types", "SELECT generateRandomStructure(10, NULL, true, false)"}, - {"without decimal types", "SELECT generateRandomStructure(10, NULL, true, true, false)"}, - {"without ip types", "SELECT generateRandomStructure(10, NULL, true, true, true, false)"}, - {"with only string mak key types", "SELECT generateRandomStructure(10, NULL, true, true, true, true, true)"}, }, Documentation::Categories{"Random"} }, diff --git a/tests/queries/0_stateless/02586_generate_random_structure.reference b/tests/queries/0_stateless/02586_generate_random_structure.reference index 76d89828071..65bdc530f10 100644 --- a/tests/queries/0_stateless/02586_generate_random_structure.reference +++ b/tests/queries/0_stateless/02586_generate_random_structure.reference @@ -1,11 +1,4 @@ -c1 Date, c2 Bool, c3 Int16, c4 Map(Int64, Array(Bool)), c5 Decimal256(30) -c1 String, c2 Float64, c3 Enum8(\'v0\' = 0, \'v1\' = 1, \'v2\' = 2, \'v3\' = 3, \'v4\' = 4), c4 UInt64, c5 Date -c1 Nested(e1 Int64, e2 Int16, e3 Map(Int16, LowCardinality(Nullable(String))), e4 UInt8, e5 Nested(e1 Array(Nullable(DateTime)), e2 Nullable(Bool), e3 UInt8, e4 UInt64, e5 Decimal64(6), e6 DateTime), e6 LowCardinality(Nullable(String))), c2 Date, c3 Int32, c4 IPv4, c5 Decimal32(8) -c1 Date, c2 UInt16, c3 UInt256, c4 Nullable(IPv4), c5 Nullable(Decimal64(17)) -c1 Array(Int64), c2 Map(String, LowCardinality(String)), c3 Date, c4 Map(Int64, UInt128), c5 UInt8 -c1 Date, c2 UInt16, c3 UInt256, c4 Nullable(Decimal128(37)), c5 DateTime64(8) -c1 Date, c2 Bool, c3 Int16, c4 Map(FixedString(120), Bool), c5 Decimal256(30) -c1 String, c2 Float64, c3 Enum8(\'v0\' = 0, \'v1\' = 1, \'v2\' = 2, \'v3\' = 3, \'v4\' = 4), c4 UInt64, c5 Date +c1 String, c2 UInt256, c3 String, c4 Decimal128(7), c5 UInt128 String Const(String) -1977-07-28 true 5389 +` 90465455320735604871982424534384518837533904778028808627865442405232847164685 5& -3034771008825448884614719061068.2821046 75820566154622566322847299106656624693 diff --git a/tests/queries/0_stateless/02586_generate_random_structure.sql b/tests/queries/0_stateless/02586_generate_random_structure.sql index 061fbc24219..a28c159cff5 100644 --- a/tests/queries/0_stateless/02586_generate_random_structure.sql +++ b/tests/queries/0_stateless/02586_generate_random_structure.sql @@ -1,29 +1,11 @@ select generateRandomStructure(5, 42); -select generateRandomStructure(5, 42, false); -select generateRandomStructure(5, 42, false, false); -select generateRandomStructure(5, 42, true, false); -select generateRandomStructure(5, 42, true, true, false); -select generateRandomStructure(5, 42, true, true, true, false); -select generateRandomStructure(5, 42, true, true, true, true, true); -select generateRandomStructure(5, 42, false, true, true); select toTypeName(generateRandomStructure(5, 42)); select toColumnTypeName(generateRandomStructure(5, 42)); -SELECT * FROM generateRandom(generateRandomStructure(3, 42), 42) LIMIT 1; +SELECT * FROM generateRandom(generateRandomStructure(5, 42), 42) LIMIT 1; -select generateRandomStructure(5, 42, false, false, false, false, true, 42); -- {serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH} +select generateRandomStructure(5, 42, 42); -- {serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH} select generateRandomStructure('5'); -- {serverError ILLEGAL_TYPE_OF_ARGUMENT} select generateRandomStructure(5, '42'); -- {serverError ILLEGAL_TYPE_OF_ARGUMENT} -select generateRandomStructure(5, 42, 'false'); -- {serverError ILLEGAL_TYPE_OF_ARGUMENT} -select generateRandomStructure(5, 42, false, 'false'); -- {serverError ILLEGAL_TYPE_OF_ARGUMENT} -select generateRandomStructure(5, 42, false, false, 'false'); -- {serverError ILLEGAL_TYPE_OF_ARGUMENT} -select generateRandomStructure(5, 42, false, false, false, 'false'); -- {serverError ILLEGAL_TYPE_OF_ARGUMENT} -select generateRandomStructure(5, 42, false, false, false, false, 'false'); -- {serverError ILLEGAL_TYPE_OF_ARGUMENT} -select generateRandomStructure(materialize(5), 42, false, false); -- {serverError ILLEGAL_COLUMN} -select generateRandomStructure(5, materialize(42), false, false); -- {serverError ILLEGAL_COLUMN} -select generateRandomStructure(5, 42, materialize(false), false); -- {serverError ILLEGAL_COLUMN} -select generateRandomStructure(5, 42, false, materialize(false)); -- {serverError ILLEGAL_COLUMN} -select generateRandomStructure(5, 42, false, false, materialize(false)); -- {serverError ILLEGAL_COLUMN} -select generateRandomStructure(5, 42, false, false, false, materialize(false)); -- {serverError ILLEGAL_COLUMN} -select generateRandomStructure(5, 42, false, false, false, materialize(false)); -- {serverError ILLEGAL_COLUMN} -select generateRandomStructure(5, 42, false, false, false, false, materialize(false)); -- {serverError ILLEGAL_COLUMN} +select generateRandomStructure(materialize(5), 42); -- {serverError ILLEGAL_COLUMN} +select generateRandomStructure(5, materialize(42)); -- {serverError ILLEGAL_COLUMN} From 9d1ee044b080237df41ff40f3376d433e6b274a4 Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 11 May 2023 12:02:00 +0000 Subject: [PATCH 0069/1072] Update example --- docs/en/sql-reference/table-functions/generate.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/en/sql-reference/table-functions/generate.md b/docs/en/sql-reference/table-functions/generate.md index 7a17c61eeca..b2777418e4f 100644 --- a/docs/en/sql-reference/table-functions/generate.md +++ b/docs/en/sql-reference/table-functions/generate.md @@ -56,15 +56,15 @@ SELECT * FROM random; In combination with [generateRandomStructure](../../sql-reference/functions/other-functions.md#generateRandomStructure): ```sql -SELECT * FROM generateRandom(generateRandomStructure(3, 24), 24) LIMIT 3; +SELECT * FROM generateRandom(generateRandomStructure(4, 101), 101) LIMIT 3; ``` ```text -┌─────────────────────────c1─┬─────c2─┬───────────────────c3─┬───────────────────────────────────────c4─┐ -│ 2085-07-05 23:48:43.345759 │ -20656 │ 1632406185424686785 │ -210464718903845545171230673454802.15238 │ -│ 1971-07-17 16:32:36.390777 │ -27071 │ -1553021742787219162 │ 1095158319964381336405161704296125.08074 │ -│ 2024-02-19 13:14:32.902513 │ 24913 │ 7727442383333447640 │ 1090748832613398997057187200834127.07109 │ -└────────────────────────────┴────────┴──────────────────────┴──────────────────────────────────────────┘ +┌──────────────────c1─┬──────────────────c2─┬─c3─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬─c4──────────────────────────────────────┐ +│ 1996-04-15 06:40:05 │ 33954608387.2844801 │ ['232.78.216.176','9.244.59.211','211.21.80.152','44.49.94.109','165.77.195.182','68.167.134.239','212.13.24.185','1.197.255.35','192.55.131.232'] │ 45d9:2b52:ab6:1c59:185b:515:c5b6:b781 │ +│ 2063-01-13 01:22:27 │ 36155064970.9514454 │ ['176.140.188.101'] │ c65a:2626:41df:8dee:ec99:f68d:c6dd:6b30 │ +│ 2090-02-28 14:50:56 │ 3864327452.3901373 │ ['155.114.30.32'] │ 57e9:5229:93ab:fbf3:aae7:e0e4:d1eb:86b │ +└─────────────────────┴─────────────────────┴────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┴─────────────────────────────────────────┘ ``` ## Related content From 288988b59912173883daa757fff394fc5f40b497 Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 11 May 2023 12:08:50 +0000 Subject: [PATCH 0070/1072] Fix build --- src/Formats/CapnProtoSerializer.cpp | 6 ++++++ src/Formats/CapnProtoSerializer.h | 4 ++++ 2 files changed, 10 insertions(+) diff --git a/src/Formats/CapnProtoSerializer.cpp b/src/Formats/CapnProtoSerializer.cpp index c31623286d0..00ccfc7717d 100644 --- a/src/Formats/CapnProtoSerializer.cpp +++ b/src/Formats/CapnProtoSerializer.cpp @@ -1,3 +1,7 @@ +#include "config.h" + +#if USE_CAPNP + #include #include #include @@ -1216,3 +1220,5 @@ void CapnProtoSerializer::readRow(MutableColumns & columns, capnp::DynamicStruct CapnProtoSerializer::~CapnProtoSerializer() = default; } + +#endif diff --git a/src/Formats/CapnProtoSerializer.h b/src/Formats/CapnProtoSerializer.h index efae797875b..692f5e5301f 100644 --- a/src/Formats/CapnProtoSerializer.h +++ b/src/Formats/CapnProtoSerializer.h @@ -1,5 +1,7 @@ #pragma once +#if USE_CAPNP + #include #include @@ -23,3 +25,5 @@ private: }; } + +#endif From 5ed1b12e19939e399dccc1776901c4e4ddede29a Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 11 May 2023 12:12:43 +0000 Subject: [PATCH 0071/1072] Fix build --- src/Functions/generateRandomStructure.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Functions/generateRandomStructure.cpp b/src/Functions/generateRandomStructure.cpp index 9fe321365e5..eccccdf563f 100644 --- a/src/Functions/generateRandomStructure.cpp +++ b/src/Functions/generateRandomStructure.cpp @@ -22,7 +22,7 @@ namespace ErrorCodes class FunctionGenerateRandomStructure : public IFunction { private: - static constexpr std::array simple_types + static constexpr std::array simple_types { TypeIndex::Int8, TypeIndex::UInt8, @@ -64,7 +64,7 @@ private: TypeIndex::Map, }; - static constexpr std::array map_key_types + static constexpr std::array map_key_types { TypeIndex::Int8, TypeIndex::UInt8, From 1daa9811222c80f2d957556d32be4f3309034e4b Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Fri, 12 May 2023 16:12:01 +0200 Subject: [PATCH 0072/1072] Fix special builds --- src/Formats/CapnProtoSerializer.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Formats/CapnProtoSerializer.cpp b/src/Formats/CapnProtoSerializer.cpp index 00ccfc7717d..091e70da656 100644 --- a/src/Formats/CapnProtoSerializer.cpp +++ b/src/Formats/CapnProtoSerializer.cpp @@ -1007,7 +1007,7 @@ namespace catch (Exception & e) { e.addMessage("(while converting column {})", column_name); - throw e; + throw std::move(e); } } @@ -1015,7 +1015,7 @@ namespace { assert(builder); auto & struct_builder = assert_cast(*builder); - if (auto tuple_column = typeid_cast(column.get())) + if (auto * tuple_column = typeid_cast(column.get())) writeRow(tuple_column->getColumnsCopy(), struct_builder, row_num); else writeRow(Columns{column}, struct_builder, row_num); From 7698776d2aae48880e45d8a791b7474b5ef20999 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Fri, 12 May 2023 18:53:51 +0200 Subject: [PATCH 0073/1072] Fix special build --- src/Formats/CapnProtoSchema.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Formats/CapnProtoSchema.cpp b/src/Formats/CapnProtoSchema.cpp index 22518d5061a..f9ab88d39ed 100644 --- a/src/Formats/CapnProtoSchema.cpp +++ b/src/Formats/CapnProtoSchema.cpp @@ -151,7 +151,7 @@ namespace { template - static DataTypePtr getEnumDataTypeFromEnumerants(const capnp::EnumSchema::EnumerantList & enumerants) + DataTypePtr getEnumDataTypeFromEnumerants(const capnp::EnumSchema::EnumerantList & enumerants) { std::vector> values; for (auto enumerant : enumerants) @@ -159,7 +159,7 @@ namespace return std::make_shared>(std::move(values)); } - static DataTypePtr getEnumDataTypeFromEnumSchema(const capnp::EnumSchema & enum_schema) + DataTypePtr getEnumDataTypeFromEnumSchema(const capnp::EnumSchema & enum_schema) { auto enumerants = enum_schema.getEnumerants(); if (enumerants.size() < 128) @@ -170,7 +170,7 @@ namespace throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "ClickHouse supports only 8 and 16-bit Enums"); } - static DataTypePtr getDataTypeFromCapnProtoType(const capnp::Type & capnp_type, bool skip_unsupported_fields) + DataTypePtr getDataTypeFromCapnProtoType(const capnp::Type & capnp_type, bool skip_unsupported_fields) { switch (capnp_type.which()) { From c9f90fb9adba16ad1131a16d46a6efefdd532379 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Fri, 12 May 2023 18:54:38 +0200 Subject: [PATCH 0074/1072] Fix special build --- src/Formats/CapnProtoSerializer.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Formats/CapnProtoSerializer.cpp b/src/Formats/CapnProtoSerializer.cpp index 091e70da656..ff3880976c7 100644 --- a/src/Formats/CapnProtoSerializer.cpp +++ b/src/Formats/CapnProtoSerializer.cpp @@ -158,7 +158,7 @@ namespace }; template - static std::unique_ptr createIntegerSerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type) + std::unique_ptr createIntegerSerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type) { switch (capnp_type.which()) { @@ -1015,7 +1015,7 @@ namespace { assert(builder); auto & struct_builder = assert_cast(*builder); - if (auto * tuple_column = typeid_cast(column.get())) + if (const auto * tuple_column = typeid_cast(column.get())) writeRow(tuple_column->getColumnsCopy(), struct_builder, row_num); else writeRow(Columns{column}, struct_builder, row_num); From 602b9a740ec2bf3064d2970c54fbc92da9304991 Mon Sep 17 00:00:00 2001 From: avogar Date: Fri, 12 May 2023 19:39:33 +0000 Subject: [PATCH 0075/1072] Make better, allow generateRandom without structure argument --- .../sql-reference/table-functions/generate.md | 30 +- ...pp => FunctionGenerateRandomStructure.cpp} | 402 ++++++++++-------- .../FunctionGenerateRandomStructure.h | 45 ++ .../TableFunctionGenerateRandom.cpp | 67 ++- .../02586_generate_random_structure.reference | 16 +- .../02586_generate_random_structure.sql | 6 + 6 files changed, 369 insertions(+), 197 deletions(-) rename src/Functions/{generateRandomStructure.cpp => FunctionGenerateRandomStructure.cpp} (51%) create mode 100644 src/Functions/FunctionGenerateRandomStructure.h diff --git a/docs/en/sql-reference/table-functions/generate.md b/docs/en/sql-reference/table-functions/generate.md index b2777418e4f..6ceeb63cbb3 100644 --- a/docs/en/sql-reference/table-functions/generate.md +++ b/docs/en/sql-reference/table-functions/generate.md @@ -11,7 +11,7 @@ Allows to populate test tables with data. Not all types are supported. ``` sql -generateRandom('name TypeName[, name TypeName]...', [, 'random_seed'[, 'max_string_length'[, 'max_array_length']]]) +generateRandom(['name TypeName[, name TypeName]...', [, 'random_seed'[, 'max_string_length'[, 'max_array_length']]]]) ``` **Arguments** @@ -67,5 +67,33 @@ SELECT * FROM generateRandom(generateRandomStructure(4, 101), 101) LIMIT 3; └─────────────────────┴─────────────────────┴────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┴─────────────────────────────────────────┘ ``` +With missing `structure` argument (in this case the structure is random): + +```sql +SELECT * FROM generateRandom() LIMIT 3; +``` + +```text +┌───c1─┬─────────c2─┬─────────────────────c3─┬──────────────────────c4─┬─c5───────┐ +│ -128 │ 317300854 │ 2030-08-16 08:22:20.65 │ 1994-08-16 12:08:56.745 │ R0qgiC46 │ +│ 40 │ -744906827 │ 2059-04-16 06:31:36.98 │ 1975-07-16 16:28:43.893 │ PuH4M*MZ │ +│ -55 │ 698652232 │ 2052-08-04 20:13:39.68 │ 1998-09-20 03:48:29.279 │ │ +└──────┴────────────┴────────────────────────┴─────────────────────────┴──────────┘ +``` + +With random seed both for random structure and random data: + +```sql +SELECT * FROM generateRandom(11) LIMIT 3; +``` + +```text +┌───────────────────────────────────────c1─┬─────────────────────────────────────────────────────────────────────────────c2─┬─────────────────────────────────────────────────────────────────────────────c3─┬─────────c4─┬─────────────────────────────────────────────────────────────────────────────c5─┬──────────────────────c6─┬─c7──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬─c8──────────────────────────────────────┬─────────c9─┐ +│ -77422512305044606600216318673365695785 │ 636812099959807642229.503817849012019401335326013846687285151335352272727523 │ -34944452809785978175157829109276115789694605299387223845886143311647505037529 │ 544473976 │ 111220388331710079615337037674887514156741572807049614590010583571763691328563 │ 22016.22623506465 │ {'2052-01-31 20:25:33':4306400876908509081044405485378623663,'1993-04-16 15:58:49':164367354809499452887861212674772770279,'2101-08-19 03:07:18':-60676948945963385477105077735447194811,'2039-12-22 22:31:39':-59227773536703059515222628111999932330} │ a7b2:8f58:4d07:6707:4189:80cf:92f5:902d │ 1950-07-14 │ +│ -159940486888657488786004075627859832441 │ 629206527868163085099.8195700356331771569105231840157308480121506729741348442 │ -53203761250367440823323469081755775164053964440214841464405368882783634063735 │ 2187136525 │ 94881662451116595672491944222189810087991610568040618106057495823910493624275 │ 1.3095786748458954e-104 │ {} │ a051:e3da:2e0a:c69:7835:aed6:e8b:3817 │ 1943-03-25 │ +│ -5239084224358020595591895205940528518 │ -529937657954363597180.1709207212648004850138812370209091520162977548101577846 │ 47490343304582536176125359129223180987770215457970451211489086575421345731671 │ 1637451978 │ 101899445785010192893461828129714741298630410942962837910400961787305271699002 │ 2.4344456058391296e223 │ {'2013-12-22 17:42:43':80271108282641375975566414544777036006,'2041-03-08 10:28:17':169706054082247533128707458270535852845,'1986-08-31 23:07:38':-54371542820364299444195390357730624136,'2094-04-23 21:26:50':7944954483303909347454597499139023465} │ 1293:a726:e899:9bfc:8c6f:2aa1:22c9:b635 │ 1924-11-20 │ +└──────────────────────────────────────────┴────────────────────────────────────────────────────────────────────────────────┴────────────────────────────────────────────────────────────────────────────────┴────────────┴────────────────────────────────────────────────────────────────────────────────┴─────────────────────────┴─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┴─────────────────────────────────────────┴────────────┘ +``` + ## Related content - Blog: [Generating random data in ClickHouse](https://clickhouse.com/blog/generating-random-test-distribution-data-for-clickhouse) diff --git a/src/Functions/generateRandomStructure.cpp b/src/Functions/FunctionGenerateRandomStructure.cpp similarity index 51% rename from src/Functions/generateRandomStructure.cpp rename to src/Functions/FunctionGenerateRandomStructure.cpp index eccccdf563f..4cf783212cb 100644 --- a/src/Functions/generateRandomStructure.cpp +++ b/src/Functions/FunctionGenerateRandomStructure.cpp @@ -1,9 +1,10 @@ -#include -#include -#include +#include #include #include #include +#include +#include +#include #include #include @@ -19,10 +20,18 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; } -class FunctionGenerateRandomStructure : public IFunction +namespace { -private: - static constexpr std::array simple_types + const size_t MAX_NUMBER_OF_COLUMNS = 128; + const size_t MAX_TUPLE_ELEMENTS = 16; + const size_t MAX_DATETIME64_PRECISION = 9; + const size_t MAX_DECIMAL32_PRECISION = 9; + const size_t MAX_DECIMAL64_PRECISION = 18; + const size_t MAX_DECIMAL128_PRECISION = 38; + const size_t MAX_DECIMAL256_PRECISION = 76; + const size_t MAX_DEPTH = 32; + + constexpr std::array simple_types { TypeIndex::Int8, TypeIndex::UInt8, @@ -55,7 +64,7 @@ private: TypeIndex::UUID, }; - static constexpr std::array complex_types + constexpr std::array complex_types { TypeIndex::Nullable, TypeIndex::LowCardinality, @@ -64,7 +73,7 @@ private: TypeIndex::Map, }; - static constexpr std::array map_key_types + constexpr std::array map_key_types { TypeIndex::Int8, TypeIndex::UInt8, @@ -84,119 +93,147 @@ private: TypeIndex::String, TypeIndex::FixedString, TypeIndex::IPv4, + TypeIndex::Enum8, + TypeIndex::Enum16, + TypeIndex::UUID, + TypeIndex::LowCardinality, + }; + + constexpr std::array suspicious_lc_types + { + TypeIndex::Int8, + TypeIndex::UInt8, + TypeIndex::Int16, + TypeIndex::UInt16, + TypeIndex::Int32, + TypeIndex::UInt32, + TypeIndex::Int64, + TypeIndex::UInt64, + TypeIndex::Int128, + TypeIndex::UInt128, + TypeIndex::Int256, + TypeIndex::UInt256, + TypeIndex::Float32, + TypeIndex::Float64, + TypeIndex::Date, + TypeIndex::Date32, + TypeIndex::DateTime, + TypeIndex::String, + TypeIndex::FixedString, + TypeIndex::IPv4, + TypeIndex::IPv6, TypeIndex::UUID, }; - static constexpr size_t MAX_NUMBER_OF_COLUMNS = 128; - static constexpr size_t MAX_TUPLE_ELEMENTS = 16; - static constexpr size_t MAX_DATETIME64_PRECISION = 9; - static constexpr size_t MAX_DECIMAL32_PRECISION = 9; - static constexpr size_t MAX_DECIMAL64_PRECISION = 18; - static constexpr size_t MAX_DECIMAL128_PRECISION = 38; - static constexpr size_t MAX_DECIMAL256_PRECISION = 76; - static constexpr size_t MAX_DEPTH = 32; - -public: - static constexpr auto name = "generateRandomStructure"; - - static FunctionPtr create(ContextPtr /*context*/) + template + constexpr auto getAllTypes() { - return std::make_shared(); + constexpr size_t complex_types_size = complex_types.size() * allow_complex_types; + constexpr size_t result_size = simple_types.size() + complex_types_size; + std::array result; + size_t index = 0; + + for (size_t i = 0; i != simple_types.size(); ++i, ++index) + result[index] = simple_types[i]; + + for (size_t i = 0; i != complex_types_size; ++i, ++index) + result[index] = complex_types[i]; + + return result; } - String getName() const override { return name; } - - size_t getNumberOfArguments() const override { return 0; } - - bool isVariadic() const override { return true; } - bool isDeterministic() const override { return false; } - bool isDeterministicInScopeOfQuery() const override { return false; } - bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; } - ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {0, 1}; } - bool useDefaultImplementationForConstants() const override { return false; } - bool useDefaultImplementationForNulls() const override { return false; } - - DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override - { - if (arguments.size() > 2) - throw Exception( - ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, - "Number of arguments for function {} doesn't match: passed {}, expected from 0 to 2", - getName(), arguments.size()); - - - for (size_t i = 0; i != 2; ++i) - { - if (arguments.size() == i) - break; - - if (!isUnsignedInteger(arguments[i]) && !arguments[i]->onlyNull()) - { - throw Exception( - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Illegal type {} of the {} argument of function {}, expected unsigned integer or Null", - i + 1, - arguments[i]->getName(), - getName()); - } - } - - return std::make_shared(); - } - - ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override - { - size_t seed = randomSeed(); - size_t number_of_columns = 0; - - if (!arguments.empty() && !arguments[0].column->onlyNull()) - { - number_of_columns = arguments[0].column->getUInt(0); - if (number_of_columns > MAX_NUMBER_OF_COLUMNS) - throw Exception( - ErrorCodes::BAD_ARGUMENTS, - "Maximum allowed number of columns is {}, got {}", - MAX_NUMBER_OF_COLUMNS, - number_of_columns); - } - - if (arguments.size() > 1 && !arguments[1].column->onlyNull()) - seed = arguments[1].column->getUInt(0); - - pcg64 rng(seed); - if (number_of_columns == 0) - number_of_columns = generateNumberOfColumns(rng); - - auto col_res = ColumnString::create(); - auto & string_column = assert_cast(*col_res); - auto & chars = string_column.getChars(); - WriteBufferFromVector buf(chars); - for (size_t i = 0; i != number_of_columns; ++i) - { - if (i != 0) - writeCString(", ", buf); - String column_name = "c" + std::to_string(i + 1); - writeString(column_name, buf); - writeChar(' ', buf); - writeRandomType(column_name, rng, buf); - } - - buf.finalize(); - chars.push_back(0); - string_column.getOffsets().push_back(chars.size()); - return ColumnConst::create(std::move(col_res), input_rows_count); - } - -private: - - size_t generateNumberOfColumns(pcg64 & rng) const + size_t generateNumberOfColumns(pcg64 & rng) { return rng() % MAX_NUMBER_OF_COLUMNS + 1; } - template - void writeRandomType(const String & column_name, pcg64 & rng, WriteBuffer & buf, size_t depth = 0) const + void writeLowCardinalityNestedType(pcg64 & rng, WriteBuffer & buf, bool allow_suspicious_lc_types, bool allow_nullable = true) { + bool make_nullable = allow_nullable & rng() % 2; + if (make_nullable) + writeCString("Nullable(", buf); + + if (allow_suspicious_lc_types) + { + TypeIndex type = suspicious_lc_types[rng() % map_key_types.size()]; + if (type == TypeIndex::FixedString) + writeString("FixedString(" + std::to_string(rng() % MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS + 1) + ")", buf); + else + writeString(magic_enum::enum_name(type), buf); + } + else + { + /// Support only String and FixedString. + if (rng() % 2) + writeCString("String", buf); + else + writeString("FixedString(" + std::to_string(rng() % MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS + 1) + ")", buf); + } + + if (make_nullable) + writeChar(')', buf); + } + + + void writeEnumValues(const String & column_name, pcg64 & rng, WriteBuffer & buf, ssize_t max_value) + { + /// Don't generate big enums, because it will lead to really big result + /// and slowness of this function, and it can lead to `Max query size exceeded` + /// while using this function with generateRandom. + size_t num_values = rng() % 16 + 1; + std::vector values(num_values); + + /// Generate random numbers from range [-(max_value + 1), max_value - num_values + 1] + for (Int16 & x : values) + x = rng() % (2 * (max_value + 1) - num_values) - max_value - 1; + /// Make all numbers unique. + std::sort(values.begin(), values.end()); + for (size_t i = 0; i < num_values; ++i) + values[i] += i; + std::shuffle(values.begin(), values.end(), rng); + for (size_t i = 0; i != num_values; ++i) + { + if (i != 0) + writeCString(", ", buf); + writeString("'" + column_name + "V" + std::to_string(i) + "' = " + std::to_string(values[i]), buf); + } + } + + void writeMapKeyType(const String & column_name, pcg64 & rng, WriteBuffer & buf, bool allow_suspicious_lc_types) + { + TypeIndex type = map_key_types[rng() % map_key_types.size()]; + switch (type) + { + case TypeIndex::FixedString: + writeString("FixedString(" + std::to_string(rng() % MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS + 1) + ")", buf); + break; + case TypeIndex::LowCardinality: + writeCString("LowCardinality(", buf); + writeLowCardinalityNestedType(rng, buf, allow_suspicious_lc_types, false); + writeChar(')', buf); + break; + case TypeIndex::Enum8: + writeCString("Enum8(", buf); + writeEnumValues(column_name, rng, buf, INT8_MAX); + writeChar(')', buf); + break; + case TypeIndex::Enum16: + writeCString("Enum16(", buf); + writeEnumValues(column_name, rng, buf, INT16_MAX); + writeChar(')', buf); + break; + default: + writeString(magic_enum::enum_name(type), buf); + break; + } + } + + template + void writeRandomType(const String & column_name, pcg64 & rng, WriteBuffer & buf, bool allow_suspicious_lc_types, size_t depth = 0) + { + if (allow_complex_types && depth > MAX_DEPTH) + writeRandomType(column_name, rng, buf, depth); + constexpr auto all_types = getAllTypes(); auto type = all_types[rng() % all_types.size()]; @@ -212,55 +249,55 @@ private: writeString("FixedString(" + std::to_string(rng() % MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS + 1) + ")", buf); return; case TypeIndex::DateTime64: - writeString("DateTime64(" + std::to_string(rng() % MAX_DATETIME64_PRECISION) + ")", buf); + writeString("DateTime64(" + std::to_string(rng() % MAX_DATETIME64_PRECISION + 1) + ")", buf); return; case TypeIndex::Decimal32: - writeString("Decimal32(" + std::to_string(rng() % MAX_DECIMAL32_PRECISION) + ")", buf); + writeString("Decimal32(" + std::to_string(rng() % MAX_DECIMAL32_PRECISION + 1) + ")", buf); return; case TypeIndex::Decimal64: - writeString("Decimal64(" + std::to_string(rng() % MAX_DECIMAL64_PRECISION) + ")", buf); + writeString("Decimal64(" + std::to_string(rng() % MAX_DECIMAL64_PRECISION + 1) + ")", buf); return; case TypeIndex::Decimal128: - writeString("Decimal128(" + std::to_string(rng() % MAX_DECIMAL128_PRECISION) + ")", buf); + writeString("Decimal128(" + std::to_string(rng() % MAX_DECIMAL128_PRECISION + 1) + ")", buf); return; case TypeIndex::Decimal256: - writeString("Decimal256(" + std::to_string(rng() % MAX_DECIMAL256_PRECISION) + ")", buf); + writeString("Decimal256(" + std::to_string(rng() % MAX_DECIMAL256_PRECISION + 1) + ")", buf); return; case TypeIndex::Enum8: writeCString("Enum8(", buf); - writeEnumValues(column_name, rng, buf); + writeEnumValues(column_name, rng, buf, INT8_MAX); writeChar(')', buf); return; case TypeIndex::Enum16: writeCString("Enum16(", buf); - writeEnumValues(column_name, rng, buf); + writeEnumValues(column_name, rng, buf, INT16_MAX); writeChar(')', buf); return; case TypeIndex::LowCardinality: writeCString("LowCardinality(", buf); - writeLowCardinalityNestedType(rng, buf); + writeLowCardinalityNestedType(rng, buf, allow_suspicious_lc_types); writeChar(')', buf); return; case TypeIndex::Nullable: { writeCString("Nullable(", buf); - writeRandomType(column_name, rng, buf, depth + 1); + writeRandomType(column_name, rng, buf, allow_suspicious_lc_types, depth + 1); writeChar(')', buf); return; } case TypeIndex::Array: { writeCString("Array(", buf); - writeRandomType(column_name, rng, buf, depth + 1); + writeRandomType(column_name, rng, buf, allow_suspicious_lc_types, depth + 1); writeChar(')', buf); return; } case TypeIndex::Map: { writeCString("Map(", buf); - writeMapKeyType(rng, buf); + writeMapKeyType(column_name, rng, buf, allow_suspicious_lc_types); writeCString(", ", buf); - writeRandomType(column_name, rng, buf, depth + 1); + writeRandomType(column_name, rng, buf, allow_suspicious_lc_types, depth + 1); writeChar(')', buf); return; } @@ -285,7 +322,7 @@ private: writeString(element_name, buf); writeChar(' ', buf); } - writeRandomType(element_name, rng, buf, depth + 1); + writeRandomType(element_name, rng, buf, allow_suspicious_lc_types, depth + 1); } writeChar(')', buf); return; @@ -296,64 +333,87 @@ private: } } - void writeMapKeyType(pcg64 & rng, WriteBuffer & buf) const + void writeRandomStructure(pcg64 & rng, size_t number_of_columns, WriteBuffer & buf, bool allow_suspicious_lc_types) { - TypeIndex type = map_key_types[rng() % map_key_types.size()]; - if (type == TypeIndex::FixedString) - writeString("FixedString(" + std::to_string(rng() % MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS + 1) + ")", buf); - else - writeString(magic_enum::enum_name(type), buf); - } - - void writeLowCardinalityNestedType(pcg64 & rng, WriteBuffer & buf) const - { - /// Support only String and FixedString (maybe Nullable). - String nested_type; - bool make_nullable = rng() % 2; - if (make_nullable) - writeCString("Nullable(", buf); - - if (rng() % 2) - writeCString("String", buf); - else - writeString("FixedString(" + std::to_string(rng() % MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS + 1) + ")", buf); - - if (make_nullable) - writeChar(')', buf); - } - - void writeEnumValues(const String & column_name, pcg64 & rng, WriteBuffer & buf) const - { - /// Don't generate big enums, because it will lead to really big result - /// and slowness of this function, and it can lead to `Max query size exceeded` - /// while using this function with generateRandom. - ssize_t num_values = rng() % 16 + 1; - for (ssize_t i = 0; i != num_values; ++i) + for (size_t i = 0; i != number_of_columns; ++i) { if (i != 0) writeCString(", ", buf); - writeString("'" + column_name + "V" + std::to_string(i) + "' = " + std::to_string(i), buf); + String column_name = "c" + std::to_string(i + 1); + writeString(column_name, buf); + writeChar(' ', buf); + writeRandomType(column_name, rng, buf, allow_suspicious_lc_types); + } + } +} + +DataTypePtr FunctionGenerateRandomStructure::getReturnTypeImpl(const DataTypes & arguments) const +{ + if (arguments.size() > 2) + throw Exception( + ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Number of arguments for function {} doesn't match: passed {}, expected from 0 to 2", + getName(), arguments.size()); + + + for (size_t i = 0; i != arguments.size(); ++i) + { + if (!isUnsignedInteger(arguments[i]) && !arguments[i]->onlyNull()) + { + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type {} of the {} argument of function {}, expected unsigned integer or Null", + arguments[i]->getName(), + i + 1, + getName()); } } - template - static constexpr auto getAllTypes() + return std::make_shared(); +} + +ColumnPtr FunctionGenerateRandomStructure::executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const +{ + size_t seed = randomSeed(); + size_t number_of_columns = 0; + + if (!arguments.empty() && !arguments[0].column->onlyNull()) { - constexpr size_t complex_types_size = complex_types.size() * allow_complex_types; - constexpr size_t result_size = simple_types.size() + complex_types_size; - std::array result; - size_t index = 0; - - for (size_t i = 0; i != simple_types.size(); ++i, ++index) - result[index] = simple_types[i]; - - for (size_t i = 0; i != complex_types_size; ++i, ++index) - result[index] = complex_types[i]; - - return result; + number_of_columns = arguments[0].column->getUInt(0); + if (number_of_columns > MAX_NUMBER_OF_COLUMNS) + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "Maximum allowed number of columns is {}, got {}", + MAX_NUMBER_OF_COLUMNS, + number_of_columns); } -}; + if (arguments.size() > 1 && !arguments[1].column->onlyNull()) + seed = arguments[1].column->getUInt(0); + + pcg64 rng(seed); + if (number_of_columns == 0) + number_of_columns = generateNumberOfColumns(rng); + + auto col_res = ColumnString::create(); + auto & string_column = assert_cast(*col_res); + auto & chars = string_column.getChars(); + WriteBufferFromVector buf(chars); + writeRandomStructure(rng, number_of_columns, buf, allow_suspicious_lc_types); + buf.finalize(); + chars.push_back(0); + string_column.getOffsets().push_back(chars.size()); + return ColumnConst::create(std::move(col_res), input_rows_count); +} + +String FunctionGenerateRandomStructure::generateRandomStructure(size_t seed, const ContextPtr & context) +{ + pcg64 rng(seed); + size_t number_of_columns = generateNumberOfColumns(rng); + WriteBufferFromOwnString buf; + writeRandomStructure(rng, number_of_columns, buf, context->getSettingsRef().allow_suspicious_low_cardinality_types); + return buf.str(); +} REGISTER_FUNCTION(GenerateRandomStructure) { diff --git a/src/Functions/FunctionGenerateRandomStructure.h b/src/Functions/FunctionGenerateRandomStructure.h new file mode 100644 index 00000000000..1d1bcb1a0a8 --- /dev/null +++ b/src/Functions/FunctionGenerateRandomStructure.h @@ -0,0 +1,45 @@ +#include +#include + +#include + +namespace DB +{ + +class FunctionGenerateRandomStructure : public IFunction +{ +public: + static constexpr auto name = "generateRandomStructure"; + + explicit FunctionGenerateRandomStructure(bool allow_suspicious_lc_types_) : allow_suspicious_lc_types(allow_suspicious_lc_types_) + { + } + + static FunctionPtr create(ContextPtr context) + { + return std::make_shared(context->getSettingsRef().allow_suspicious_low_cardinality_types.value); + } + + String getName() const override { return name; } + + size_t getNumberOfArguments() const override { return 0; } + + bool isVariadic() const override { return true; } + bool isDeterministic() const override { return false; } + bool isDeterministicInScopeOfQuery() const override { return false; } + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; } + ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {0, 1}; } + bool useDefaultImplementationForConstants() const override { return false; } + bool useDefaultImplementationForNulls() const override { return false; } + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override; + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override; + + static String generateRandomStructure(size_t seed, const ContextPtr & context); + +private: + bool allow_suspicious_lc_types; +}; + +} diff --git a/src/TableFunctions/TableFunctionGenerateRandom.cpp b/src/TableFunctions/TableFunctionGenerateRandom.cpp index 12cbda334a3..a9e81c78c07 100644 --- a/src/TableFunctions/TableFunctionGenerateRandom.cpp +++ b/src/TableFunctions/TableFunctionGenerateRandom.cpp @@ -1,20 +1,20 @@ -#include #include -#include #include #include #include #include -#include #include #include #include +#include #include #include +#include + #include "registerTableFunctions.h" @@ -41,13 +41,21 @@ void TableFunctionGenerateRandom::parseArguments(const ASTPtr & ast_function, Co if (args.empty()) return; - if (args.size() > 4) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, - "Table function '{}' requires at most four arguments: " - " structure, [random_seed, max_string_length, max_array_length].", getName()); + /// First, check if first argument is structure or seed. + const auto * first_arg_literal = args[0]->as(); + bool first_argument_is_structure = !first_arg_literal || first_arg_literal->value.getType() == Field::Types::String; + size_t max_args = first_argument_is_structure ? 4 : 3; - /// Allow constant expression for structure argument, it can be generated using generateRandomStructure function. - args[0] = evaluateConstantExpressionAsLiteral(args[0], context); + if (args.size() > max_args) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Table function '{}' requires at most four (or three if structure is missing) arguments: " + " [structure, random_seed, max_string_length, max_array_length].", getName()); + + if (first_argument_is_structure) + { + /// Allow constant expression for structure argument, it can be generated using generateRandomStructure function. + args[0] = evaluateConstantExpressionAsLiteral(args[0], context); + } // All the arguments must be literals. for (const auto & arg : args) @@ -55,26 +63,39 @@ void TableFunctionGenerateRandom::parseArguments(const ASTPtr & ast_function, Co if (!arg->as()) { throw Exception(ErrorCodes::BAD_ARGUMENTS, - "All arguments of table function '{}' must be literals. " + "All arguments of table function '{}' except structure argument must be literals. " "Got '{}' instead", getName(), arg->formatForErrorMessage()); } } - /// Parsing first argument as table structure and creating a sample block - structure = checkAndGetLiteralArgument(args[0], "structure"); + size_t arg_index = 0; - if (args.size() >= 2) + if (first_argument_is_structure) { - const auto & literal = args[1]->as(); + /// Parsing first argument as table structure and creating a sample block + structure = checkAndGetLiteralArgument(args[arg_index], "structure"); + ++arg_index; + } + + if (args.size() >= arg_index + 1) + { + const auto & literal = args[arg_index]->as(); + ++arg_index; if (!literal.value.isNull()) random_seed = checkAndGetLiteralArgument(literal, "random_seed"); } - if (args.size() >= 3) - max_string_length = checkAndGetLiteralArgument(args[2], "max_string_length"); + if (args.size() >= arg_index + 1) + { + max_string_length = checkAndGetLiteralArgument(args[arg_index], "max_string_length"); + ++arg_index; + } - if (args.size() == 4) - max_array_length = checkAndGetLiteralArgument(args[3], "max_string_length"); + if (args.size() == arg_index + 1) + { + max_array_length = checkAndGetLiteralArgument(args[arg_index], "max_string_length"); + ++arg_index; + } } ColumnsDescription TableFunctionGenerateRandom::getActualTableStructure(ContextPtr context) const @@ -82,11 +103,11 @@ ColumnsDescription TableFunctionGenerateRandom::getActualTableStructure(ContextP if (structure == "auto") { if (structure_hint.empty()) - throw Exception( - ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, - "Table function '{}' was used without structure argument but structure could not be determined automatically. Please, " - "provide structure manually", - getName()); + { + auto random_structure = FunctionGenerateRandomStructure::generateRandomStructure(random_seed.value_or(randomSeed()), context); + return parseColumnsListFromString(random_structure, context); + } + return structure_hint; } diff --git a/tests/queries/0_stateless/02586_generate_random_structure.reference b/tests/queries/0_stateless/02586_generate_random_structure.reference index 65bdc530f10..bd2009830f9 100644 --- a/tests/queries/0_stateless/02586_generate_random_structure.reference +++ b/tests/queries/0_stateless/02586_generate_random_structure.reference @@ -1,4 +1,16 @@ -c1 String, c2 UInt256, c3 String, c4 Decimal128(7), c5 UInt128 +c1 String, c2 UInt256, c3 String, c4 Decimal128(8), c5 UInt128 String Const(String) -` 90465455320735604871982424534384518837533904778028808627865442405232847164685 5& -3034771008825448884614719061068.2821046 75820566154622566322847299106656624693 +` 90465455320735604871982424534384518837533904778028808627865442405232847164685 5& -303477100882544888461471906106.82821046 75820566154622566322847299106656624693 +c1 Int128 +c2 Decimal(76, 55) +c3 Int256 +c4 UInt32 +c5 UInt256 +c6 Float64 +c7 Map(DateTime, Int128) +c8 IPv6 +c9 Date32 +-77422512305044606600216318673365695785 -178276798036269625488.0220515892112491429818466547307797481494678716313923193 36368120999598076422295038178490120194013353260138466872851513353522727275230 2299358810 12701207658267404852859640589581641341858007661085134086312689265075880787713 -9.78063876538428e-227 {'2063-09-16 00:40:36':127575633389498667752072479581409788016,'2052-11-08 23:07:13':-53938349319031918752329205601430421582,'2104-07-17 19:59:35':84394139582225600743319968813775553298,'2034-11-07 21:10:22':151550220355687100498925996413330909596} 328a:eccb:530f:23c3:275d:7eec:2b1b:9c29 2112-05-13 +-77422512305044606600216318673365695785 -178276798036269625488.0220515892112491429818466547307797481494678716313923193 36368120999598076422295038178490120194013353260138466872851513353522727275230 2299358810 12701207658267404852859640589581641341858007661085134086312689265075880787713 -9.78063876538428e-227 {'2063-09-16 00:40:36':127575633389498667752072479581409788016,'2052-11-08 23:07:13':-53938349319031918752329205601430421582,'2104-07-17 19:59:35':84394139582225600743319968813775553298,'2034-11-07 21:10:22':151550220355687100498925996413330909596} 328a:eccb:530f:23c3:275d:7eec:2b1b:9c29 2112-05-13 +-77422512305044606600216318673365695785 -178276798036269625488.0220515892112491429818466547307797481494678716313923193 36368120999598076422295038178490120194013353260138466872851513353522727275230 2299358810 12701207658267404852859640589581641341858007661085134086312689265075880787713 -9.78063876538428e-227 {'2063-09-16 00:40:36':166979754159728572703419507823025932071} 8eff:8d3c:5a2c:fa5f:b2bf:2b0e:ff23:beb2 2143-03-03 diff --git a/tests/queries/0_stateless/02586_generate_random_structure.sql b/tests/queries/0_stateless/02586_generate_random_structure.sql index a28c159cff5..174e2be261e 100644 --- a/tests/queries/0_stateless/02586_generate_random_structure.sql +++ b/tests/queries/0_stateless/02586_generate_random_structure.sql @@ -9,3 +9,9 @@ select generateRandomStructure(5, '42'); -- {serverError ILLEGAL_TYPE_OF_ARGUMEN select generateRandomStructure(materialize(5), 42); -- {serverError ILLEGAL_COLUMN} select generateRandomStructure(5, materialize(42)); -- {serverError ILLEGAL_COLUMN} +desc generateRandom(11); +select * from generateRandom(11) limit 1; +select * from generateRandom(11, 2) limit 1; +select * from generateRandom(11, 2, 2) limit 1; +select * from generateRandom(11, 2, 2, 2) limit 1; -- {serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH} + From aa7ab1f23badfc798116e7eeece8e62a0dfcfa9b Mon Sep 17 00:00:00 2001 From: avogar Date: Mon, 15 May 2023 11:20:03 +0000 Subject: [PATCH 0076/1072] Fix comments --- .../functions/other-functions.md | 2 ++ .../FunctionGenerateRandomStructure.cpp | 25 +++++++++++-------- .../02586_generate_random_structure.reference | 1 + .../02586_generate_random_structure.sql | 3 +++ 4 files changed, 21 insertions(+), 10 deletions(-) diff --git a/docs/en/sql-reference/functions/other-functions.md b/docs/en/sql-reference/functions/other-functions.md index e235a3db393..20e1168ed5a 100644 --- a/docs/en/sql-reference/functions/other-functions.md +++ b/docs/en/sql-reference/functions/other-functions.md @@ -2677,5 +2677,7 @@ Result: └────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ ``` +**Note**: the maximum nesting depth of complex types (Array, Tuple, Map, Nested) is limited to 16. + This function can be used together with [generateRandom](../../sql-reference/table-functions/generate.md) to generate completely random tables. diff --git a/src/Functions/FunctionGenerateRandomStructure.cpp b/src/Functions/FunctionGenerateRandomStructure.cpp index 4cf783212cb..75455cdda85 100644 --- a/src/Functions/FunctionGenerateRandomStructure.cpp +++ b/src/Functions/FunctionGenerateRandomStructure.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include @@ -29,7 +30,7 @@ namespace const size_t MAX_DECIMAL64_PRECISION = 18; const size_t MAX_DECIMAL128_PRECISION = 38; const size_t MAX_DECIMAL256_PRECISION = 76; - const size_t MAX_DEPTH = 32; + const size_t MAX_DEPTH = 16; constexpr std::array simple_types { @@ -147,15 +148,16 @@ namespace return rng() % MAX_NUMBER_OF_COLUMNS + 1; } - void writeLowCardinalityNestedType(pcg64 & rng, WriteBuffer & buf, bool allow_suspicious_lc_types, bool allow_nullable = true) + void writeLowCardinalityNestedType(pcg64 & rng, WriteBuffer & buf, bool allow_suspicious_lc_types) { - bool make_nullable = allow_nullable & rng() % 2; + bool make_nullable = rng() % 2; if (make_nullable) writeCString("Nullable(", buf); if (allow_suspicious_lc_types) { - TypeIndex type = suspicious_lc_types[rng() % map_key_types.size()]; + TypeIndex type = suspicious_lc_types[rng() % suspicious_lc_types.size()]; + if (type == TypeIndex::FixedString) writeString("FixedString(" + std::to_string(rng() % MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS + 1) + ")", buf); else @@ -174,7 +176,6 @@ namespace writeChar(')', buf); } - void writeEnumValues(const String & column_name, pcg64 & rng, WriteBuffer & buf, ssize_t max_value) { /// Don't generate big enums, because it will lead to really big result @@ -183,9 +184,9 @@ namespace size_t num_values = rng() % 16 + 1; std::vector values(num_values); - /// Generate random numbers from range [-(max_value + 1), max_value - num_values + 1] + /// Generate random numbers from range [-(max_value + 1), max_value - num_values + 1]. for (Int16 & x : values) - x = rng() % (2 * (max_value + 1) - num_values) - max_value - 1; + x = rng() % (2 * max_value + 3 - num_values) - max_value - 1; /// Make all numbers unique. std::sort(values.begin(), values.end()); for (size_t i = 0; i < num_values; ++i) @@ -199,7 +200,7 @@ namespace } } - void writeMapKeyType(const String & column_name, pcg64 & rng, WriteBuffer & buf, bool allow_suspicious_lc_types) + void writeMapKeyType(const String & column_name, pcg64 & rng, WriteBuffer & buf) { TypeIndex type = map_key_types[rng() % map_key_types.size()]; switch (type) @@ -209,7 +210,11 @@ namespace break; case TypeIndex::LowCardinality: writeCString("LowCardinality(", buf); - writeLowCardinalityNestedType(rng, buf, allow_suspicious_lc_types, false); + /// Map key supports only String and FixedString inside LowCardinality. + if (rng() % 2) + writeCString("String", buf); + else + writeString("FixedString(" + std::to_string(rng() % MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS + 1) + ")", buf); writeChar(')', buf); break; case TypeIndex::Enum8: @@ -295,7 +300,7 @@ namespace case TypeIndex::Map: { writeCString("Map(", buf); - writeMapKeyType(column_name, rng, buf, allow_suspicious_lc_types); + writeMapKeyType(column_name, rng, buf); writeCString(", ", buf); writeRandomType(column_name, rng, buf, allow_suspicious_lc_types, depth + 1); writeChar(')', buf); diff --git a/tests/queries/0_stateless/02586_generate_random_structure.reference b/tests/queries/0_stateless/02586_generate_random_structure.reference index bd2009830f9..e6e2c73ad87 100644 --- a/tests/queries/0_stateless/02586_generate_random_structure.reference +++ b/tests/queries/0_stateless/02586_generate_random_structure.reference @@ -14,3 +14,4 @@ c9 Date32 -77422512305044606600216318673365695785 -178276798036269625488.0220515892112491429818466547307797481494678716313923193 36368120999598076422295038178490120194013353260138466872851513353522727275230 2299358810 12701207658267404852859640589581641341858007661085134086312689265075880787713 -9.78063876538428e-227 {'2063-09-16 00:40:36':127575633389498667752072479581409788016,'2052-11-08 23:07:13':-53938349319031918752329205601430421582,'2104-07-17 19:59:35':84394139582225600743319968813775553298,'2034-11-07 21:10:22':151550220355687100498925996413330909596} 328a:eccb:530f:23c3:275d:7eec:2b1b:9c29 2112-05-13 -77422512305044606600216318673365695785 -178276798036269625488.0220515892112491429818466547307797481494678716313923193 36368120999598076422295038178490120194013353260138466872851513353522727275230 2299358810 12701207658267404852859640589581641341858007661085134086312689265075880787713 -9.78063876538428e-227 {'2063-09-16 00:40:36':127575633389498667752072479581409788016,'2052-11-08 23:07:13':-53938349319031918752329205601430421582,'2104-07-17 19:59:35':84394139582225600743319968813775553298,'2034-11-07 21:10:22':151550220355687100498925996413330909596} 328a:eccb:530f:23c3:275d:7eec:2b1b:9c29 2112-05-13 -77422512305044606600216318673365695785 -178276798036269625488.0220515892112491429818466547307797481494678716313923193 36368120999598076422295038178490120194013353260138466872851513353522727275230 2299358810 12701207658267404852859640589581641341858007661085134086312689265075880787713 -9.78063876538428e-227 {'2063-09-16 00:40:36':166979754159728572703419507823025932071} 8eff:8d3c:5a2c:fa5f:b2bf:2b0e:ff23:beb2 2143-03-03 +c1 LowCardinality(Nullable(UInt64)), c2 Date32, c3 LowCardinality(Nullable(Float64)), c4 Int256, c5 Date32 diff --git a/tests/queries/0_stateless/02586_generate_random_structure.sql b/tests/queries/0_stateless/02586_generate_random_structure.sql index 174e2be261e..b9cec1a436a 100644 --- a/tests/queries/0_stateless/02586_generate_random_structure.sql +++ b/tests/queries/0_stateless/02586_generate_random_structure.sql @@ -15,3 +15,6 @@ select * from generateRandom(11, 2) limit 1; select * from generateRandom(11, 2, 2) limit 1; select * from generateRandom(11, 2, 2, 2) limit 1; -- {serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH} +set allow_suspicious_low_cardinality_types=1; +select generateRandomStructure(5, 4); + From eef0a433e57598405009cf3f859da767f7415972 Mon Sep 17 00:00:00 2001 From: avogar Date: Mon, 15 May 2023 11:24:18 +0000 Subject: [PATCH 0077/1072] Add note about possible huge output --- docs/en/sql-reference/table-functions/generate.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/en/sql-reference/table-functions/generate.md b/docs/en/sql-reference/table-functions/generate.md index 6ceeb63cbb3..724f6d4a1f2 100644 --- a/docs/en/sql-reference/table-functions/generate.md +++ b/docs/en/sql-reference/table-functions/generate.md @@ -95,5 +95,7 @@ SELECT * FROM generateRandom(11) LIMIT 3; └──────────────────────────────────────────┴────────────────────────────────────────────────────────────────────────────────┴────────────────────────────────────────────────────────────────────────────────┴────────────┴────────────────────────────────────────────────────────────────────────────────┴─────────────────────────┴─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┴─────────────────────────────────────────┴────────────┘ ``` +**Note:** `generateRandom(generateRandomStructure(), [random seed], max_string_length, max_array_length)` with large enough `max_array_length` can generate really huge output due to possible big nesting depth (up to 16) of complex types (`Array`, `Tuple`, `Map`, `Nested`). + ## Related content - Blog: [Generating random data in ClickHouse](https://clickhouse.com/blog/generating-random-test-distribution-data-for-clickhouse) From c901d2a7be1f1791b75567f381bfddb7416c4beb Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Mon, 15 May 2023 13:46:18 +0200 Subject: [PATCH 0078/1072] Fix style --- src/Functions/FunctionGenerateRandomStructure.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Functions/FunctionGenerateRandomStructure.h b/src/Functions/FunctionGenerateRandomStructure.h index 1d1bcb1a0a8..894096a6e07 100644 --- a/src/Functions/FunctionGenerateRandomStructure.h +++ b/src/Functions/FunctionGenerateRandomStructure.h @@ -1,3 +1,5 @@ +#pragma once + #include #include From a07db551a7f0233b74d47555b98ca46cffb8b2a3 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Mon, 15 May 2023 13:46:36 +0200 Subject: [PATCH 0079/1072] Fix style --- src/TableFunctions/TableFunctionGenerateRandom.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/TableFunctions/TableFunctionGenerateRandom.cpp b/src/TableFunctions/TableFunctionGenerateRandom.cpp index a9e81c78c07..08059796660 100644 --- a/src/TableFunctions/TableFunctionGenerateRandom.cpp +++ b/src/TableFunctions/TableFunctionGenerateRandom.cpp @@ -26,7 +26,6 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; extern const int LOGICAL_ERROR; - extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; } void TableFunctionGenerateRandom::parseArguments(const ASTPtr & ast_function, ContextPtr context) From 362fa4849f0beccc988231ba3b107ca0868ccb16 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Mon, 15 May 2023 17:56:53 +0200 Subject: [PATCH 0080/1072] Try to fix build --- src/Functions/FunctionGenerateRandomStructure.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Functions/FunctionGenerateRandomStructure.cpp b/src/Functions/FunctionGenerateRandomStructure.cpp index 75455cdda85..9d818350fac 100644 --- a/src/Functions/FunctionGenerateRandomStructure.cpp +++ b/src/Functions/FunctionGenerateRandomStructure.cpp @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include @@ -431,12 +431,12 @@ the number of columns in the result structure (random by default) and random see The maximum number of columns is 128. The function returns a value of type String. )", - Documentation::Examples{ + FunctionDocumentation::Examples{ {"random", "SELECT generateRandomStructure()"}, {"with specified number of columns", "SELECT generateRandomStructure(10)"}, {"with specified seed", "SELECT generateRandomStructure(10, 42)"}, }, - Documentation::Categories{"Random"} + FunctionDocumentation::Categories{"Random"} }, FunctionFactory::CaseSensitive); } From d9be88a36a0a9221345e8300e954440d15605b8b Mon Sep 17 00:00:00 2001 From: Nikolay Degterinsky Date: Mon, 15 May 2023 21:40:10 +0000 Subject: [PATCH 0081/1072] Add UUID data type to PostgreSQL --- src/Storages/StoragePostgreSQL.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Storages/StoragePostgreSQL.cpp b/src/Storages/StoragePostgreSQL.cpp index e013199c584..09198e5bdad 100644 --- a/src/Storages/StoragePostgreSQL.cpp +++ b/src/Storages/StoragePostgreSQL.cpp @@ -267,6 +267,7 @@ public: else if (which.isFloat64()) nested_column = ColumnFloat64::create(); else if (which.isDate()) nested_column = ColumnUInt16::create(); else if (which.isDateTime()) nested_column = ColumnUInt32::create(); + else if (which.isUUID()) nested_column = ColumnUUID::create(); else if (which.isDateTime64()) { nested_column = ColumnDecimal::create(0, 6); From b6d2a84e830dbcb74343c70e32d5de06ddea3c70 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Tue, 16 May 2023 12:01:55 +0200 Subject: [PATCH 0082/1072] Try to fix build --- src/Functions/FunctionGenerateRandomStructure.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Functions/FunctionGenerateRandomStructure.cpp b/src/Functions/FunctionGenerateRandomStructure.cpp index 9d818350fac..4cbbdd84c4a 100644 --- a/src/Functions/FunctionGenerateRandomStructure.cpp +++ b/src/Functions/FunctionGenerateRandomStructure.cpp @@ -8,6 +8,8 @@ #include #include #include +#include +#include #include From d50e6fe8682a7a77572dfb3ea11541fecad25702 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Tue, 16 May 2023 15:35:16 +0200 Subject: [PATCH 0083/1072] Fix build after bad conflicts resolution --- src/Functions/FunctionGenerateRandomStructure.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/Functions/FunctionGenerateRandomStructure.cpp b/src/Functions/FunctionGenerateRandomStructure.cpp index 4cbbdd84c4a..16dac4f5112 100644 --- a/src/Functions/FunctionGenerateRandomStructure.cpp +++ b/src/Functions/FunctionGenerateRandomStructure.cpp @@ -426,19 +426,19 @@ REGISTER_FUNCTION(GenerateRandomStructure) { factory.registerFunction( { - R"( + .description=R"( Generates a random table structure. This function takes 2 optional constant arguments: the number of columns in the result structure (random by default) and random seed (random by default) The maximum number of columns is 128. The function returns a value of type String. )", - FunctionDocumentation::Examples{ - {"random", "SELECT generateRandomStructure()"}, - {"with specified number of columns", "SELECT generateRandomStructure(10)"}, - {"with specified seed", "SELECT generateRandomStructure(10, 42)"}, + .examples{ + {"random", "SELECT generateRandomStructure()", "c1 UInt32, c2 FixedString(25)"}, + {"with specified number of columns", "SELECT generateRandomStructure(3)", "c1 String, c2 Array(Int32), c3 LowCardinality(String)"}, + {"with specified seed", "SELECT generateRandomStructure(1, 42)", "c1 UInt128"}, }, - FunctionDocumentation::Categories{"Random"} + .categories{"Random"} }, FunctionFactory::CaseSensitive); } From 846804fed085680d4d4ae1ac5f34329e39006486 Mon Sep 17 00:00:00 2001 From: avogar Date: Wed, 17 May 2023 11:39:04 +0000 Subject: [PATCH 0084/1072] Add separate handshake_timeout for receiving Hello packet from replica --- docs/en/operations/settings/settings.md | 6 +++ src/Client/Connection.cpp | 6 ++- src/Client/Connection.h | 2 +- src/Client/ConnectionParameters.cpp | 3 +- src/Core/Settings.h | 1 + src/IO/ConnectionTimeouts.cpp | 58 +++++++++++++++---------- src/IO/ConnectionTimeouts.h | 14 ++++-- src/Server/TCPHandler.cpp | 7 +++ 8 files changed, 65 insertions(+), 32 deletions(-) diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index cddde2090f8..20779eba360 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -1048,6 +1048,12 @@ Timeouts in seconds on the socket used for communicating with the client. Default value: 10, 300, 300. +## handshake_timeout_ms {#handshake-timeout-ms} + +Timeout in milliseconds for receiving Hello packet from replicas during handshake. + +Default value: 300000. + ## cancel_http_readonly_queries_on_client_close {#cancel-http-readonly-queries-on-client-close} Cancels HTTP read-only queries (e.g. SELECT) when a client closes the connection without waiting for the response. diff --git a/src/Client/Connection.cpp b/src/Client/Connection.cpp index d39148d3016..09145bcdf1b 100644 --- a/src/Client/Connection.cpp +++ b/src/Client/Connection.cpp @@ -190,7 +190,7 @@ void Connection::connect(const ConnectionTimeouts & timeouts) connected = true; sendHello(); - receiveHello(); + receiveHello(timeouts.handshake_timeout); if (server_revision >= DBMS_MIN_PROTOCOL_VERSION_WITH_ADDENDUM) sendAddendum(); @@ -305,8 +305,10 @@ void Connection::sendAddendum() } -void Connection::receiveHello() +void Connection::receiveHello(const Poco::Timespan & handshake_timeout) { + TimeoutSetter timeout_setter(*socket, socket->getSendTimeout(), handshake_timeout); + /// Receive hello packet. UInt64 packet_type = 0; diff --git a/src/Client/Connection.h b/src/Client/Connection.h index 77dbe5e3398..cb3f2507cb9 100644 --- a/src/Client/Connection.h +++ b/src/Client/Connection.h @@ -256,7 +256,7 @@ private: void connect(const ConnectionTimeouts & timeouts); void sendHello(); void sendAddendum(); - void receiveHello(); + void receiveHello(const Poco::Timespan & handshake_timeout); #if USE_SSL void sendClusterNameAndSalt(); diff --git a/src/Client/ConnectionParameters.cpp b/src/Client/ConnectionParameters.cpp index 2031036eb58..c47d217d432 100644 --- a/src/Client/ConnectionParameters.cpp +++ b/src/Client/ConnectionParameters.cpp @@ -67,7 +67,8 @@ ConnectionParameters::ConnectionParameters(const Poco::Util::AbstractConfigurati Poco::Timespan(config.getInt("connect_timeout", DBMS_DEFAULT_CONNECT_TIMEOUT_SEC), 0), Poco::Timespan(config.getInt("send_timeout", DBMS_DEFAULT_SEND_TIMEOUT_SEC), 0), Poco::Timespan(config.getInt("receive_timeout", DBMS_DEFAULT_RECEIVE_TIMEOUT_SEC), 0), - Poco::Timespan(config.getInt("tcp_keep_alive_timeout", 0), 0)); + Poco::Timespan(config.getInt("tcp_keep_alive_timeout", 0), 0), + Poco::Timespan(config.getInt("handshake_timeout_ms", DBMS_DEFAULT_RECEIVE_TIMEOUT_SEC * 1000), 0)); timeouts.sync_request_timeout = Poco::Timespan(config.getInt("sync_request_timeout", DBMS_DEFAULT_SYNC_REQUEST_TIMEOUT_SEC), 0); } diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 610c7135a75..d5fb864ca6b 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -55,6 +55,7 @@ class IColumn; M(UInt64, max_query_size, DBMS_DEFAULT_MAX_QUERY_SIZE, "The maximum number of bytes of a query string parsed by the SQL parser. Data in the VALUES clause of INSERT queries is processed by a separate stream parser (that consumes O(1) RAM) and not affected by this restriction.", 0) \ M(UInt64, interactive_delay, 100000, "The interval in microseconds to check if the request is cancelled, and to send progress info.", 0) \ M(Seconds, connect_timeout, DBMS_DEFAULT_CONNECT_TIMEOUT_SEC, "Connection timeout if there are no replicas.", 0) \ + M(Milliseconds, handshake_timeout_ms, DBMS_DEFAULT_RECEIVE_TIMEOUT_SEC * 1000, "Timeout for receiving HELLO packet from replicas.", 0) \ M(Milliseconds, connect_timeout_with_failover_ms, 1000, "Connection timeout for selecting first healthy replica.", 0) \ M(Milliseconds, connect_timeout_with_failover_secure_ms, 1000, "Connection timeout for selecting first healthy replica (for secure connections).", 0) \ M(Seconds, receive_timeout, DBMS_DEFAULT_RECEIVE_TIMEOUT_SEC, "Timeout for receiving data from network, in seconds. If no bytes were received in this interval, exception is thrown. If you set this setting on client, the 'send_timeout' for the socket will be also set on the corresponding connection end on the server.", 0) \ diff --git a/src/IO/ConnectionTimeouts.cpp b/src/IO/ConnectionTimeouts.cpp index 401afb7baac..01fbaa4f817 100644 --- a/src/IO/ConnectionTimeouts.cpp +++ b/src/IO/ConnectionTimeouts.cpp @@ -17,22 +17,7 @@ ConnectionTimeouts::ConnectionTimeouts( , secure_connection_timeout(connection_timeout) , hedged_connection_timeout(receive_timeout_) , receive_data_timeout(receive_timeout_) -{ -} - -ConnectionTimeouts::ConnectionTimeouts( - Poco::Timespan connection_timeout_, - Poco::Timespan send_timeout_, - Poco::Timespan receive_timeout_, - Poco::Timespan tcp_keep_alive_timeout_) - : connection_timeout(connection_timeout_) - , send_timeout(send_timeout_) - , receive_timeout(receive_timeout_) - , tcp_keep_alive_timeout(tcp_keep_alive_timeout_) - , http_keep_alive_timeout(0) - , secure_connection_timeout(connection_timeout) - , hedged_connection_timeout(receive_timeout_) - , receive_data_timeout(receive_timeout_) + , handshake_timeout(receive_timeout_) { } @@ -41,7 +26,26 @@ ConnectionTimeouts::ConnectionTimeouts( Poco::Timespan send_timeout_, Poco::Timespan receive_timeout_, Poco::Timespan tcp_keep_alive_timeout_, - Poco::Timespan http_keep_alive_timeout_) + Poco::Timespan handshake_timeout_) + : connection_timeout(connection_timeout_) + , send_timeout(send_timeout_) + , receive_timeout(receive_timeout_) + , tcp_keep_alive_timeout(tcp_keep_alive_timeout_) + , http_keep_alive_timeout(0) + , secure_connection_timeout(connection_timeout) + , hedged_connection_timeout(receive_timeout_) + , receive_data_timeout(receive_timeout_) + , handshake_timeout(handshake_timeout_) +{ +} + +ConnectionTimeouts::ConnectionTimeouts( + Poco::Timespan connection_timeout_, + Poco::Timespan send_timeout_, + Poco::Timespan receive_timeout_, + Poco::Timespan tcp_keep_alive_timeout_, + Poco::Timespan http_keep_alive_timeout_, + Poco::Timespan handshake_timeout_) : connection_timeout(connection_timeout_) , send_timeout(send_timeout_) , receive_timeout(receive_timeout_) @@ -50,6 +54,7 @@ ConnectionTimeouts::ConnectionTimeouts( , secure_connection_timeout(connection_timeout) , hedged_connection_timeout(receive_timeout_) , receive_data_timeout(receive_timeout_) + , handshake_timeout(handshake_timeout_) { } @@ -60,16 +65,18 @@ ConnectionTimeouts::ConnectionTimeouts( Poco::Timespan tcp_keep_alive_timeout_, Poco::Timespan http_keep_alive_timeout_, Poco::Timespan secure_connection_timeout_, - Poco::Timespan receive_hello_timeout_, - Poco::Timespan receive_data_timeout_) + Poco::Timespan hedged_connection_timeout_, + Poco::Timespan receive_data_timeout_, + Poco::Timespan handshake_timeout_) : connection_timeout(connection_timeout_) , send_timeout(send_timeout_) , receive_timeout(receive_timeout_) , tcp_keep_alive_timeout(tcp_keep_alive_timeout_) , http_keep_alive_timeout(http_keep_alive_timeout_) , secure_connection_timeout(secure_connection_timeout_) - , hedged_connection_timeout(receive_hello_timeout_) + , hedged_connection_timeout(hedged_connection_timeout_) , receive_data_timeout(receive_data_timeout_) + , handshake_timeout(handshake_timeout_) { } @@ -90,13 +97,14 @@ ConnectionTimeouts ConnectionTimeouts::getSaturated(Poco::Timespan limit) const saturate(http_keep_alive_timeout, limit), saturate(secure_connection_timeout, limit), saturate(hedged_connection_timeout, limit), - saturate(receive_data_timeout, limit)); + saturate(receive_data_timeout, limit), + saturate(handshake_timeout, limit)); } /// Timeouts for the case when we have just single attempt to connect. ConnectionTimeouts ConnectionTimeouts::getTCPTimeoutsWithoutFailover(const Settings & settings) { - return ConnectionTimeouts(settings.connect_timeout, settings.send_timeout, settings.receive_timeout, settings.tcp_keep_alive_timeout); + return ConnectionTimeouts(settings.connect_timeout, settings.send_timeout, settings.receive_timeout, settings.tcp_keep_alive_timeout, settings.handshake_timeout_ms); } /// Timeouts for the case when we will try many addresses in a loop. @@ -110,7 +118,8 @@ ConnectionTimeouts ConnectionTimeouts::getTCPTimeoutsWithFailover(const Settings 0, settings.connect_timeout_with_failover_secure_ms, settings.hedged_connection_timeout_ms, - settings.receive_data_timeout_ms); + settings.receive_data_timeout_ms, + settings.handshake_timeout_ms); } ConnectionTimeouts ConnectionTimeouts::getHTTPTimeouts(const Settings & settings, Poco::Timespan http_keep_alive_timeout) @@ -120,7 +129,8 @@ ConnectionTimeouts ConnectionTimeouts::getHTTPTimeouts(const Settings & settings settings.http_send_timeout, settings.http_receive_timeout, settings.tcp_keep_alive_timeout, - http_keep_alive_timeout); + http_keep_alive_timeout, + settings.http_receive_timeout); } } diff --git a/src/IO/ConnectionTimeouts.h b/src/IO/ConnectionTimeouts.h index 368288ee022..684af42827f 100644 --- a/src/IO/ConnectionTimeouts.h +++ b/src/IO/ConnectionTimeouts.h @@ -23,6 +23,9 @@ struct ConnectionTimeouts Poco::Timespan hedged_connection_timeout; Poco::Timespan receive_data_timeout; + /// Timeout for receiving HELLO packet + Poco::Timespan handshake_timeout; + /// Timeout for synchronous request-result protocol call (like Ping or TablesStatus) Poco::Timespan sync_request_timeout = Poco::Timespan(DBMS_DEFAULT_SYNC_REQUEST_TIMEOUT_SEC, 0); @@ -35,13 +38,15 @@ struct ConnectionTimeouts ConnectionTimeouts(Poco::Timespan connection_timeout_, Poco::Timespan send_timeout_, Poco::Timespan receive_timeout_, - Poco::Timespan tcp_keep_alive_timeout_); + Poco::Timespan tcp_keep_alive_timeout_, + Poco::Timespan handshake_timeout_); ConnectionTimeouts(Poco::Timespan connection_timeout_, Poco::Timespan send_timeout_, Poco::Timespan receive_timeout_, Poco::Timespan tcp_keep_alive_timeout_, - Poco::Timespan http_keep_alive_timeout_); + Poco::Timespan http_keep_alive_timeout_, + Poco::Timespan handshake_timeout_); ConnectionTimeouts(Poco::Timespan connection_timeout_, Poco::Timespan send_timeout_, @@ -49,8 +54,9 @@ struct ConnectionTimeouts Poco::Timespan tcp_keep_alive_timeout_, Poco::Timespan http_keep_alive_timeout_, Poco::Timespan secure_connection_timeout_, - Poco::Timespan receive_hello_timeout_, - Poco::Timespan receive_data_timeout_); + Poco::Timespan hedged_connection_timeout_, + Poco::Timespan receive_data_timeout_, + Poco::Timespan handshake_timeout_); static Poco::Timespan saturate(Poco::Timespan timespan, Poco::Timespan limit); ConnectionTimeouts getSaturated(Poco::Timespan limit) const; diff --git a/src/Server/TCPHandler.cpp b/src/Server/TCPHandler.cpp index 0522b6d8a48..a076e248a9f 100644 --- a/src/Server/TCPHandler.cpp +++ b/src/Server/TCPHandler.cpp @@ -1296,6 +1296,13 @@ void TCPHandler::receiveUnexpectedHello() void TCPHandler::sendHello() { + if (unlikely(sleep_in_send_tables_status.totalMilliseconds())) + { + out->next(); + std::chrono::milliseconds ms(sleep_in_send_tables_status.totalMilliseconds()); + std::this_thread::sleep_for(ms); + } + writeVarUInt(Protocol::Server::Hello, *out); writeStringBinary(DBMS_NAME, *out); writeVarUInt(DBMS_VERSION_MAJOR, *out); From 2ff3c8badd5a5c18f14ca76438978a415fe73d74 Mon Sep 17 00:00:00 2001 From: avogar Date: Wed, 17 May 2023 11:41:00 +0000 Subject: [PATCH 0085/1072] Remove testing code --- src/Server/TCPHandler.cpp | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/Server/TCPHandler.cpp b/src/Server/TCPHandler.cpp index a076e248a9f..0522b6d8a48 100644 --- a/src/Server/TCPHandler.cpp +++ b/src/Server/TCPHandler.cpp @@ -1296,13 +1296,6 @@ void TCPHandler::receiveUnexpectedHello() void TCPHandler::sendHello() { - if (unlikely(sleep_in_send_tables_status.totalMilliseconds())) - { - out->next(); - std::chrono::milliseconds ms(sleep_in_send_tables_status.totalMilliseconds()); - std::this_thread::sleep_for(ms); - } - writeVarUInt(Protocol::Server::Hello, *out); writeStringBinary(DBMS_NAME, *out); writeVarUInt(DBMS_VERSION_MAJOR, *out); From 194ce2d881aa6c3598f24e93cce29671ec9f67c3 Mon Sep 17 00:00:00 2001 From: Nikolay Degterinsky Date: Wed, 17 May 2023 13:13:57 +0000 Subject: [PATCH 0086/1072] Better --- src/Core/PostgreSQL/insertPostgreSQLValue.cpp | 2 + src/Storages/StoragePostgreSQL.cpp | 40 ++++++++++++++----- 2 files changed, 31 insertions(+), 11 deletions(-) diff --git a/src/Core/PostgreSQL/insertPostgreSQLValue.cpp b/src/Core/PostgreSQL/insertPostgreSQLValue.cpp index 08b67a470f1..d2e8071c5de 100644 --- a/src/Core/PostgreSQL/insertPostgreSQLValue.cpp +++ b/src/Core/PostgreSQL/insertPostgreSQLValue.cpp @@ -202,6 +202,8 @@ void preparePostgreSQLArrayInfo( parser = [](std::string & field) -> Field { return pqxx::from_string(field); }; else if (which.isFloat64()) parser = [](std::string & field) -> Field { return pqxx::from_string(field); }; + else if (which.isUUID()) + parser = [](std::string & field) -> Field { return parse(field); }; else if (which.isString() || which.isFixedString()) parser = [](std::string & field) -> Field { return field; }; else if (which.isDate()) diff --git a/src/Storages/StoragePostgreSQL.cpp b/src/Storages/StoragePostgreSQL.cpp index 09198e5bdad..007f83165fd 100644 --- a/src/Storages/StoragePostgreSQL.cpp +++ b/src/Storages/StoragePostgreSQL.cpp @@ -197,7 +197,7 @@ public: /// Cannot just use serializeAsText for array data type even though it converts perfectly /// any dimension number array into text format, because it encloses in '[]' and for postgres it must be '{}'. /// Check if array[...] syntax from PostgreSQL will be applicable. - void parseArray(const Field & array_field, const DataTypePtr & data_type, WriteBuffer & ostr) + static void parseArray(const Field & array_field, const DataTypePtr & data_type, WriteBuffer & ostr) { const auto * array_type = typeid_cast(data_type.get()); const auto & nested = array_type->getNestedType(); @@ -205,7 +205,7 @@ public: if (!isArray(nested)) { - writeText(clickhouseToPostgresArray(array, data_type), ostr); + parseArrayContent(array, data_type, ostr); return; } @@ -219,7 +219,7 @@ public: if (!isArray(nested_array_type->getNestedType())) { - writeText(clickhouseToPostgresArray(iter->get(), nested), ostr); + parseArrayContent(iter->get(), nested, ostr); } else { @@ -232,17 +232,35 @@ public: /// Conversion is done via column casting because with writeText(Array..) got incorrect conversion /// of Date and DateTime data types and it added extra quotes for values inside array. - static std::string clickhouseToPostgresArray(const Array & array_field, const DataTypePtr & data_type) + static void parseArrayContent(const Array & array_field, const DataTypePtr & data_type, WriteBuffer & ostr) { - auto nested = typeid_cast(data_type.get())->getNestedType(); - auto array_column = ColumnArray::create(createNested(nested)); + auto nested_type = typeid_cast(data_type.get())->getNestedType(); + auto array_column = ColumnArray::create(createNested(nested_type)); array_column->insert(array_field); - WriteBufferFromOwnString ostr; - data_type->getDefaultSerialization()->serializeText(*array_column, 0, ostr, FormatSettings{}); - /// ostr is guaranteed to be at least '[]', i.e. size is at least 2 and 2 only if ostr.str() == '[]' - assert(ostr.str().size() >= 2); - return '{' + std::string(ostr.str().begin() + 1, ostr.str().end() - 1) + '}'; + const IColumn & nested_column = array_column->getData(); + const auto serialization = nested_type->getDefaultSerialization(); + + FormatSettings settings; + settings.pretty.charset = FormatSettings::Pretty::Charset::ASCII; + + if (nested_type->isNullable()) + nested_type = static_cast(nested_type.get())->getNestedType(); + + const bool quoted = !isUUID(nested_type); + + writeChar('{', ostr); + for (size_t i = 0, size = array_field.size(); i < size; ++i) + { + if (i != 0) + writeChar(',', ostr); + + if (quoted) + serialization->serializeTextQuoted(nested_column, i, ostr, settings); + else + serialization->serializeText(nested_column, i, ostr, settings); + } + writeChar('}', ostr); } static MutableColumnPtr createNested(DataTypePtr nested) From ad637c682418caa8fdffd28795a2edb415f2bfce Mon Sep 17 00:00:00 2001 From: Nikolay Degterinsky Date: Wed, 17 May 2023 13:14:10 +0000 Subject: [PATCH 0087/1072] Add test --- .../test_storage_postgresql/test.py | 41 +++++++++++-------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/tests/integration/test_storage_postgresql/test.py b/tests/integration/test_storage_postgresql/test.py index 3b7aae1ccdc..6ceddfe831b 100644 --- a/tests/integration/test_storage_postgresql/test.py +++ b/tests/integration/test_storage_postgresql/test.py @@ -113,7 +113,9 @@ def test_postgres_conversions(started_cluster): g Text[][][][][] NOT NULL, -- String h Integer[][][], -- Nullable(Int32) i Char(2)[][][][], -- Nullable(String) - k Char(2)[] -- Nullable(String) + j Char(2)[], -- Nullable(String) + k UUID[], -- Nullable(UUID) + l UUID[][] -- Nullable(UUID) )""" ) @@ -123,15 +125,18 @@ def test_postgres_conversions(started_cluster): ) expected = ( "a\tArray(Date)\t\t\t\t\t\n" - + "b\tArray(DateTime64(6))\t\t\t\t\t\n" - + "c\tArray(Array(Float32))\t\t\t\t\t\n" - + "d\tArray(Array(Float64))\t\t\t\t\t\n" - + "e\tArray(Array(Array(Decimal(5, 5))))\t\t\t\t\t\n" - + "f\tArray(Array(Array(Int32)))\t\t\t\t\t\n" - + "g\tArray(Array(Array(Array(Array(String)))))\t\t\t\t\t\n" - + "h\tArray(Array(Array(Nullable(Int32))))\t\t\t\t\t\n" - + "i\tArray(Array(Array(Array(Nullable(String)))))\t\t\t\t\t\n" - + "k\tArray(Nullable(String))" + "b\tArray(DateTime64(6))\t\t\t\t\t\n" + "c\tArray(Array(Float32))\t\t\t\t\t\n" + "d\tArray(Array(Float64))\t\t\t\t\t\n" + "e\tArray(Array(Array(Decimal(5, 5))))\t\t\t\t\t\n" + "f\tArray(Array(Array(Int32)))\t\t\t\t\t\n" + "g\tArray(Array(Array(Array(Array(String)))))\t\t\t\t\t\n" + "h\tArray(Array(Array(Nullable(Int32))))\t\t\t\t\t\n" + "i\tArray(Array(Array(Array(Nullable(String)))))\t\t\t\t\t\n" + "j\tArray(Nullable(String))\t\t\t\t\t\n" + "k\tArray(Nullable(UUID))\t\t\t\t\t\n" + "l\tArray(Array(Nullable(UUID)))" + "" ) assert result.rstrip() == expected @@ -147,7 +152,9 @@ def test_postgres_conversions(started_cluster): "[[[[['winx', 'winx', 'winx']]]]], " "[[[1, NULL], [NULL, 1]], [[NULL, NULL], [NULL, NULL]], [[4, 4], [5, 5]]], " "[[[[NULL]]]], " - "[]" + "[], " + "['2a0c0bfc-4fec-4e32-ae3a-7fc8eea6626a', '42209d53-d641-4d73-a8b6-c038db1e75d6', NULL], " + "[[NULL, '42209d53-d641-4d73-a8b6-c038db1e75d6'], ['2a0c0bfc-4fec-4e32-ae3a-7fc8eea6626a', NULL], [NULL, NULL]]" ")" ) @@ -157,15 +164,17 @@ def test_postgres_conversions(started_cluster): ) expected = ( "['2000-05-12','2000-05-12']\t" - + "['2000-05-12 12:12:12.012345','2000-05-12 12:12:12.012345']\t" - + "[[1.12345],[1.12345],[1.12345]]\t" - + "[[1.1234567891],[1.1234567891],[1.1234567891]]\t" - + "[[[0.11111,0.11111]],[[0.22222,0.22222]],[[0.33333,0.33333]]]\t" + "['2000-05-12 12:12:12.012345','2000-05-12 12:12:12.012345']\t" + "[[1.12345],[1.12345],[1.12345]]\t" + "[[1.1234567891],[1.1234567891],[1.1234567891]]\t" + "[[[0.11111,0.11111]],[[0.22222,0.22222]],[[0.33333,0.33333]]]\t" "[[[1,1],[1,1]],[[3,3],[3,3]],[[4,4],[5,5]]]\t" "[[[[['winx','winx','winx']]]]]\t" "[[[1,NULL],[NULL,1]],[[NULL,NULL],[NULL,NULL]],[[4,4],[5,5]]]\t" "[[[[NULL]]]]\t" - "[]\n" + "[]\t" + "['2a0c0bfc-4fec-4e32-ae3a-7fc8eea6626a','42209d53-d641-4d73-a8b6-c038db1e75d6',NULL]\t" + "[[NULL,'42209d53-d641-4d73-a8b6-c038db1e75d6'],['2a0c0bfc-4fec-4e32-ae3a-7fc8eea6626a',NULL],[NULL,NULL]]\n" ) assert result == expected From e5aa3fcc8fdf8bbc45756728f11a9a55c2f17fb6 Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Wed, 17 May 2023 19:09:13 +0000 Subject: [PATCH 0088/1072] Add queries with enabled analyzer --- .../0_stateless/01655_plan_optimizations.sh | 44 ++++++++++++++++--- 1 file changed, 39 insertions(+), 5 deletions(-) diff --git a/tests/queries/0_stateless/01655_plan_optimizations.sh b/tests/queries/0_stateless/01655_plan_optimizations.sh index ec856c9bf27..7c299f9cc26 100755 --- a/tests/queries/0_stateless/01655_plan_optimizations.sh +++ b/tests/queries/0_stateless/01655_plan_optimizations.sh @@ -26,11 +26,17 @@ $CLICKHOUSE_CLIENT -q " settings enable_optimize_predicate_expression=0" echo "> filter should be pushed down after aggregating, column after aggregation is const" -$CLICKHOUSE_CLIENT -q " +$CLICKHOUSE_CLIENT --allow_experimental_analyzer=0 -q " explain actions = 1 select s, y, y != 0 from (select sum(x) as s, y from ( select number as x, number + 1 as y from numbers(10)) group by y ) where y != 0 settings enable_optimize_predicate_expression=0" | grep -o "Aggregating\|Filter\|COLUMN Const(UInt8) -> notEquals(y, 0)" +echo "> (analyzer) filter should be pushed down after aggregating, column after aggregation is const" +$CLICKHOUSE_CLIENT --allow_experimental_analyzer=1 -q " + explain actions = 1 select s, y, y != 0 from (select sum(x) as s, y from ( + select number as x, number + 1 as y from numbers(10)) group by y + ) where y != 0 + settings enable_optimize_predicate_expression=0" | grep -o "Aggregating\|Filter\|COLUMN Const(UInt8) -> notEquals(y_1, 0_UInt8)" $CLICKHOUSE_CLIENT -q " select s, y, y != 0 from (select sum(x) as s, y from ( select number as x, number + 1 as y from numbers(10)) group by y @@ -38,12 +44,19 @@ $CLICKHOUSE_CLIENT -q " settings enable_optimize_predicate_expression=0" echo "> one condition of filter should be pushed down after aggregating, other condition is aliased" -$CLICKHOUSE_CLIENT -q " +$CLICKHOUSE_CLIENT --allow_experimental_analyzer=0 -q " explain actions = 1 select s, y from ( select sum(x) as s, y from (select number as x, number + 1 as y from numbers(10)) group by y ) where y != 0 and s != 4 settings enable_optimize_predicate_expression=0" | grep -o "Aggregating\|Filter column\|Filter column: notEquals(y, 0)\|ALIAS notEquals(s, 4) :: 1 -> and(notEquals(y, 0), notEquals(s, 4))" +echo "> (analyzer) one condition of filter should be pushed down after aggregating, other condition is aliased" +$CLICKHOUSE_CLIENT --allow_experimental_analyzer=1 -q " + explain actions = 1 select s, y from ( + select sum(x) as s, y from (select number as x, number + 1 as y from numbers(10)) group by y + ) where y != 0 and s != 4 + settings enable_optimize_predicate_expression=0" | + grep -o "Aggregating\|Filter column\|Filter column: notEquals(y_1, 0_UInt8)\|ALIAS notEquals(s_0, 4_UInt8) :: 0 -> and(notEquals(y_1, 0_UInt8), notEquals(s_0, 4_UInt8))" $CLICKHOUSE_CLIENT -q " select s, y from ( select sum(x) as s, y from (select number as x, number + 1 as y from numbers(10)) group by y @@ -51,12 +64,19 @@ $CLICKHOUSE_CLIENT -q " settings enable_optimize_predicate_expression=0" echo "> one condition of filter should be pushed down after aggregating, other condition is casted" -$CLICKHOUSE_CLIENT -q " +$CLICKHOUSE_CLIENT --allow_experimental_analyzer=0 -q " explain actions = 1 select s, y from ( select sum(x) as s, y from (select number as x, number + 1 as y from numbers(10)) group by y ) where y != 0 and s - 4 settings enable_optimize_predicate_expression=0" | grep -o "Aggregating\|Filter column\|Filter column: notEquals(y, 0)\|FUNCTION and(minus(s, 4) :: 1, 1 :: 3) -> and(notEquals(y, 0), minus(s, 4)) UInt8 : 2" +echo "> (analyzer) one condition of filter should be pushed down after aggregating, other condition is casted" +$CLICKHOUSE_CLIENT --allow_experimental_analyzer=1 -q " + explain actions = 1 select s, y from ( + select sum(x) as s, y from (select number as x, number + 1 as y from numbers(10)) group by y + ) where y != 0 and s - 4 + settings enable_optimize_predicate_expression=0" | + grep -o "Aggregating\|Filter column\|Filter column: notEquals(y_1, 0_UInt8)\|FUNCTION and(minus(s_0, 4_UInt8) :: 0, 1 :: 3) -> and(notEquals(y_1, 0_UInt8), minus(s_0, 4_UInt8)) UInt8 : 2" $CLICKHOUSE_CLIENT -q " select s, y from ( select sum(x) as s, y from (select number as x, number + 1 as y from numbers(10)) group by y @@ -64,12 +84,19 @@ $CLICKHOUSE_CLIENT -q " settings enable_optimize_predicate_expression=0" echo "> one condition of filter should be pushed down after aggregating, other two conditions are ANDed" -$CLICKHOUSE_CLIENT --convert_query_to_cnf=0 -q " +$CLICKHOUSE_CLIENT --allow_experimental_analyzer=0 --convert_query_to_cnf=0 -q " explain actions = 1 select s, y from ( select sum(x) as s, y from (select number as x, number + 1 as y from numbers(10)) group by y ) where y != 0 and s - 8 and s - 4 settings enable_optimize_predicate_expression=0" | grep -o "Aggregating\|Filter column\|Filter column: notEquals(y, 0)\|FUNCTION and(minus(s, 8) :: 1, minus(s, 4) :: 2) -> and(notEquals(y, 0), minus(s, 8), minus(s, 4))" +echo "> (analyzer) one condition of filter should be pushed down after aggregating, other two conditions are ANDed" +$CLICKHOUSE_CLIENT --allow_experimental_analyzer=1 --convert_query_to_cnf=0 -q " + explain actions = 1 select s, y from ( + select sum(x) as s, y from (select number as x, number + 1 as y from numbers(10)) group by y + ) where y != 0 and s - 8 and s - 4 + settings enable_optimize_predicate_expression=0" | + grep -o "Aggregating\|Filter column\|Filter column: notEquals(y_1, 0_UInt8)\|FUNCTION and(minus(s_0, 8_UInt8) :: 0, minus(s_0, 4_UInt8) :: 2) -> and(notEquals(y_1, 0_UInt8), minus(s_0, 8_UInt8), minus(s_0, 4_UInt8))" $CLICKHOUSE_CLIENT -q " select s, y from ( select sum(x) as s, y from (select number as x, number + 1 as y from numbers(10)) group by y @@ -77,12 +104,19 @@ $CLICKHOUSE_CLIENT -q " settings enable_optimize_predicate_expression=0" echo "> two conditions of filter should be pushed down after aggregating and ANDed, one condition is aliased" -$CLICKHOUSE_CLIENT --convert_query_to_cnf=0 -q " +$CLICKHOUSE_CLIENT --allow_experimental_analyzer=0 --convert_query_to_cnf=0 -q " explain actions = 1 select s, y from ( select sum(x) as s, y from (select number as x, number + 1 as y from numbers(10)) group by y ) where y != 0 and s != 8 and y - 4 settings enable_optimize_predicate_expression=0" | grep -o "Aggregating\|Filter column\|Filter column: and(notEquals(y, 0), minus(y, 4))\|ALIAS notEquals(s, 8) :: 1 -> and(notEquals(y, 0), notEquals(s, 8), minus(y, 4))" +echo "> (analyzer) two conditions of filter should be pushed down after aggregating and ANDed, one condition is aliased" +$CLICKHOUSE_CLIENT --allow_experimental_analyzer=1 --convert_query_to_cnf=0 -q " + explain actions = 1 select s, y from ( + select sum(x) as s, y from (select number as x, number + 1 as y from numbers(10)) group by y + ) where y != 0 and s != 8 and y - 4 + settings enable_optimize_predicate_expression=0" | + grep -o "Aggregating\|Filter column\|Filter column: and(notEquals(y_1, 0_UInt8), minus(y_1, 4_UInt8))\|ALIAS notEquals(s_0, 8_UInt8) :: 0 -> and(notEquals(y_1, 0_UInt8), notEquals(s_0, 8_UInt8), minus(y_1, 4_UInt8))" $CLICKHOUSE_CLIENT -q " select s, y from ( select sum(x) as s, y from (select number as x, number + 1 as y from numbers(10)) group by y From 971cc092d4da472fa6a3a0726616218d6a783b58 Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Thu, 18 May 2023 15:16:47 +0200 Subject: [PATCH 0089/1072] Update src/Storages/MergeTree/MergeTreePrefetchedReadPool.cpp --- src/Storages/MergeTree/MergeTreePrefetchedReadPool.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreePrefetchedReadPool.cpp b/src/Storages/MergeTree/MergeTreePrefetchedReadPool.cpp index 2510d6f2d19..30f31910bee 100644 --- a/src/Storages/MergeTree/MergeTreePrefetchedReadPool.cpp +++ b/src/Storages/MergeTree/MergeTreePrefetchedReadPool.cpp @@ -366,9 +366,6 @@ MergeTreePrefetchedReadPool::PartsInfos MergeTreePrefetchedReadPool::getPartsInf part_info->column_name_set = {required_column_names.begin(), required_column_names.end()}; part_info->task_columns = task_columns; - if (settings.prefetch_buffer_size < DBMS_DEFAULT_BUFFER_SIZE) - throw Exception(ErrorCodes::LOGICAL_ERROR, "remove me"); - /// adjustBufferSize(), which is done in MergeTreeReaderStream and MergeTreeReaderCompact, /// lowers buffer size if file size (or required read range) is less. So we know that the /// settings.prefetch_buffer_size will be lowered there, therefore we account it here as well. From 3121a57912752d70ac46402f46c695181571dea3 Mon Sep 17 00:00:00 2001 From: kssenii Date: Fri, 19 May 2023 14:21:07 +0200 Subject: [PATCH 0090/1072] Add some assertions --- .../IO/CachedOnDiskReadBufferFromFile.cpp | 47 +++++++++++-------- 1 file changed, 28 insertions(+), 19 deletions(-) diff --git a/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp b/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp index 00d23183f6a..b4ea30e54c8 100644 --- a/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp +++ b/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp @@ -708,14 +708,18 @@ bool CachedOnDiskReadBufferFromFile::updateImplementationBufferIfNeeded() } else if (current_write_offset < file_offset_of_buffer_end) { + const auto path = file_segment.getPathInLocalCache(); + size_t file_size = 0; + if (fs::exists(path)) + file_size = fs::file_size(path); + throw Exception( ErrorCodes::LOGICAL_ERROR, - "Expected {} >= {} ({})", - current_write_offset, file_offset_of_buffer_end, getInfoForLog()); + "Invariant failed. Expected {} >= {} (size on fs: {}, {})", + current_write_offset, file_offset_of_buffer_end, file_size, getInfoForLog()); } } - - if (read_type == ReadType::REMOTE_FS_READ_AND_PUT_IN_CACHE) + else if (read_type == ReadType::REMOTE_FS_READ_AND_PUT_IN_CACHE) { /** * ReadType::REMOTE_FS_READ_AND_PUT_IN_CACHE means that on previous getImplementationBuffer() call @@ -884,25 +888,28 @@ bool CachedOnDiskReadBufferFromFile::nextImplStep() if (!result) { -#ifndef NDEBUG - if (read_type == ReadType::CACHED) + auto debug_check = [&]() { - size_t cache_file_size = getFileSizeFromReadBuffer(*implementation_buffer); - if (cache_file_size == 0) + if (read_type == ReadType::CACHED) { - throw Exception( - ErrorCodes::LOGICAL_ERROR, - "Attempt to read from an empty cache file: {} (just before actual read)", - cache_file_size); + size_t cache_file_size = getFileSizeFromReadBuffer(*implementation_buffer); + if (cache_file_size == 0) + { + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Attempt to read from an empty cache file: {} (just before actual read)", + cache_file_size); + } } - } - else - { - assert(file_offset_of_buffer_end == static_cast(implementation_buffer->getFileOffsetOfBufferEnd())); - } + else + { + chassert(file_offset_of_buffer_end == static_cast(implementation_buffer->getFileOffsetOfBufferEnd())); + } + chassert(!implementation_buffer->hasPendingData()); + return true; + }; - assert(!implementation_buffer->hasPendingData()); -#endif + chassert(debug_check()); Stopwatch watch(CLOCK_MONOTONIC); @@ -927,6 +934,8 @@ bool CachedOnDiskReadBufferFromFile::nextImplStep() { ProfileEvents::increment(ProfileEvents::CachedReadBufferReadFromCacheBytes, size); ProfileEvents::increment(ProfileEvents::CachedReadBufferReadFromCacheMicroseconds, elapsed); + + chassert(file_offset_of_buffer_end + size <= file_segment.range().size()); } else { From c85c3afa1f50307d9a92d24559fe9628fe8cee37 Mon Sep 17 00:00:00 2001 From: alekseygolub Date: Sun, 7 May 2023 12:18:52 +0000 Subject: [PATCH 0091/1072] Added option to rename files, loaded via TableFunctionFile, after success processing --- src/Client/ClientBase.cpp | 1 + src/Common/FileRenamer.cpp | 99 ++++++++++++++++++++++++ src/Common/FileRenamer.h | 39 ++++++++++ src/Core/Settings.h | 2 + src/Storages/StorageFile.cpp | 64 +++++++++++++++ src/Storages/StorageFile.h | 10 ++- src/TableFunctions/TableFunctionFile.cpp | 1 + 7 files changed, 214 insertions(+), 2 deletions(-) create mode 100644 src/Common/FileRenamer.cpp create mode 100644 src/Common/FileRenamer.h diff --git a/src/Client/ClientBase.cpp b/src/Client/ClientBase.cpp index 77a93a25e9b..571637c6005 100644 --- a/src/Client/ClientBase.cpp +++ b/src/Client/ClientBase.cpp @@ -1361,6 +1361,7 @@ void ClientBase::sendData(Block & sample, const ColumnsDescription & columns_des columns_description_for_query, ConstraintsDescription{}, String{}, + {}, }; StoragePtr storage = std::make_shared(in_file, global_context->getUserFilesPath(), args); storage->startup(); diff --git a/src/Common/FileRenamer.cpp b/src/Common/FileRenamer.cpp new file mode 100644 index 00000000000..7a19c50a0d1 --- /dev/null +++ b/src/Common/FileRenamer.cpp @@ -0,0 +1,99 @@ +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace fs = std::filesystem; + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; +} + +FileRenamer::FileRenamer() = default; + +FileRenamer::FileRenamer(const String & renaming_rule) + : rule(renaming_rule) +{ + FileRenamer::validateRenamingRule(rule, true); +} + +String FileRenamer::generateNewFilename(const String & filename) const +{ + // Split filename and extension + String file_base = fs::path(filename).stem(); + String file_ext = fs::path(filename).extension(); + + // Get current timestamp in microseconds + String timestamp; + if (rule.find("%t") != String::npos) + { + auto now = std::chrono::system_clock::now(); + std::stringstream ss; + ss << timeInMicroseconds(now); + timestamp = ss.str(); + } + + // Define placeholders and their corresponding values + std::map placeholders = + { + {"%f", file_base}, + {"%e", file_ext}, + {"%t", timestamp}, + {"%%", "%"} + }; + + // Replace placeholders with their actual values + String new_name = rule; + for (const auto & [placeholder, value] : placeholders) + boost::replace_all(new_name, placeholder, value); + + return new_name; +} + +bool FileRenamer::isEmpty() const +{ + return rule.empty(); +} + +bool FileRenamer::validateRenamingRule(const String & rule, bool throw_on_error) +{ + // Check if the rule contains invalid placeholders + re2::RE2 invalid_placeholder_pattern("^([^%]|%[fet%])*$"); + if (!re2::RE2::FullMatch(rule, invalid_placeholder_pattern)) + { + if (throw_on_error) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Invalid renaming rule: Allowed placeholders only %f, %e, %t, and %%"); + return false; + } + + // Replace valid placeholders with empty strings and count remaining percentage signs. + String replaced_rule = rule; + boost::replace_all(replaced_rule, "%f", ""); + boost::replace_all(replaced_rule, "%e", ""); + boost::replace_all(replaced_rule, "%t", ""); + if (std::count(replaced_rule.begin(), replaced_rule.end(), '%') % 2) + { + if (throw_on_error) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Invalid renaming rule: Odd number of consecutive percentage signs"); + return false; + } + + return true; +} + + +} // DB diff --git a/src/Common/FileRenamer.h b/src/Common/FileRenamer.h new file mode 100644 index 00000000000..c062978d6f6 --- /dev/null +++ b/src/Common/FileRenamer.h @@ -0,0 +1,39 @@ +#pragma once + +#include +#include + +namespace DB +{ + +/** + * The FileRenamer class provides functionality for renaming files based on given pattern with placeholders + * The supported placeholders are: + * %f - Original filename without extension ("sample") + * %e - Original file extension with dot (".csv") + * %t - Timestamp (in microseconds) + * %% - Percentage sign ("%") + * + * Example: + * Pattern - "processed_%f_%t%e" + * Original filename - "sample.csv" + * New filename - "processed_sample_1683405960646224.csv" + */ +class FileRenamer +{ +public: + FileRenamer(); + + FileRenamer(const String & renaming_rule); + + String generateNewFilename(const String & filename) const; + + bool isEmpty() const; + + static bool validateRenamingRule(const String & rule, bool throw_on_error = false); + +private: + String rule; +}; + +} // DB diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 69546011770..d0ce641efb5 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -713,6 +713,8 @@ class IColumn; M(String, workload, "default", "Name of workload to be used to access resources", 0) \ M(Milliseconds, storage_system_stack_trace_pipe_read_timeout_ms, 100, "Maximum time to read from a pipe for receiving information from the threads when querying the `system.stack_trace` table. This setting is used for testing purposes and not meant to be changed by users.", 0) \ \ + M(String, rename_files_after_processing, "", "Rename successfully processed files according to the specified pattern; Pattern can include the following placeholders: `%f` (original filename without extension), `%e` (file extension with dot), `%t` (current timestamp in µs), and `%%` (% sign)", 0) \ + \ M(Bool, parallelize_output_from_storages, true, "Parallelize output for reading step from storage. It allows parallelizing query processing right after reading from storage if possible", 0) \ M(String, insert_deduplication_token, "", "If not empty, used for duplicate detection instead of data digest", 0) \ M(String, ann_index_select_query_params, "", "Parameters passed to ANN indexes in SELECT queries, the format is 'param1=x, param2=y, ...'", 0) \ diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index 425fe6bee31..2ea8da1a873 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -40,6 +40,7 @@ #include #include #include +#include #include #include @@ -76,6 +77,7 @@ namespace ErrorCodes extern const int UNKNOWN_IDENTIFIER; extern const int INCORRECT_FILE_NAME; extern const int FILE_DOESNT_EXIST; + extern const int FILE_ALREADY_EXISTS; extern const int TIMEOUT_EXCEEDED; extern const int INCOMPATIBLE_COLUMNS; extern const int CANNOT_STAT; @@ -460,6 +462,8 @@ StorageFile::StorageFile(const std::string & table_path_, const std::string & us else path_for_partitioned_write = table_path_; + file_renamer = FileRenamer(args.rename_after_processing); + setStorageMetadata(args); } @@ -593,9 +597,68 @@ public: shared_lock = std::shared_lock(storage->rwlock, getLockTimeout(context)); if (!shared_lock) throw Exception(ErrorCodes::TIMEOUT_EXCEEDED, "Lock timeout exceeded"); + storage->readers_counter.fetch_add(1, std::memory_order_release); } } + + /** + * If specified option --rename_files_after_processing and files created by TableFunctionFile + * Last reader will rename files according to specified patten if desctuctor of reader was called without uncaught exceptions + */ + void beforeDestroy() + { + if (storage->file_renamer.isEmpty()) + return; + + int32_t cnt = storage->readers_counter.fetch_sub(1, std::memory_order_acq_rel); + + if (std::uncaught_exceptions() == 0 && cnt == 1 && !storage->was_renamed) + { + shared_lock.unlock(); + auto exclusive_lock = std::unique_lock{storage->rwlock, getLockTimeout(context)}; + + if (!exclusive_lock) + return; + if (storage->readers_counter.load(std::memory_order_acquire) != 0 || storage->was_renamed) + return; + + for (auto & file_path_ref : storage->paths) { + try + { + auto file_path = fs::path(file_path_ref); + String new_filename = storage->file_renamer.generateNewFilename(file_path.filename().string()); + file_path.replace_filename(new_filename); + + // Normalize new path + file_path = file_path.lexically_normal(); + + // Checking access rights + checkCreationIsAllowed(context, context->getUserFilesPath(), file_path, true); + + // Checking an existing of new file + if (fs::exists(file_path)) + throw Exception(ErrorCodes::FILE_ALREADY_EXISTS, "File {} already exists", file_path.string()); + + fs::rename(fs::path(file_path_ref), file_path); + file_path_ref = file_path.string(); + storage->was_renamed = true; + } + catch (const std::exception & e) + { + // Cannot throw exception from destructor, will write only error + LOG_ERROR(&Poco::Logger::get("~StorageFileSource"), "Failed to rename file {}: {}", file_path_ref, e.what()); + continue; + } + } + } + } + + ~StorageFileSource() override + { + beforeDestroy(); + } + String getName() const override { return storage->getName(); @@ -1217,6 +1280,7 @@ void registerStorageFile(StorageFactory & factory) factory_args.columns, factory_args.constraints, factory_args.comment, + {}, }; ASTs & engine_args_ast = factory_args.engine_args; diff --git a/src/Storages/StorageFile.h b/src/Storages/StorageFile.h index 53ce7eeaaf6..0513864fd0f 100644 --- a/src/Storages/StorageFile.h +++ b/src/Storages/StorageFile.h @@ -2,12 +2,11 @@ #include #include - +#include #include #include - namespace DB { @@ -23,6 +22,8 @@ public: const ColumnsDescription & columns; const ConstraintsDescription & constraints; const String & comment; + + const std::string rename_after_processing; }; /// From file descriptor @@ -139,6 +140,11 @@ private: std::unique_ptr read_buffer_from_fd; std::unique_ptr peekable_read_buffer_from_fd; std::atomic has_peekable_read_buffer_from_fd = false; + + // Counts the number of readers + std::atomic readers_counter = 0; + FileRenamer file_renamer; + bool was_renamed = false; }; } diff --git a/src/TableFunctions/TableFunctionFile.cpp b/src/TableFunctions/TableFunctionFile.cpp index ff64bb3dc67..0e49f26db40 100644 --- a/src/TableFunctions/TableFunctionFile.cpp +++ b/src/TableFunctions/TableFunctionFile.cpp @@ -75,6 +75,7 @@ StoragePtr TableFunctionFile::getStorage(const String & source, columns, ConstraintsDescription{}, String{}, + global_context->getSettingsRef().rename_files_after_processing, }; if (fd >= 0) return std::make_shared(fd, args); From 7b3964ff7a63ce6a4027f0dfb8f06c019239ac1c Mon Sep 17 00:00:00 2001 From: alekseygolub Date: Sun, 7 May 2023 14:59:40 +0000 Subject: [PATCH 0092/1072] Added test --- .../02732_rename_after_processing.reference | 17 ++++ .../02732_rename_after_processing.sh | 77 +++++++++++++++++++ 2 files changed, 94 insertions(+) create mode 100644 tests/queries/0_stateless/02732_rename_after_processing.reference create mode 100755 tests/queries/0_stateless/02732_rename_after_processing.sh diff --git a/tests/queries/0_stateless/02732_rename_after_processing.reference b/tests/queries/0_stateless/02732_rename_after_processing.reference new file mode 100644 index 00000000000..26e152f6b10 --- /dev/null +++ b/tests/queries/0_stateless/02732_rename_after_processing.reference @@ -0,0 +1,17 @@ +4 +processed_tmp1.csv +OK +10 10 +processed_tmp2.csv +OK +8 +processed_tmp3_1.csv +processed_tmp3_2.csv +OK +OK +4 +OK +OK +tmp5.csv +OK +tmp5.csv diff --git a/tests/queries/0_stateless/02732_rename_after_processing.sh b/tests/queries/0_stateless/02732_rename_after_processing.sh new file mode 100755 index 00000000000..05fbfb716ec --- /dev/null +++ b/tests/queries/0_stateless/02732_rename_after_processing.sh @@ -0,0 +1,77 @@ +#!/usr/bin/env bash +# Tags: no-parallel + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +# see 01658_read_file_to_stringcolumn.sh +CLICKHOUSE_USER_FILES_PATH=$(clickhouse-client --query "select _path, _file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') + +# Prepare data +mkdir -p ${CLICKHOUSE_USER_FILES_PATH} +echo '"id","str","int","text"' > ${CLICKHOUSE_USER_FILES_PATH}/tmp.csv +echo '1,"abc",123,"abacaba"' >> ${CLICKHOUSE_USER_FILES_PATH}/tmp.csv +echo '2,"def",456,"bacabaa"' >> ${CLICKHOUSE_USER_FILES_PATH}/tmp.csv +echo '3,"story",78912,"acabaab"' >> ${CLICKHOUSE_USER_FILES_PATH}/tmp.csv +echo '4,"history",21321321,"cabaaba"' >> ${CLICKHOUSE_USER_FILES_PATH}/tmp.csv + +cp ${CLICKHOUSE_USER_FILES_PATH}/tmp.csv ${CLICKHOUSE_USER_FILES_PATH}/tmp1.csv +cp ${CLICKHOUSE_USER_FILES_PATH}/tmp.csv ${CLICKHOUSE_USER_FILES_PATH}/tmp2.csv +cp ${CLICKHOUSE_USER_FILES_PATH}/tmp.csv ${CLICKHOUSE_USER_FILES_PATH}/tmp3_1.csv +cp ${CLICKHOUSE_USER_FILES_PATH}/tmp.csv ${CLICKHOUSE_USER_FILES_PATH}/tmp3_2.csv +cp ${CLICKHOUSE_USER_FILES_PATH}/tmp.csv ${CLICKHOUSE_USER_FILES_PATH}/tmp4.csv +cp ${CLICKHOUSE_USER_FILES_PATH}/tmp.csv ${CLICKHOUSE_USER_FILES_PATH}/tmp5.csv + +### Checking that renaming works + +# simple select +${CLICKHOUSE_CLIENT} --rename-files-after-processing="processed_%f%e" -q "SELECT COUNT(*) FROM file('tmp1.csv')" +ls ${CLICKHOUSE_USER_FILES_PATH} | grep "processed_tmp1.csv" +if [ ! -e "${CLICKHOUSE_USER_FILES_PATH}/tmp1.csv" ]; then + echo "OK" +fi + +# select with multiple file() calls +${CLICKHOUSE_CLIENT} --rename-files-after-processing="processed_%f%e" --multiline -q """ +SELECT + sum(a.id) as aid, + sum(b.id) as bid +FROM file('tmp2.csv') AS a +INNER JOIN file('tmp2.csv') AS b +ON a.text = b.text +""" +ls ${CLICKHOUSE_USER_FILES_PATH} | grep "processed_tmp2.csv" +if [ ! -e "${CLICKHOUSE_USER_FILES_PATH}/tmp2.csv" ]; then + echo "OK" +fi + +# rename multiple files +${CLICKHOUSE_CLIENT} --rename-files-after-processing="processed_%f%e" -q "SELECT COUNT(*) FROM file('tmp3*.csv')" +ls ${CLICKHOUSE_USER_FILES_PATH} | grep "processed_tmp3_1.csv" +ls ${CLICKHOUSE_USER_FILES_PATH} | grep "processed_tmp3_2.csv" +if [ ! -e "${CLICKHOUSE_USER_FILES_PATH}/tmp3_1.csv" ]; then + echo "OK" +fi +if [ ! -e "${CLICKHOUSE_USER_FILES_PATH}/tmp3_2.csv" ]; then + echo "OK" +fi + +# check timestamp placeholder +${CLICKHOUSE_CLIENT} --rename-files-after-processing="processed_%f_%t.csv" -q "SELECT COUNT(*) FROM file('tmp4.csv')" +ls ${CLICKHOUSE_USER_FILES_PATH} | grep -E "^processed_tmp4_[0-9]+\.csv$" > /dev/null && echo "OK" + +### Checking errors + +# cannot overwrite an existing file +${CLICKHOUSE_CLIENT} --rename-files-after-processing="tmp.csv" -q "SELECT COUNT(*) FROM file('tmp5.csv')" \ + 2>&1| grep "already exists" > /dev/null && echo "OK" +ls ${CLICKHOUSE_USER_FILES_PATH} | grep "tmp5.csv" + +# сannot move file from user_files +${CLICKHOUSE_CLIENT} --rename-files-after-processing="../%f%e" -q "SELECT COUNT(*) FROM file('tmp5.csv')" \ + 2>&1| grep "is not inside" > /dev/null && echo "OK" +ls ${CLICKHOUSE_USER_FILES_PATH} | grep "tmp5.csv" + +# Clean +rm -rd $CLICKHOUSE_USER_FILES_PATH \ No newline at end of file From 2b68a6a22a84329f639af953e1fdaad9a6e21584 Mon Sep 17 00:00:00 2001 From: alekseygolub Date: Sun, 7 May 2023 17:43:34 +0000 Subject: [PATCH 0093/1072] Fix style --- src/Common/FileRenamer.cpp | 5 +-- src/Storages/StorageFile.cpp | 5 ++- .../02732_rename_after_processing.reference | 8 ++-- .../02732_rename_after_processing.sh | 43 +++++++++++++------ 4 files changed, 39 insertions(+), 22 deletions(-) diff --git a/src/Common/FileRenamer.cpp b/src/Common/FileRenamer.cpp index 7a19c50a0d1..3473d543c00 100644 --- a/src/Common/FileRenamer.cpp +++ b/src/Common/FileRenamer.cpp @@ -8,7 +8,6 @@ #include #include #include -#include #include #include @@ -42,9 +41,7 @@ String FileRenamer::generateNewFilename(const String & filename) const if (rule.find("%t") != String::npos) { auto now = std::chrono::system_clock::now(); - std::stringstream ss; - ss << timeInMicroseconds(now); - timestamp = ss.str(); + timestamp = std::to_string(timeInMicroseconds(now)); } // Define placeholders and their corresponding values diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index 2ea8da1a873..06af0a00953 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -604,7 +604,7 @@ public: /** * If specified option --rename_files_after_processing and files created by TableFunctionFile - * Last reader will rename files according to specified patten if desctuctor of reader was called without uncaught exceptions + * Last reader will rename files according to specified pattern if desctuctor of reader was called without uncaught exceptions */ void beforeDestroy() { @@ -623,7 +623,8 @@ public: if (storage->readers_counter.load(std::memory_order_acquire) != 0 || storage->was_renamed) return; - for (auto & file_path_ref : storage->paths) { + for (auto & file_path_ref : storage->paths) + { try { auto file_path = fs::path(file_path_ref); diff --git a/tests/queries/0_stateless/02732_rename_after_processing.reference b/tests/queries/0_stateless/02732_rename_after_processing.reference index 26e152f6b10..2f6ccfc1c5e 100644 --- a/tests/queries/0_stateless/02732_rename_after_processing.reference +++ b/tests/queries/0_stateless/02732_rename_after_processing.reference @@ -1,14 +1,14 @@ 4 processed_tmp1.csv -OK +!tmp1.csv 10 10 processed_tmp2.csv -OK +!tmp2.csv 8 processed_tmp3_1.csv processed_tmp3_2.csv -OK -OK +!tmp3_1.csv +!tmp3_2.csv 4 OK OK diff --git a/tests/queries/0_stateless/02732_rename_after_processing.sh b/tests/queries/0_stateless/02732_rename_after_processing.sh index 05fbfb716ec..93bad2eac7d 100755 --- a/tests/queries/0_stateless/02732_rename_after_processing.sh +++ b/tests/queries/0_stateless/02732_rename_after_processing.sh @@ -27,9 +27,11 @@ cp ${CLICKHOUSE_USER_FILES_PATH}/tmp.csv ${CLICKHOUSE_USER_FILES_PATH}/tmp5.csv # simple select ${CLICKHOUSE_CLIENT} --rename-files-after-processing="processed_%f%e" -q "SELECT COUNT(*) FROM file('tmp1.csv')" -ls ${CLICKHOUSE_USER_FILES_PATH} | grep "processed_tmp1.csv" +if [ -e "${CLICKHOUSE_USER_FILES_PATH}/processed_tmp1.csv" ]; then + echo "processed_tmp1.csv" +fi if [ ! -e "${CLICKHOUSE_USER_FILES_PATH}/tmp1.csv" ]; then - echo "OK" + echo "!tmp1.csv" fi # select with multiple file() calls @@ -41,37 +43,54 @@ FROM file('tmp2.csv') AS a INNER JOIN file('tmp2.csv') AS b ON a.text = b.text """ -ls ${CLICKHOUSE_USER_FILES_PATH} | grep "processed_tmp2.csv" +if [ -e "${CLICKHOUSE_USER_FILES_PATH}/processed_tmp2.csv" ]; then + echo "processed_tmp2.csv" +fi if [ ! -e "${CLICKHOUSE_USER_FILES_PATH}/tmp2.csv" ]; then - echo "OK" + echo "!tmp2.csv" fi # rename multiple files ${CLICKHOUSE_CLIENT} --rename-files-after-processing="processed_%f%e" -q "SELECT COUNT(*) FROM file('tmp3*.csv')" -ls ${CLICKHOUSE_USER_FILES_PATH} | grep "processed_tmp3_1.csv" -ls ${CLICKHOUSE_USER_FILES_PATH} | grep "processed_tmp3_2.csv" +if [ -e "${CLICKHOUSE_USER_FILES_PATH}/processed_tmp3_1.csv" ]; then + echo "processed_tmp3_1.csv" +fi +if [ -e "${CLICKHOUSE_USER_FILES_PATH}/processed_tmp3_2.csv" ]; then + echo "processed_tmp3_2.csv" +fi if [ ! -e "${CLICKHOUSE_USER_FILES_PATH}/tmp3_1.csv" ]; then - echo "OK" + echo "!tmp3_1.csv" fi if [ ! -e "${CLICKHOUSE_USER_FILES_PATH}/tmp3_2.csv" ]; then - echo "OK" + echo "!tmp3_2.csv" fi # check timestamp placeholder ${CLICKHOUSE_CLIENT} --rename-files-after-processing="processed_%f_%t.csv" -q "SELECT COUNT(*) FROM file('tmp4.csv')" -ls ${CLICKHOUSE_USER_FILES_PATH} | grep -E "^processed_tmp4_[0-9]+\.csv$" > /dev/null && echo "OK" +# ls ${CLICKHOUSE_USER_FILES_PATH} | grep -E "^processed_tmp4_[0-9]+\.csv$" > /dev/null && echo "OK" +rg="processed_tmp4_[0-9]+\.csv" +for x in "${CLICKHOUSE_USER_FILES_PATH}"/processed*; do + if [[ $x =~ $rg ]]; then + echo "OK" + break + fi; +done ### Checking errors # cannot overwrite an existing file ${CLICKHOUSE_CLIENT} --rename-files-after-processing="tmp.csv" -q "SELECT COUNT(*) FROM file('tmp5.csv')" \ 2>&1| grep "already exists" > /dev/null && echo "OK" -ls ${CLICKHOUSE_USER_FILES_PATH} | grep "tmp5.csv" +if [ -e "${CLICKHOUSE_USER_FILES_PATH}/tmp5.csv" ]; then + echo "tmp5.csv" +fi # сannot move file from user_files ${CLICKHOUSE_CLIENT} --rename-files-after-processing="../%f%e" -q "SELECT COUNT(*) FROM file('tmp5.csv')" \ 2>&1| grep "is not inside" > /dev/null && echo "OK" -ls ${CLICKHOUSE_USER_FILES_PATH} | grep "tmp5.csv" +if [ -e "${CLICKHOUSE_USER_FILES_PATH}/tmp5.csv" ]; then + echo "tmp5.csv" +fi # Clean -rm -rd $CLICKHOUSE_USER_FILES_PATH \ No newline at end of file +rm -rd $CLICKHOUSE_USER_FILES_PATH From eb7b48aab20172b79b1573c05c8a4baa02fe0804 Mon Sep 17 00:00:00 2001 From: alekseygolub Date: Tue, 16 May 2023 18:48:39 +0000 Subject: [PATCH 0094/1072] Fix test issues --- .../02732_rename_after_processing.reference | 4 + .../02732_rename_after_processing.sh | 85 +++++++++++-------- 2 files changed, 55 insertions(+), 34 deletions(-) diff --git a/tests/queries/0_stateless/02732_rename_after_processing.reference b/tests/queries/0_stateless/02732_rename_after_processing.reference index 2f6ccfc1c5e..39cdb677e09 100644 --- a/tests/queries/0_stateless/02732_rename_after_processing.reference +++ b/tests/queries/0_stateless/02732_rename_after_processing.reference @@ -15,3 +15,7 @@ OK tmp5.csv OK tmp5.csv +OK +tmp5.csv +OK +tmp5.csv diff --git a/tests/queries/0_stateless/02732_rename_after_processing.sh b/tests/queries/0_stateless/02732_rename_after_processing.sh index 93bad2eac7d..dbf2427d2dc 100755 --- a/tests/queries/0_stateless/02732_rename_after_processing.sh +++ b/tests/queries/0_stateless/02732_rename_after_processing.sh @@ -1,5 +1,4 @@ #!/usr/bin/env bash -# Tags: no-parallel CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh @@ -9,28 +8,30 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) CLICKHOUSE_USER_FILES_PATH=$(clickhouse-client --query "select _path, _file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') # Prepare data -mkdir -p ${CLICKHOUSE_USER_FILES_PATH} -echo '"id","str","int","text"' > ${CLICKHOUSE_USER_FILES_PATH}/tmp.csv -echo '1,"abc",123,"abacaba"' >> ${CLICKHOUSE_USER_FILES_PATH}/tmp.csv -echo '2,"def",456,"bacabaa"' >> ${CLICKHOUSE_USER_FILES_PATH}/tmp.csv -echo '3,"story",78912,"acabaab"' >> ${CLICKHOUSE_USER_FILES_PATH}/tmp.csv -echo '4,"history",21321321,"cabaaba"' >> ${CLICKHOUSE_USER_FILES_PATH}/tmp.csv +unique_name=${CLICKHOUSE_TEST_UNIQUE_NAME} +tmp_dir=${CLICKHOUSE_USER_FILES_PATH}/${unique_name} +mkdir -p $tmp_dir +echo '"id","str","int","text"' > ${tmp_dir}/tmp.csv +echo '1,"abc",123,"abacaba"' >> ${tmp_dir}/tmp.csv +echo '2,"def",456,"bacabaa"' >> ${tmp_dir}/tmp.csv +echo '3,"story",78912,"acabaab"' >> ${tmp_dir}/tmp.csv +echo '4,"history",21321321,"cabaaba"' >> ${tmp_dir}/tmp.csv -cp ${CLICKHOUSE_USER_FILES_PATH}/tmp.csv ${CLICKHOUSE_USER_FILES_PATH}/tmp1.csv -cp ${CLICKHOUSE_USER_FILES_PATH}/tmp.csv ${CLICKHOUSE_USER_FILES_PATH}/tmp2.csv -cp ${CLICKHOUSE_USER_FILES_PATH}/tmp.csv ${CLICKHOUSE_USER_FILES_PATH}/tmp3_1.csv -cp ${CLICKHOUSE_USER_FILES_PATH}/tmp.csv ${CLICKHOUSE_USER_FILES_PATH}/tmp3_2.csv -cp ${CLICKHOUSE_USER_FILES_PATH}/tmp.csv ${CLICKHOUSE_USER_FILES_PATH}/tmp4.csv -cp ${CLICKHOUSE_USER_FILES_PATH}/tmp.csv ${CLICKHOUSE_USER_FILES_PATH}/tmp5.csv +cp ${tmp_dir}/tmp.csv ${tmp_dir}/tmp1.csv +cp ${tmp_dir}/tmp.csv ${tmp_dir}/tmp2.csv +cp ${tmp_dir}/tmp.csv ${tmp_dir}/tmp3_1.csv +cp ${tmp_dir}/tmp.csv ${tmp_dir}/tmp3_2.csv +cp ${tmp_dir}/tmp.csv ${tmp_dir}/tmp4.csv +cp ${tmp_dir}/tmp.csv ${tmp_dir}/tmp5.csv ### Checking that renaming works # simple select -${CLICKHOUSE_CLIENT} --rename-files-after-processing="processed_%f%e" -q "SELECT COUNT(*) FROM file('tmp1.csv')" -if [ -e "${CLICKHOUSE_USER_FILES_PATH}/processed_tmp1.csv" ]; then +${CLICKHOUSE_CLIENT} --rename-files-after-processing="processed_%f%e" -q "SELECT COUNT(*) FROM file('${unique_name}/tmp1.csv')" +if [ -e "${tmp_dir}/processed_tmp1.csv" ]; then echo "processed_tmp1.csv" fi -if [ ! -e "${CLICKHOUSE_USER_FILES_PATH}/tmp1.csv" ]; then +if [ ! -e "${tmp_dir}/tmp1.csv" ]; then echo "!tmp1.csv" fi @@ -39,37 +40,37 @@ ${CLICKHOUSE_CLIENT} --rename-files-after-processing="processed_%f%e" --multilin SELECT sum(a.id) as aid, sum(b.id) as bid -FROM file('tmp2.csv') AS a -INNER JOIN file('tmp2.csv') AS b +FROM file('${unique_name}/tmp2.csv') AS a +INNER JOIN file('${unique_name}/tmp2.csv') AS b ON a.text = b.text """ -if [ -e "${CLICKHOUSE_USER_FILES_PATH}/processed_tmp2.csv" ]; then +if [ -e "${tmp_dir}/processed_tmp2.csv" ]; then echo "processed_tmp2.csv" fi -if [ ! -e "${CLICKHOUSE_USER_FILES_PATH}/tmp2.csv" ]; then +if [ ! -e "${tmp_dir}/tmp2.csv" ]; then echo "!tmp2.csv" fi # rename multiple files -${CLICKHOUSE_CLIENT} --rename-files-after-processing="processed_%f%e" -q "SELECT COUNT(*) FROM file('tmp3*.csv')" -if [ -e "${CLICKHOUSE_USER_FILES_PATH}/processed_tmp3_1.csv" ]; then +${CLICKHOUSE_CLIENT} --rename-files-after-processing="processed_%f%e" -q "SELECT COUNT(*) FROM file('${unique_name}/tmp3*.csv')" +if [ -e "${tmp_dir}/processed_tmp3_1.csv" ]; then echo "processed_tmp3_1.csv" fi -if [ -e "${CLICKHOUSE_USER_FILES_PATH}/processed_tmp3_2.csv" ]; then +if [ -e "${tmp_dir}/processed_tmp3_2.csv" ]; then echo "processed_tmp3_2.csv" fi -if [ ! -e "${CLICKHOUSE_USER_FILES_PATH}/tmp3_1.csv" ]; then +if [ ! -e "${tmp_dir}/tmp3_1.csv" ]; then echo "!tmp3_1.csv" fi -if [ ! -e "${CLICKHOUSE_USER_FILES_PATH}/tmp3_2.csv" ]; then +if [ ! -e "${tmp_dir}/tmp3_2.csv" ]; then echo "!tmp3_2.csv" fi # check timestamp placeholder -${CLICKHOUSE_CLIENT} --rename-files-after-processing="processed_%f_%t.csv" -q "SELECT COUNT(*) FROM file('tmp4.csv')" -# ls ${CLICKHOUSE_USER_FILES_PATH} | grep -E "^processed_tmp4_[0-9]+\.csv$" > /dev/null && echo "OK" +${CLICKHOUSE_CLIENT} --rename-files-after-processing="processed_%f_%t.csv" -q "SELECT COUNT(*) FROM file('${unique_name}/tmp4.csv')" +# ls ${tmp_dir} | grep -E "^processed_tmp4_[0-9]+\.csv$" > /dev/null && echo "OK" rg="processed_tmp4_[0-9]+\.csv" -for x in "${CLICKHOUSE_USER_FILES_PATH}"/processed*; do +for x in "${tmp_dir}"/processed*; do if [[ $x =~ $rg ]]; then echo "OK" break @@ -79,18 +80,34 @@ done ### Checking errors # cannot overwrite an existing file -${CLICKHOUSE_CLIENT} --rename-files-after-processing="tmp.csv" -q "SELECT COUNT(*) FROM file('tmp5.csv')" \ +${CLICKHOUSE_CLIENT} --rename-files-after-processing="tmp.csv" -q "SELECT COUNT(*) FROM file('${unique_name}/tmp5.csv')" \ 2>&1| grep "already exists" > /dev/null && echo "OK" -if [ -e "${CLICKHOUSE_USER_FILES_PATH}/tmp5.csv" ]; then +if [ -e "${tmp_dir}/tmp5.csv" ]; then echo "tmp5.csv" fi -# сannot move file from user_files -${CLICKHOUSE_CLIENT} --rename-files-after-processing="../%f%e" -q "SELECT COUNT(*) FROM file('tmp5.csv')" \ +# сannot move file outside user_files +${CLICKHOUSE_CLIENT} --rename-files-after-processing="../../%f%e" -q "SELECT COUNT(*) FROM file('${unique_name}/tmp5.csv')" \ 2>&1| grep "is not inside" > /dev/null && echo "OK" -if [ -e "${CLICKHOUSE_USER_FILES_PATH}/tmp5.csv" ]; then +if [ -e "${tmp_dir}/tmp5.csv" ]; then + echo "tmp5.csv" +fi + +# check invalid placeholders + +# unknown type of placeholder (%k) +${CLICKHOUSE_CLIENT} --rename-files-after-processing="processed_%f_%k" -q "SELECT COUNT(*) FROM file('${unique_name}/tmp5.csv')" \ + 2>&1| grep "Allowed placeholders only" > /dev/null && echo "OK" +if [ -e "${tmp_dir}/tmp5.csv" ]; then + echo "tmp5.csv" +fi + +# dd number of consecutive percentage signs after replace valid placeholders +${CLICKHOUSE_CLIENT} --rename-files-after-processing="processed_%f_%%%%e" -q "SELECT COUNT(*) FROM file('${unique_name}/tmp5.csv')" \ + 2>&1| grep "Odd number of consecutive percentage signs" > /dev/null && echo "OK" +if [ -e "${tmp_dir}/tmp5.csv" ]; then echo "tmp5.csv" fi # Clean -rm -rd $CLICKHOUSE_USER_FILES_PATH +rm -rd $tmp_dir From 5e33dd5d5c9eace011d4c729e3c760c017955de9 Mon Sep 17 00:00:00 2001 From: alekseygolub Date: Wed, 17 May 2023 19:14:46 +0000 Subject: [PATCH 0095/1072] Added chmod for tmp dir and files in test --- tests/queries/0_stateless/02732_rename_after_processing.sh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/queries/0_stateless/02732_rename_after_processing.sh b/tests/queries/0_stateless/02732_rename_after_processing.sh index dbf2427d2dc..4c87070cd35 100755 --- a/tests/queries/0_stateless/02732_rename_after_processing.sh +++ b/tests/queries/0_stateless/02732_rename_after_processing.sh @@ -11,12 +11,18 @@ CLICKHOUSE_USER_FILES_PATH=$(clickhouse-client --query "select _path, _file from unique_name=${CLICKHOUSE_TEST_UNIQUE_NAME} tmp_dir=${CLICKHOUSE_USER_FILES_PATH}/${unique_name} mkdir -p $tmp_dir +rm -rf ${tmp_dir}/* + +chmod 777 ${tmp_dir} + echo '"id","str","int","text"' > ${tmp_dir}/tmp.csv echo '1,"abc",123,"abacaba"' >> ${tmp_dir}/tmp.csv echo '2,"def",456,"bacabaa"' >> ${tmp_dir}/tmp.csv echo '3,"story",78912,"acabaab"' >> ${tmp_dir}/tmp.csv echo '4,"history",21321321,"cabaaba"' >> ${tmp_dir}/tmp.csv +chmod 777 ${tmp_dir}/tmp.csv + cp ${tmp_dir}/tmp.csv ${tmp_dir}/tmp1.csv cp ${tmp_dir}/tmp.csv ${tmp_dir}/tmp2.csv cp ${tmp_dir}/tmp.csv ${tmp_dir}/tmp3_1.csv From 4550a705f97497607a627f5cca8f265b650abdc1 Mon Sep 17 00:00:00 2001 From: alekseygolub Date: Wed, 17 May 2023 19:49:17 +0000 Subject: [PATCH 0096/1072] Fix style --- tests/queries/0_stateless/02732_rename_after_processing.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02732_rename_after_processing.sh b/tests/queries/0_stateless/02732_rename_after_processing.sh index 4c87070cd35..c4f80d3462b 100755 --- a/tests/queries/0_stateless/02732_rename_after_processing.sh +++ b/tests/queries/0_stateless/02732_rename_after_processing.sh @@ -11,7 +11,7 @@ CLICKHOUSE_USER_FILES_PATH=$(clickhouse-client --query "select _path, _file from unique_name=${CLICKHOUSE_TEST_UNIQUE_NAME} tmp_dir=${CLICKHOUSE_USER_FILES_PATH}/${unique_name} mkdir -p $tmp_dir -rm -rf ${tmp_dir}/* +rm -rf ${tmp_dir:?}/* chmod 777 ${tmp_dir} From 900d50633d9c541eff5ac05bdef19cd84e3d6c16 Mon Sep 17 00:00:00 2001 From: alekseygolub Date: Wed, 17 May 2023 20:15:26 +0000 Subject: [PATCH 0097/1072] retrigger checks From 5368355c659ef2f502ea9787593700c1ae03067d Mon Sep 17 00:00:00 2001 From: alekseygolub Date: Fri, 19 May 2023 16:05:51 +0000 Subject: [PATCH 0098/1072] Marked test as broken due to fail of the new analyzer --- tests/broken_tests.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/broken_tests.txt b/tests/broken_tests.txt index e61c1316e17..cef8f68b210 100644 --- a/tests/broken_tests.txt +++ b/tests/broken_tests.txt @@ -137,3 +137,4 @@ 01600_parts_types_metrics_long 01287_max_execution_speed 02703_row_policy_for_database +02732_rename_after_processing From c93836b9620f2bd424d5f6132404a455c94a39dd Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Fri, 19 May 2023 22:26:53 +0000 Subject: [PATCH 0099/1072] fix --- src/Storages/MergeTree/MergeTreeReadPool.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeReadPool.cpp b/src/Storages/MergeTree/MergeTreeReadPool.cpp index 1de5361a7db..4e95a210bae 100644 --- a/src/Storages/MergeTree/MergeTreeReadPool.cpp +++ b/src/Storages/MergeTree/MergeTreeReadPool.cpp @@ -56,6 +56,13 @@ MergeTreeReadPool::MergeTreeReadPool( , backoff_settings{context_->getSettingsRef()} , backoff_state{threads_} { + /// parts don't contain duplicate MergeTreeDataPart's. + const auto per_part_sum_marks = fillPerPartInfo( + parts_ranges, storage_snapshot, is_part_on_remote_disk, + predict_block_size_bytes, + column_names, virtual_column_names, prewhere_info, + actions_settings, reader_settings, per_part_params); + if (std::ranges::count(is_part_on_remote_disk, true)) { const auto & settings = context_->getSettingsRef(); @@ -82,13 +89,6 @@ MergeTreeReadPool::MergeTreeReadPool( } } - /// parts don't contain duplicate MergeTreeDataPart's. - const auto per_part_sum_marks = fillPerPartInfo( - parts_ranges, storage_snapshot, is_part_on_remote_disk, - predict_block_size_bytes, - column_names, virtual_column_names, prewhere_info, - actions_settings, reader_settings, per_part_params); - fillPerThreadInfo(threads_, sum_marks_, per_part_sum_marks, parts_ranges); } From 7189481fad990824fddeee045c97e66d2cd4985c Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Sat, 20 May 2023 09:00:28 +0200 Subject: [PATCH 0100/1072] Preserve backward incompatibility for renamed settings by using aliases - optimize_use_projections/allow_experimental_projection_optimization - enable_lightweight_delete/allow_experimental_lightweight_delete Signed-off-by: Azat Khuzhin --- src/Core/Settings.h | 6 ++---- .../0_stateless/02319_lightweight_delete_on_merge_tree.sql | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 21dc267749b..b9e728a3ca4 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -507,7 +507,7 @@ class IColumn; M(Bool, optimize_trivial_count_query, true, "Process trivial 'SELECT count() FROM table' query from metadata.", 0) \ M(Bool, optimize_respect_aliases, true, "If it is set to true, it will respect aliases in WHERE/GROUP BY/ORDER BY, that will help with partition pruning/secondary indexes/optimize_aggregation_in_order/optimize_read_in_order/optimize_trivial_count", 0) \ M(UInt64, mutations_sync, 0, "Wait for synchronous execution of ALTER TABLE UPDATE/DELETE queries (mutations). 0 - execute asynchronously. 1 - wait current server. 2 - wait all replicas if they exist.", 0) \ - M(Bool, enable_lightweight_delete, true, "Enable lightweight DELETE mutations for mergetree tables.", 0) \ + M(Bool, enable_lightweight_delete, true, "Enable lightweight DELETE mutations for mergetree tables.", 0) ALIAS(allow_experimental_lightweight_delete) \ M(Bool, optimize_move_functions_out_of_any, false, "Move functions out of aggregate functions 'any', 'anyLast'.", 0) \ M(Bool, optimize_normalize_count_variants, true, "Rewrite aggregate functions that semantically equals to count() as count().", 0) \ M(Bool, optimize_injective_functions_inside_uniq, true, "Delete injective functions of one argument inside uniq*() functions.", 0) \ @@ -557,7 +557,7 @@ class IColumn; M(Bool, asterisk_include_alias_columns, false, "Include ALIAS columns for wildcard query", 0) \ M(Bool, optimize_skip_merged_partitions, false, "Skip partitions with one part with level > 0 in optimize final", 0) \ M(Bool, optimize_on_insert, true, "Do the same transformation for inserted block of data as if merge was done on this block.", 0) \ - M(Bool, optimize_use_projections, true, "Automatically choose projections to perform SELECT query", 0) \ + M(Bool, optimize_use_projections, true, "Automatically choose projections to perform SELECT query", 0) ALIAS(allow_experimental_projection_optimization) \ M(Bool, force_optimize_projection, false, "If projection optimization is enabled, SELECT queries need to use projection", 0) \ M(Bool, async_socket_for_remote, true, "Asynchronously read from socket executing remote query", 0) \ M(Bool, async_query_sending_for_remote, true, "Asynchronously create connections and send query to shards in remote query", 0) \ @@ -764,7 +764,6 @@ class IColumn; MAKE_OBSOLETE(M, Bool, allow_experimental_database_atomic, true) \ MAKE_OBSOLETE(M, Bool, allow_experimental_bigint_types, true) \ MAKE_OBSOLETE(M, Bool, allow_experimental_window_functions, true) \ - MAKE_OBSOLETE(M, Bool, allow_experimental_lightweight_delete, true) \ MAKE_OBSOLETE(M, Bool, allow_experimental_geo_types, true) \ \ MAKE_OBSOLETE(M, Milliseconds, async_insert_stale_timeout_ms, 0) \ @@ -777,7 +776,6 @@ class IColumn; MAKE_OBSOLETE(M, UInt64, merge_tree_clear_old_parts_interval_seconds, 1) \ MAKE_OBSOLETE(M, UInt64, partial_merge_join_optimizations, 0) \ MAKE_OBSOLETE(M, MaxThreads, max_alter_threads, 0) \ - MAKE_OBSOLETE(M, Bool, allow_experimental_projection_optimization, true) \ MAKE_OBSOLETE(M, Bool, allow_experimental_query_cache, true) \ /* moved to config.xml: see also src/Core/ServerSettings.h */ \ MAKE_DEPRECATED_BY_SERVER_CONFIG(M, UInt64, background_buffer_flush_schedule_pool_size, 16) \ diff --git a/tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree.sql b/tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree.sql index 9413c664293..050b8e37722 100644 --- a/tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree.sql +++ b/tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree.sql @@ -110,7 +110,7 @@ DROP TABLE t_proj; CREATE TABLE merge_table_standard_delete(id Int32, name String) ENGINE = MergeTree order by id settings min_bytes_for_wide_part=0; SET allow_experimental_lightweight_delete = false; -DELETE FROM merge_table_standard_delete WHERE id = 10; -- allow_experimental_lightweight_delete=false is now ignored +DELETE FROM merge_table_standard_delete WHERE id = 10; -- { serverError SUPPORT_IS_DISABLED } SET enable_lightweight_delete = false; DELETE FROM merge_table_standard_delete WHERE id = 10; -- { serverError SUPPORT_IS_DISABLED } DROP TABLE merge_table_standard_delete; From 24320f8f93f56aa9a7088c4daf80a066facdc5b6 Mon Sep 17 00:00:00 2001 From: alekseygolub Date: Sun, 21 May 2023 15:58:29 +0000 Subject: [PATCH 0101/1072] fixed bad pattern in tests --- .../0_stateless/02722_database_filesystem.sh | 29 ++++++++++--------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/tests/queries/0_stateless/02722_database_filesystem.sh b/tests/queries/0_stateless/02722_database_filesystem.sh index 80f97af693e..7466141d3e3 100755 --- a/tests/queries/0_stateless/02722_database_filesystem.sh +++ b/tests/queries/0_stateless/02722_database_filesystem.sh @@ -1,5 +1,4 @@ #!/usr/bin/env bash -# Tags: no-parallel CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh @@ -9,19 +8,21 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) CLICKHOUSE_USER_FILES_PATH=$(clickhouse-client --query "select _path, _file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') # Prepare data -mkdir -p ${CLICKHOUSE_USER_FILES_PATH}/tmp/ -echo '"id","str","int","text"' > ${CLICKHOUSE_USER_FILES_PATH}/tmp.csv -echo '1,"abc",123,"abacaba"' >> ${CLICKHOUSE_USER_FILES_PATH}/tmp.csv -echo '2,"def",456,"bacabaa"' >> ${CLICKHOUSE_USER_FILES_PATH}/tmp.csv -echo '3,"story",78912,"acabaab"' >> ${CLICKHOUSE_USER_FILES_PATH}/tmp.csv -echo '4,"history",21321321,"cabaaba"' >> ${CLICKHOUSE_USER_FILES_PATH}/tmp.csv +unique_name=${CLICKHOUSE_TEST_UNIQUE_NAME} +user_files_tmp_dir=${CLICKHOUSE_USER_FILES_PATH}/${unique_name} +mkdir -p ${user_files_tmp_dir}/tmp/ +echo '"id","str","int","text"' > ${user_files_tmp_dir}/tmp.csv +echo '1,"abc",123,"abacaba"' >> ${user_files_tmp_dir}/tmp.csv +echo '2,"def",456,"bacabaa"' >> ${user_files_tmp_dir}/tmp.csv +echo '3,"story",78912,"acabaab"' >> ${user_files_tmp_dir}/tmp.csv +echo '4,"history",21321321,"cabaaba"' >> ${user_files_tmp_dir}/tmp.csv tmp_dir=${CLICKHOUSE_TEST_UNIQUE_NAME} [[ -d $tmp_dir ]] && rm -rd $tmp_dir mkdir $tmp_dir -cp ${CLICKHOUSE_USER_FILES_PATH}/tmp.csv ${tmp_dir}/tmp.csv -cp ${CLICKHOUSE_USER_FILES_PATH}/tmp.csv ${CLICKHOUSE_USER_FILES_PATH}/tmp/tmp.csv -cp ${CLICKHOUSE_USER_FILES_PATH}/tmp.csv ${CLICKHOUSE_USER_FILES_PATH}/tmp.myext +cp ${user_files_tmp_dir}/tmp.csv ${tmp_dir}/tmp.csv +cp ${user_files_tmp_dir}/tmp.csv ${user_files_tmp_dir}/tmp/tmp.csv +cp ${user_files_tmp_dir}/tmp.csv ${user_files_tmp_dir}/tmp.myext ################# echo "Test 1: create filesystem database and check implicit calls" @@ -31,8 +32,8 @@ CREATE DATABASE test1 ENGINE = Filesystem; """ echo $? ${CLICKHOUSE_CLIENT} --query "SHOW DATABASES" | grep "test1" -${CLICKHOUSE_CLIENT} --query "SELECT COUNT(*) FROM test1.\`tmp.csv\`;" -${CLICKHOUSE_CLIENT} --query "SELECT COUNT(*) FROM test1.\`tmp/tmp.csv\`;" +${CLICKHOUSE_CLIENT} --query "SELECT COUNT(*) FROM test1.\`${unique_name}/tmp.csv\`;" +${CLICKHOUSE_CLIENT} --query "SELECT COUNT(*) FROM test1.\`${unique_name}/tmp/tmp.csv\`;" ${CLICKHOUSE_LOCAL} -q "SELECT COUNT(*) FROM \"${tmp_dir}/tmp.csv\"" ################# @@ -62,9 +63,9 @@ CREATE DATABASE test2 ENGINE = Filesystem('relative_unknown_dir'); ${CLICKHOUSE_CLIENT} --query "SELECT COUNT(*) FROM test1.\`tmp2.csv\`;" 2>&1| grep -F "Code: 107" > /dev/null && echo "OK" # BAD_ARGUMENTS: Cannot determine the file format by it's extension -${CLICKHOUSE_CLIENT} --query "SELECT COUNT(*) FROM test1.\`tmp.myext\`;" 2>&1| grep -F "Code: 36" > /dev/null && echo "OK" +${CLICKHOUSE_CLIENT} --query "SELECT COUNT(*) FROM test1.\`${unique_name}/tmp.myext\`;" 2>&1| grep -F "Code: 36" > /dev/null && echo "OK" # Clean ${CLICKHOUSE_CLIENT} --query "DROP DATABASE test1;" rm -rd $tmp_dir -rm -rd $CLICKHOUSE_USER_FILES_PATH +rm -rd $user_files_tmp_dir From 183f90e45a7601e5ad4af63b24dabfc506a637ae Mon Sep 17 00:00:00 2001 From: Nikolay Degterinsky Date: Mon, 22 May 2023 02:02:09 +0000 Subject: [PATCH 0102/1072] Update MongoDB protocol --- .../Foundation/include/Poco/BinaryReader.h | 3 + .../Foundation/include/Poco/BinaryWriter.h | 5 + base/poco/Foundation/src/BinaryReader.cpp | 25 ++ base/poco/Foundation/src/BinaryWriter.cpp | 11 +- base/poco/MongoDB/CMakeLists.txt | 1 + .../poco/MongoDB/include/Poco/MongoDB/Array.h | 36 +- .../MongoDB/include/Poco/MongoDB/Binary.h | 2 +- .../MongoDB/include/Poco/MongoDB/Connection.h | 22 +- .../MongoDB/include/Poco/MongoDB/Cursor.h | 3 + .../MongoDB/include/Poco/MongoDB/Database.h | 83 +++- .../MongoDB/include/Poco/MongoDB/Document.h | 27 +- .../MongoDB/include/Poco/MongoDB/Element.h | 6 +- .../include/Poco/MongoDB/JavaScriptCode.h | 2 +- .../include/Poco/MongoDB/MessageHeader.h | 11 +- .../MongoDB/include/Poco/MongoDB/MongoDB.h | 12 + .../MongoDB/include/Poco/MongoDB/ObjectId.h | 2 +- .../include/Poco/MongoDB/OpMsgCursor.h | 96 ++++ .../include/Poco/MongoDB/OpMsgMessage.h | 163 +++++++ .../Poco/MongoDB/PoolableConnectionFactory.h | 16 + .../include/Poco/MongoDB/RegularExpression.h | 2 +- .../include/Poco/MongoDB/ResponseMessage.h | 3 + base/poco/MongoDB/src/Array.cpp | 4 +- base/poco/MongoDB/src/Connection.cpp | 26 ++ base/poco/MongoDB/src/Cursor.cpp | 6 + base/poco/MongoDB/src/Database.cpp | 48 +- base/poco/MongoDB/src/DeleteRequest.cpp | 4 +- base/poco/MongoDB/src/Document.cpp | 14 +- base/poco/MongoDB/src/Element.cpp | 2 +- base/poco/MongoDB/src/GetMoreRequest.cpp | 2 +- base/poco/MongoDB/src/InsertRequest.cpp | 2 +- base/poco/MongoDB/src/KillCursorsRequest.cpp | 2 +- base/poco/MongoDB/src/Message.cpp | 2 +- base/poco/MongoDB/src/MessageHeader.cpp | 12 +- base/poco/MongoDB/src/ObjectId.cpp | 2 +- base/poco/MongoDB/src/OpMsgCursor.cpp | 187 ++++++++ base/poco/MongoDB/src/OpMsgMessage.cpp | 412 ++++++++++++++++++ base/poco/MongoDB/src/QueryRequest.cpp | 6 +- base/poco/MongoDB/src/RegularExpression.cpp | 4 +- base/poco/MongoDB/src/ReplicaSet.cpp | 6 +- base/poco/MongoDB/src/RequestMessage.cpp | 4 +- base/poco/MongoDB/src/ResponseMessage.cpp | 20 +- base/poco/MongoDB/src/UpdateRequest.cpp | 2 +- .../runner/compose/docker_compose_mongo.yml | 2 +- .../compose/docker_compose_mongo_secure.yml | 2 +- src/Dictionaries/MongoDBDictionarySource.cpp | 15 +- src/Dictionaries/MongoDBDictionarySource.h | 1 - src/Processors/Sources/MongoDBSource.cpp | 76 +++- src/Processors/Sources/MongoDBSource.h | 32 +- src/Storages/StorageMongoDB.cpp | 34 +- .../integration/test_storage_mongodb/test.py | 42 +- 50 files changed, 1399 insertions(+), 103 deletions(-) create mode 100644 base/poco/MongoDB/include/Poco/MongoDB/OpMsgCursor.h create mode 100644 base/poco/MongoDB/include/Poco/MongoDB/OpMsgMessage.h create mode 100644 base/poco/MongoDB/src/OpMsgCursor.cpp create mode 100644 base/poco/MongoDB/src/OpMsgMessage.cpp diff --git a/base/poco/Foundation/include/Poco/BinaryReader.h b/base/poco/Foundation/include/Poco/BinaryReader.h index 4042b507a2f..2b9bca29944 100644 --- a/base/poco/Foundation/include/Poco/BinaryReader.h +++ b/base/poco/Foundation/include/Poco/BinaryReader.h @@ -117,6 +117,9 @@ public: void readRaw(char * buffer, std::streamsize length); /// Reads length bytes of raw data into buffer. + void readCString(std::string& value); + /// Reads zero-terminated C-string into value. + void readBOM(); /// Reads a byte-order mark from the stream and configures /// the reader for the encountered byte order. diff --git a/base/poco/Foundation/include/Poco/BinaryWriter.h b/base/poco/Foundation/include/Poco/BinaryWriter.h index aa280d4ccab..a35d76d84bc 100644 --- a/base/poco/Foundation/include/Poco/BinaryWriter.h +++ b/base/poco/Foundation/include/Poco/BinaryWriter.h @@ -56,6 +56,8 @@ public: LITTLE_ENDIAN_BYTE_ORDER = 3 /// little-endian byte-order }; + static const std::streamsize DEFAULT_MAX_CSTR_LENGTH { 1024 }; + BinaryWriter(std::ostream & ostr, StreamByteOrder byteOrder = NATIVE_BYTE_ORDER); /// Creates the BinaryWriter. @@ -131,6 +133,9 @@ public: void writeRaw(const char * buffer, std::streamsize length); /// Writes length raw bytes from the given buffer to the stream. + void writeCString(const char* cString, std::streamsize maxLength = DEFAULT_MAX_CSTR_LENGTH); + /// Writes zero-terminated C-string. + void writeBOM(); /// Writes a byte-order mark to the stream. A byte order mark is /// a 16-bit integer with a value of 0xFEFF, written in host byte-order. diff --git a/base/poco/Foundation/src/BinaryReader.cpp b/base/poco/Foundation/src/BinaryReader.cpp index f2961e03966..37ec2bc9040 100644 --- a/base/poco/Foundation/src/BinaryReader.cpp +++ b/base/poco/Foundation/src/BinaryReader.cpp @@ -274,6 +274,31 @@ void BinaryReader::readRaw(char* buffer, std::streamsize length) } +void BinaryReader::readCString(std::string& value) +{ + value.clear(); + if (!_istr.good()) + { + return; + } + value.reserve(256); + while (true) + { + char c; + _istr.get(c); + if (!_istr.good()) + { + break; + } + if (c == '\0') + { + break; + } + value += c; + } +} + + void BinaryReader::readBOM() { UInt16 bom; diff --git a/base/poco/Foundation/src/BinaryWriter.cpp b/base/poco/Foundation/src/BinaryWriter.cpp index 6db5ab7cb90..c3fcabc4374 100644 --- a/base/poco/Foundation/src/BinaryWriter.cpp +++ b/base/poco/Foundation/src/BinaryWriter.cpp @@ -271,7 +271,7 @@ BinaryWriter& BinaryWriter::operator << (const std::string& value) BinaryWriter& BinaryWriter::operator << (const char* value) { poco_check_ptr (value); - + if (_pTextConverter) { std::string converted; @@ -332,6 +332,15 @@ void BinaryWriter::writeRaw(const char* buffer, std::streamsize length) } +void BinaryWriter::writeCString(const char* cString, std::streamsize maxLength) +{ + const std::size_t len = ::strnlen(cString, maxLength); + writeRaw(cString, len); + static const char zero = '\0'; + _ostr.write(&zero, sizeof(zero)); +} + + void BinaryWriter::writeBOM() { UInt16 value = 0xFEFF; diff --git a/base/poco/MongoDB/CMakeLists.txt b/base/poco/MongoDB/CMakeLists.txt index 8f5c6be2cae..bb6f90ed8f5 100644 --- a/base/poco/MongoDB/CMakeLists.txt +++ b/base/poco/MongoDB/CMakeLists.txt @@ -13,3 +13,4 @@ target_compile_options (_poco_mongodb target_include_directories (_poco_mongodb SYSTEM PUBLIC "include") target_link_libraries (_poco_mongodb PUBLIC Poco::Net) + diff --git a/base/poco/MongoDB/include/Poco/MongoDB/Array.h b/base/poco/MongoDB/include/Poco/MongoDB/Array.h index 4ed9cdd87ee..8a30c785b2d 100644 --- a/base/poco/MongoDB/include/Poco/MongoDB/Array.h +++ b/base/poco/MongoDB/include/Poco/MongoDB/Array.h @@ -33,7 +33,7 @@ namespace MongoDB /// This class represents a BSON Array. { public: - typedef SharedPtr Ptr; + using Ptr = SharedPtr; Array(); /// Creates an empty Array. @@ -41,8 +41,31 @@ namespace MongoDB virtual ~Array(); /// Destroys the Array. + // Document template functions available for backward compatibility + using Document::add; + using Document::get; + template - T get(int pos) const + Document & add(T value) + /// Creates an element with the name from the current pos and value and + /// adds it to the array document. + /// + /// The active document is returned to allow chaining of the add methods. + { + return Document::add(Poco::NumberFormatter::format(size()), value); + } + + Document & add(const char * value) + /// Creates an element with a name from the current pos and value and + /// adds it to the array document. + /// + /// The active document is returned to allow chaining of the add methods. + { + return Document::add(Poco::NumberFormatter::format(size()), value); + } + + template + T get(std::size_t pos) const /// Returns the element at the given index and tries to convert /// it to the template type. If the element is not found, a /// Poco::NotFoundException will be thrown. If the element cannot be @@ -52,7 +75,7 @@ namespace MongoDB } template - T get(int pos, const T & deflt) const + T get(std::size_t pos, const T & deflt) const /// Returns the element at the given index and tries to convert /// it to the template type. If the element is not found, or /// has the wrong type, the deflt argument will be returned. @@ -60,12 +83,12 @@ namespace MongoDB return Document::get(Poco::NumberFormatter::format(pos), deflt); } - Element::Ptr get(int pos) const; + Element::Ptr get(std::size_t pos) const; /// Returns the element at the given index. /// An empty element will be returned if the element is not found. template - bool isType(int pos) const + bool isType(std::size_t pos) const /// Returns true if the type of the element equals the TypeId of ElementTrait, /// otherwise false. { @@ -74,6 +97,9 @@ namespace MongoDB std::string toString(int indent = 0) const; /// Returns a string representation of the Array. + + private: + friend void BSONReader::read(Array::Ptr & to); }; diff --git a/base/poco/MongoDB/include/Poco/MongoDB/Binary.h b/base/poco/MongoDB/include/Poco/MongoDB/Binary.h index 1005cb000f5..aad8736e8b6 100644 --- a/base/poco/MongoDB/include/Poco/MongoDB/Binary.h +++ b/base/poco/MongoDB/include/Poco/MongoDB/Binary.h @@ -40,7 +40,7 @@ namespace MongoDB /// A Binary stores its data in a Poco::Buffer. { public: - typedef SharedPtr Ptr; + using Ptr = SharedPtr; Binary(); /// Creates an empty Binary with subtype 0. diff --git a/base/poco/MongoDB/include/Poco/MongoDB/Connection.h b/base/poco/MongoDB/include/Poco/MongoDB/Connection.h index dcb813b75bc..cf679d530aa 100644 --- a/base/poco/MongoDB/include/Poco/MongoDB/Connection.h +++ b/base/poco/MongoDB/include/Poco/MongoDB/Connection.h @@ -18,6 +18,7 @@ #define MongoDB_Connection_INCLUDED +#include "Poco/MongoDB/OpMsgMessage.h" #include "Poco/MongoDB/RequestMessage.h" #include "Poco/MongoDB/ResponseMessage.h" #include "Poco/Mutex.h" @@ -39,7 +40,7 @@ namespace MongoDB /// for more information on the wire protocol. { public: - typedef Poco::SharedPtr Ptr; + using Ptr = Poco::SharedPtr; class MongoDB_API SocketFactory { @@ -90,7 +91,7 @@ namespace MongoDB Poco::Net::SocketAddress address() const; /// Returns the address of the MongoDB server. - + const std::string & uri() const; /// Returns the uri on which the connection was made. @@ -145,6 +146,21 @@ namespace MongoDB /// Use this when a response is expected: only a "query" or "getmore" /// request will return a response. + void sendRequest(OpMsgMessage & request, OpMsgMessage & response); + /// Sends a request to the MongoDB server and receives the response + /// using newer wire protocol with OP_MSG. + + void sendRequest(OpMsgMessage & request); + /// Sends an unacknowledged request to the MongoDB server using newer + /// wire protocol with OP_MSG. + /// No response is sent by the server. + + void readResponse(OpMsgMessage & response); + /// Reads additional response data when previous message's flag moreToCome + /// indicates that server will send more data. + /// NOTE: See comments in OpMsgCursor code. + + protected: void connect(); @@ -164,7 +180,7 @@ namespace MongoDB } inline const std::string & Connection::uri() const { - return _uri; + return _uri; } diff --git a/base/poco/MongoDB/include/Poco/MongoDB/Cursor.h b/base/poco/MongoDB/include/Poco/MongoDB/Cursor.h index 4aed9fe64fb..8849d737a62 100644 --- a/base/poco/MongoDB/include/Poco/MongoDB/Cursor.h +++ b/base/poco/MongoDB/include/Poco/MongoDB/Cursor.h @@ -40,6 +40,9 @@ namespace MongoDB Cursor(const std::string & fullCollectionName, QueryRequest::Flags flags = QueryRequest::QUERY_DEFAULT); /// Creates a Cursor for the given database and collection ("database.collection"), using the specified flags. + Cursor(const Document & aggregationResponse); + /// Creates a Cursor for the given aggregation query response. + virtual ~Cursor(); /// Destroys the Cursor. diff --git a/base/poco/MongoDB/include/Poco/MongoDB/Database.h b/base/poco/MongoDB/include/Poco/MongoDB/Database.h index 62aea632b08..3334a673df6 100644 --- a/base/poco/MongoDB/include/Poco/MongoDB/Database.h +++ b/base/poco/MongoDB/include/Poco/MongoDB/Database.h @@ -26,6 +26,8 @@ #include "Poco/MongoDB/QueryRequest.h" #include "Poco/MongoDB/UpdateRequest.h" +#include "Poco/MongoDB/OpMsgCursor.h" +#include "Poco/MongoDB/OpMsgMessage.h" namespace Poco { @@ -45,6 +47,9 @@ namespace MongoDB virtual ~Database(); /// Destroys the Database. + const std::string & name() const; + /// Database name + bool authenticate( Connection & connection, const std::string & username, @@ -62,34 +67,49 @@ namespace MongoDB /// May throw a Poco::ProtocolException if authentication fails for a reason other than /// invalid credentials. + Document::Ptr queryBuildInfo(Connection & connection) const; + /// Queries server build info (all wire protocols) + + Document::Ptr queryServerHello(Connection & connection) const; + /// Queries hello response from server (all wire protocols) + Int64 count(Connection & connection, const std::string & collectionName) const; - /// Sends a count request for the given collection to MongoDB. + /// Sends a count request for the given collection to MongoDB. (old wire protocol) /// /// If the command fails, -1 is returned. Poco::SharedPtr createCommand() const; - /// Creates a QueryRequest for a command. + /// Creates a QueryRequest for a command. (old wire protocol) Poco::SharedPtr createCountRequest(const std::string & collectionName) const; /// Creates a QueryRequest to count the given collection. - /// The collectionname must not contain the database name. + /// The collectionname must not contain the database name. (old wire protocol) Poco::SharedPtr createDeleteRequest(const std::string & collectionName) const; /// Creates a DeleteRequest to delete documents in the given collection. - /// The collectionname must not contain the database name. + /// The collectionname must not contain the database name. (old wire protocol) Poco::SharedPtr createInsertRequest(const std::string & collectionName) const; /// Creates an InsertRequest to insert new documents in the given collection. - /// The collectionname must not contain the database name. + /// The collectionname must not contain the database name. (old wire protocol) Poco::SharedPtr createQueryRequest(const std::string & collectionName) const; - /// Creates a QueryRequest. + /// Creates a QueryRequest. (old wire protocol) /// The collectionname must not contain the database name. Poco::SharedPtr createUpdateRequest(const std::string & collectionName) const; - /// Creates an UpdateRequest. + /// Creates an UpdateRequest. (old wire protocol) /// The collectionname must not contain the database name. + Poco::SharedPtr createOpMsgMessage(const std::string & collectionName) const; + /// Creates OpMsgMessage. (new wire protocol) + + Poco::SharedPtr createOpMsgMessage() const; + /// Creates OpMsgMessage for database commands that do not require collection as an argument. (new wire protocol) + + Poco::SharedPtr createOpMsgCursor(const std::string & collectionName) const; + /// Creates OpMsgCursor. (new wire protocol) + Poco::MongoDB::Document::Ptr ensureIndex( Connection & connection, const std::string & collection, @@ -100,14 +120,16 @@ namespace MongoDB int version = 0, int ttl = 0); /// Creates an index. The document returned is the result of a getLastError call. - /// For more info look at the ensureIndex information on the MongoDB website. + /// For more info look at the ensureIndex information on the MongoDB website. (old wire protocol) Document::Ptr getLastErrorDoc(Connection & connection) const; /// Sends the getLastError command to the database and returns the error document. + /// (old wire protocol) std::string getLastError(Connection & connection) const; /// Sends the getLastError command to the database and returns the err element /// from the error document. When err is null, an empty string is returned. + /// (old wire protocol) static const std::string AUTH_MONGODB_CR; /// Default authentication mechanism prior to MongoDB 3.0. @@ -115,6 +137,27 @@ namespace MongoDB static const std::string AUTH_SCRAM_SHA1; /// Default authentication mechanism for MongoDB 3.0. + enum WireVersion + /// Wire version as reported by the command hello. + /// See details in MongoDB github, repository specifications. + /// @see queryServerHello + { + VER_26 = 1, + VER_26_2 = 2, + VER_30 = 3, + VER_32 = 4, + VER_34 = 5, + VER_36 = 6, ///< First wire version that supports OP_MSG + VER_40 = 7, + VER_42 = 8, + VER_44 = 9, + VER_50 = 13, + VER_51 = 14, ///< First wire version that supports only OP_MSG + VER_52 = 15, + VER_53 = 16, + VER_60 = 17 + }; + protected: bool authCR(Connection & connection, const std::string & username, const std::string & password); bool authSCRAM(Connection & connection, const std::string & username, const std::string & password); @@ -127,6 +170,12 @@ namespace MongoDB // // inlines // + inline const std::string & Database::name() const + { + return _dbname; + } + + inline Poco::SharedPtr Database::createCommand() const { Poco::SharedPtr cmd = createQueryRequest("$cmd"); @@ -158,6 +207,24 @@ namespace MongoDB return new Poco::MongoDB::UpdateRequest(_dbname + '.' + collectionName); } + // -- New wire protocol commands + + inline Poco::SharedPtr Database::createOpMsgMessage(const std::string & collectionName) const + { + return new Poco::MongoDB::OpMsgMessage(_dbname, collectionName); + } + + inline Poco::SharedPtr Database::createOpMsgMessage() const + { + // Collection name for database commands is not needed. + return createOpMsgMessage(""); + } + + inline Poco::SharedPtr Database::createOpMsgCursor(const std::string & collectionName) const + { + return new Poco::MongoDB::OpMsgCursor(_dbname, collectionName); + } + } } // namespace Poco::MongoDB diff --git a/base/poco/MongoDB/include/Poco/MongoDB/Document.h b/base/poco/MongoDB/include/Poco/MongoDB/Document.h index 12889663827..9e1df349e20 100644 --- a/base/poco/MongoDB/include/Poco/MongoDB/Document.h +++ b/base/poco/MongoDB/include/Poco/MongoDB/Document.h @@ -31,6 +31,7 @@ namespace Poco namespace MongoDB { + class Array; class ElementFindByName { @@ -48,8 +49,8 @@ namespace MongoDB /// Represents a MongoDB (BSON) document. { public: - typedef SharedPtr Ptr; - typedef std::vector Vector; + using Ptr = SharedPtr; + using Vector = std::vector; Document(); /// Creates an empty Document. @@ -86,6 +87,10 @@ namespace MongoDB /// Unlike the other add methods, this method returns /// a reference to the new document. + Array & addNewArray(const std::string & name); + /// Create a new array and add it to this document. + /// Method returns a reference to the new array. + void clear(); /// Removes all elements from the document. @@ -95,7 +100,7 @@ namespace MongoDB bool empty() const; /// Returns true if the document doesn't contain any documents. - bool exists(const std::string & name); + bool exists(const std::string & name) const; /// Returns true if the document has an element with the given name. template @@ -158,6 +163,9 @@ namespace MongoDB /// return an Int64. When the element is not found, a /// Poco::NotFoundException will be thrown. + bool remove(const std::string & name); + /// Removes an element from the document. + template bool isType(const std::string & name) const /// Returns true when the type of the element equals the TypeId of ElementTrait. @@ -227,12 +235,23 @@ namespace MongoDB } - inline bool Document::exists(const std::string & name) + inline bool Document::exists(const std::string & name) const { return std::find_if(_elements.begin(), _elements.end(), ElementFindByName(name)) != _elements.end(); } + inline bool Document::remove(const std::string & name) + { + auto it = std::find_if(_elements.begin(), _elements.end(), ElementFindByName(name)); + if (it == _elements.end()) + return false; + + _elements.erase(it); + return true; + } + + inline std::size_t Document::size() const { return _elements.size(); diff --git a/base/poco/MongoDB/include/Poco/MongoDB/Element.h b/base/poco/MongoDB/include/Poco/MongoDB/Element.h index b5592bd0e0b..26525d7d02b 100644 --- a/base/poco/MongoDB/include/Poco/MongoDB/Element.h +++ b/base/poco/MongoDB/include/Poco/MongoDB/Element.h @@ -45,7 +45,7 @@ namespace MongoDB /// Represents an Element of a Document or an Array. { public: - typedef Poco::SharedPtr Ptr; + using Ptr = Poco::SharedPtr; explicit Element(const std::string & name); /// Creates the Element with the given name. @@ -80,7 +80,7 @@ namespace MongoDB } - typedef std::list ElementSet; + using ElementSet = std::list; template @@ -266,7 +266,7 @@ namespace MongoDB } - typedef Nullable NullValue; + using NullValue = Nullable; // BSON Null Value diff --git a/base/poco/MongoDB/include/Poco/MongoDB/JavaScriptCode.h b/base/poco/MongoDB/include/Poco/MongoDB/JavaScriptCode.h index df1edc16817..c0f584b7c19 100644 --- a/base/poco/MongoDB/include/Poco/MongoDB/JavaScriptCode.h +++ b/base/poco/MongoDB/include/Poco/MongoDB/JavaScriptCode.h @@ -35,7 +35,7 @@ namespace MongoDB /// Represents JavaScript type in BSON. { public: - typedef SharedPtr Ptr; + using Ptr = SharedPtr; JavaScriptCode(); /// Creates an empty JavaScriptCode object. diff --git a/base/poco/MongoDB/include/Poco/MongoDB/MessageHeader.h b/base/poco/MongoDB/include/Poco/MongoDB/MessageHeader.h index 2b88e30fc74..98f45e876c1 100644 --- a/base/poco/MongoDB/include/Poco/MongoDB/MessageHeader.h +++ b/base/poco/MongoDB/include/Poco/MongoDB/MessageHeader.h @@ -28,6 +28,9 @@ namespace MongoDB { + class Message; // Required to disambiguate friend declaration in MessageHeader. + + class MongoDB_API MessageHeader /// Represents the message header which is always prepended to a /// MongoDB request or response message. @@ -37,14 +40,18 @@ namespace MongoDB enum OpCode { + // Opcodes deprecated in MongoDB 5.0 OP_REPLY = 1, - OP_MSG = 1000, OP_UPDATE = 2001, OP_INSERT = 2002, OP_QUERY = 2004, OP_GET_MORE = 2005, OP_DELETE = 2006, - OP_KILL_CURSORS = 2007 + OP_KILL_CURSORS = 2007, + + /// Opcodes supported in MongoDB 5.1 and later + OP_COMPRESSED = 2012, + OP_MSG = 2013 }; explicit MessageHeader(OpCode); diff --git a/base/poco/MongoDB/include/Poco/MongoDB/MongoDB.h b/base/poco/MongoDB/include/Poco/MongoDB/MongoDB.h index 253f1f8ab27..de246ddc9dd 100644 --- a/base/poco/MongoDB/include/Poco/MongoDB/MongoDB.h +++ b/base/poco/MongoDB/include/Poco/MongoDB/MongoDB.h @@ -33,6 +33,13 @@ // +#if defined(_WIN32) && defined(POCO_DLL) +# if defined(MongoDB_EXPORTS) +# define MongoDB_API __declspec(dllexport) +# else +# define MongoDB_API __declspec(dllimport) +# endif +#endif #if !defined(MongoDB_API) @@ -47,6 +54,11 @@ // // Automatically link MongoDB library. // +#if defined(_MSC_VER) +# if !defined(POCO_NO_AUTOMATIC_LIBS) && !defined(MongoDB_EXPORTS) +# pragma comment(lib, "PocoMongoDB" POCO_LIB_SUFFIX) +# endif +#endif #endif // MongoDBMongoDB_INCLUDED diff --git a/base/poco/MongoDB/include/Poco/MongoDB/ObjectId.h b/base/poco/MongoDB/include/Poco/MongoDB/ObjectId.h index 76bb190db48..8a335320ea0 100644 --- a/base/poco/MongoDB/include/Poco/MongoDB/ObjectId.h +++ b/base/poco/MongoDB/include/Poco/MongoDB/ObjectId.h @@ -44,7 +44,7 @@ namespace MongoDB /// as its value. { public: - typedef SharedPtr Ptr; + using Ptr = SharedPtr; explicit ObjectId(const std::string & id); /// Creates an ObjectId from a string. diff --git a/base/poco/MongoDB/include/Poco/MongoDB/OpMsgCursor.h b/base/poco/MongoDB/include/Poco/MongoDB/OpMsgCursor.h new file mode 100644 index 00000000000..a465a71bb1c --- /dev/null +++ b/base/poco/MongoDB/include/Poco/MongoDB/OpMsgCursor.h @@ -0,0 +1,96 @@ +// +// OpMsgCursor.h +// +// Library: MongoDB +// Package: MongoDB +// Module: OpMsgCursor +// +// Definition of the OpMsgCursor class. +// +// Copyright (c) 2012, Applied Informatics Software Engineering GmbH. +// and Contributors. +// +// SPDX-License-Identifier: BSL-1.0 +// + + +#ifndef MongoDB_OpMsgCursor_INCLUDED +#define MongoDB_OpMsgCursor_INCLUDED + + +#include "Poco/MongoDB/Connection.h" +#include "Poco/MongoDB/MongoDB.h" +#include "Poco/MongoDB/OpMsgMessage.h" + +namespace Poco +{ +namespace MongoDB +{ + + + class MongoDB_API OpMsgCursor : public Document + /// OpMsgCursor is an helper class for querying multiple documents using OpMsgMessage. + { + public: + OpMsgCursor(const std::string & dbname, const std::string & collectionName); + /// Creates a OpMsgCursor for the given database and collection. + + virtual ~OpMsgCursor(); + /// Destroys the OpMsgCursor. + + void setEmptyFirstBatch(bool empty); + /// Empty first batch is used to get error response faster with little server processing + + bool emptyFirstBatch() const; + + void setBatchSize(Int32 batchSize); + /// Set non-default batch size + + Int32 batchSize() const; + /// Current batch size (zero or negative number indicates default batch size) + + Int64 cursorID() const; + + OpMsgMessage & next(Connection & connection); + /// Tries to get the next documents. As long as response message has a + /// cursor ID next can be called to retrieve the next bunch of documents. + /// + /// The cursor must be killed (see kill()) when not all documents are needed. + + OpMsgMessage & query(); + /// Returns the associated query. + + void kill(Connection & connection); + /// Kills the cursor and reset it so that it can be reused. + + private: + OpMsgMessage _query; + OpMsgMessage _response; + + bool _emptyFirstBatch{false}; + Int32 _batchSize{-1}; + /// Batch size used in the cursor. Zero or negative value means that default shall be used. + + Int64 _cursorID{0}; + }; + + + // + // inlines + // + inline OpMsgMessage & OpMsgCursor::query() + { + return _query; + } + + inline Int64 OpMsgCursor::cursorID() const + { + return _cursorID; + } + + +} +} // namespace Poco::MongoDB + + +#endif // MongoDB_OpMsgCursor_INCLUDED diff --git a/base/poco/MongoDB/include/Poco/MongoDB/OpMsgMessage.h b/base/poco/MongoDB/include/Poco/MongoDB/OpMsgMessage.h new file mode 100644 index 00000000000..699c7fc4e12 --- /dev/null +++ b/base/poco/MongoDB/include/Poco/MongoDB/OpMsgMessage.h @@ -0,0 +1,163 @@ +// +// OpMsgMessage.h +// +// Library: MongoDB +// Package: MongoDB +// Module: OpMsgMessage +// +// Definition of the OpMsgMessage class. +// +// Copyright (c) 2022, Applied Informatics Software Engineering GmbH. +// and Contributors. +// +// SPDX-License-Identifier: BSL-1.0 +// + + +#ifndef MongoDB_OpMsgMessage_INCLUDED +#define MongoDB_OpMsgMessage_INCLUDED + + +#include "Poco/MongoDB/Document.h" +#include "Poco/MongoDB/Message.h" +#include "Poco/MongoDB/MongoDB.h" + +#include + +namespace Poco +{ +namespace MongoDB +{ + + + class MongoDB_API OpMsgMessage : public Message + /// This class represents a request/response (OP_MSG) to send requests and receive responses to/from MongoDB. + { + public: + // Constants for most often used MongoDB commands that can be sent using OP_MSG + // For complete list see: https://www.mongodb.com/docs/manual/reference/command/ + + // Query and write + static const std::string CMD_INSERT; + static const std::string CMD_DELETE; + static const std::string CMD_UPDATE; + static const std::string CMD_FIND; + static const std::string CMD_FIND_AND_MODIFY; + static const std::string CMD_GET_MORE; + + // Aggregation + static const std::string CMD_AGGREGATE; + static const std::string CMD_COUNT; + static const std::string CMD_DISTINCT; + static const std::string CMD_MAP_REDUCE; + + // Replication and administration + static const std::string CMD_HELLO; + static const std::string CMD_REPL_SET_GET_STATUS; + static const std::string CMD_REPL_SET_GET_CONFIG; + + static const std::string CMD_CREATE; + static const std::string CMD_CREATE_INDEXES; + static const std::string CMD_DROP; + static const std::string CMD_DROP_DATABASE; + static const std::string CMD_KILL_CURSORS; + static const std::string CMD_LIST_DATABASES; + static const std::string CMD_LIST_INDEXES; + + // Diagnostic + static const std::string CMD_BUILD_INFO; + static const std::string CMD_COLL_STATS; + static const std::string CMD_DB_STATS; + static const std::string CMD_HOST_INFO; + + + enum Flags : UInt32 + { + MSG_FLAGS_DEFAULT = 0, + + MSG_CHECKSUM_PRESENT = (1 << 0), + + MSG_MORE_TO_COME = (1 << 1), + /// Sender will send another message and is not prepared for overlapping messages + + MSG_EXHAUST_ALLOWED = (1 << 16) + /// Client is prepared for multiple replies (using the moreToCome bit) to this request + }; + + OpMsgMessage(); + /// Creates an OpMsgMessage for response. + + OpMsgMessage(const std::string & databaseName, const std::string & collectionName, UInt32 flags = MSG_FLAGS_DEFAULT); + /// Creates an OpMsgMessage for requests. + + virtual ~OpMsgMessage(); + + const std::string & databaseName() const; + + const std::string & collectionName() const; + + void setCommandName(const std::string & command); + /// Sets the command name and clears the command document + + void setCursor(Poco::Int64 cursorID, Poco::Int32 batchSize = -1); + /// Sets the command "getMore" for the cursor id with batch size (if it is not negative). + + const std::string & commandName() const; + /// Current command name. + + void setAcknowledgedRequest(bool ack); + /// Set false to create request that does not return response. + /// It has effect only for commands that write or delete documents. + /// Default is true (request returns acknowledge response). + + bool acknowledgedRequest() const; + + UInt32 flags() const; + + Document & body(); + /// Access to body document. + /// Additional query arguments shall be added after setting the command name. + + const Document & body() const; + + Document::Vector & documents(); + /// Documents prepared for request or retrieved in response. + + const Document::Vector & documents() const; + /// Documents prepared for request or retrieved in response. + + bool responseOk() const; + /// Reads "ok" status from the response message. + + void clear(); + /// Clears the message. + + void send(std::ostream & ostr); + /// Writes the request to stream. + + void read(std::istream & istr); + /// Reads the response from the stream. + + private: + enum PayloadType : UInt8 + { + PAYLOAD_TYPE_0 = 0, + PAYLOAD_TYPE_1 = 1 + }; + + std::string _databaseName; + std::string _collectionName; + UInt32 _flags{MSG_FLAGS_DEFAULT}; + std::string _commandName; + bool _acknowledged{true}; + + Document _body; + Document::Vector _documents; + }; + + +} +} // namespace Poco::MongoDB + + +#endif // MongoDB_OpMsgMessage_INCLUDED diff --git a/base/poco/MongoDB/include/Poco/MongoDB/PoolableConnectionFactory.h b/base/poco/MongoDB/include/Poco/MongoDB/PoolableConnectionFactory.h index 9d35c728e5e..53f4a5127ef 100644 --- a/base/poco/MongoDB/include/Poco/MongoDB/PoolableConnectionFactory.h +++ b/base/poco/MongoDB/include/Poco/MongoDB/PoolableConnectionFactory.h @@ -94,7 +94,23 @@ namespace MongoDB operator Connection::Ptr() { return _connection; } +#if defined(POCO_ENABLE_CPP11) + // Disable copy to prevent unwanted release of resources: C++11 way + PooledConnection(const PooledConnection &) = delete; + PooledConnection & operator=(const PooledConnection &) = delete; + + // Enable move semantics + PooledConnection(PooledConnection && other) = default; + PooledConnection & operator=(PooledConnection &&) = default; +#endif + private: +#if !defined(POCO_ENABLE_CPP11) + // Disable copy to prevent unwanted release of resources: pre C++11 way + PooledConnection(const PooledConnection &); + PooledConnection & operator=(const PooledConnection &); +#endif + Poco::ObjectPool & _pool; Connection::Ptr _connection; }; diff --git a/base/poco/MongoDB/include/Poco/MongoDB/RegularExpression.h b/base/poco/MongoDB/include/Poco/MongoDB/RegularExpression.h index b9a8694d321..244b8c14163 100644 --- a/base/poco/MongoDB/include/Poco/MongoDB/RegularExpression.h +++ b/base/poco/MongoDB/include/Poco/MongoDB/RegularExpression.h @@ -33,7 +33,7 @@ namespace MongoDB /// Represents a regular expression in BSON format. { public: - typedef SharedPtr Ptr; + using Ptr = SharedPtr; RegularExpression(); /// Creates an empty RegularExpression. diff --git a/base/poco/MongoDB/include/Poco/MongoDB/ResponseMessage.h b/base/poco/MongoDB/include/Poco/MongoDB/ResponseMessage.h index 132859cc75f..9cb92cb16c4 100644 --- a/base/poco/MongoDB/include/Poco/MongoDB/ResponseMessage.h +++ b/base/poco/MongoDB/include/Poco/MongoDB/ResponseMessage.h @@ -38,6 +38,9 @@ namespace MongoDB ResponseMessage(); /// Creates an empty ResponseMessage. + ResponseMessage(const Int64 & cursorID); + /// Creates an ResponseMessage for existing cursor ID. + virtual ~ResponseMessage(); /// Destroys the ResponseMessage. diff --git a/base/poco/MongoDB/src/Array.cpp b/base/poco/MongoDB/src/Array.cpp index c6d96d1371d..6fff0994d82 100644 --- a/base/poco/MongoDB/src/Array.cpp +++ b/base/poco/MongoDB/src/Array.cpp @@ -20,7 +20,7 @@ namespace Poco { namespace MongoDB { -Array::Array(): +Array::Array(): Document() { } @@ -31,7 +31,7 @@ Array::~Array() } -Element::Ptr Array::get(int pos) const +Element::Ptr Array::get(std::size_t pos) const { std::string name = Poco::NumberFormatter::format(pos); return Document::get(name); diff --git a/base/poco/MongoDB/src/Connection.cpp b/base/poco/MongoDB/src/Connection.cpp index 38c31d2250a..fa20887054b 100644 --- a/base/poco/MongoDB/src/Connection.cpp +++ b/base/poco/MongoDB/src/Connection.cpp @@ -319,4 +319,30 @@ void Connection::sendRequest(RequestMessage& request, ResponseMessage& response) } +void Connection::sendRequest(OpMsgMessage& request, OpMsgMessage& response) +{ + Poco::Net::SocketOutputStream sos(_socket); + request.send(sos); + + response.clear(); + readResponse(response); +} + + +void Connection::sendRequest(OpMsgMessage& request) +{ + request.setAcknowledgedRequest(false); + Poco::Net::SocketOutputStream sos(_socket); + request.send(sos); +} + + +void Connection::readResponse(OpMsgMessage& response) +{ + Poco::Net::SocketInputStream sis(_socket); + response.read(sis); +} + + + } } // Poco::MongoDB diff --git a/base/poco/MongoDB/src/Cursor.cpp b/base/poco/MongoDB/src/Cursor.cpp index 69031e0ab65..ef7a4ca961d 100644 --- a/base/poco/MongoDB/src/Cursor.cpp +++ b/base/poco/MongoDB/src/Cursor.cpp @@ -33,6 +33,12 @@ Cursor::Cursor(const std::string& fullCollectionName, QueryRequest::Flags flags) } +Cursor::Cursor(const Document& aggregationResponse) : + _query(aggregationResponse.get("cursor")->get("ns")), + _response(aggregationResponse.get("cursor")->get("id")) +{ +} + Cursor::~Cursor() { try diff --git a/base/poco/MongoDB/src/Database.cpp b/base/poco/MongoDB/src/Database.cpp index 2b31523bdc4..1a0d3cfe559 100644 --- a/base/poco/MongoDB/src/Database.cpp +++ b/base/poco/MongoDB/src/Database.cpp @@ -334,6 +334,50 @@ bool Database::authSCRAM(Connection& connection, const std::string& username, co } +Document::Ptr Database::queryBuildInfo(Connection& connection) const +{ + // build info can be issued on "config" system database + Poco::SharedPtr request = createCommand(); + request->selector().add("buildInfo", 1); + + Poco::MongoDB::ResponseMessage response; + connection.sendRequest(*request, response); + + Document::Ptr buildInfo; + if ( response.documents().size() > 0 ) + { + buildInfo = response.documents()[0]; + } + else + { + throw Poco::ProtocolException("Didn't get a response from the buildinfo command"); + } + return buildInfo; +} + + +Document::Ptr Database::queryServerHello(Connection& connection) const +{ + // hello can be issued on "config" system database + Poco::SharedPtr request = createCommand(); + request->selector().add("hello", 1); + + Poco::MongoDB::ResponseMessage response; + connection.sendRequest(*request, response); + + Document::Ptr hello; + if ( response.documents().size() > 0 ) + { + hello = response.documents()[0]; + } + else + { + throw Poco::ProtocolException("Didn't get a response from the hello command"); + } + return hello; +} + + Int64 Database::count(Connection& connection, const std::string& collectionName) const { Poco::SharedPtr countRequest = createCountRequest(collectionName); @@ -390,7 +434,7 @@ Document::Ptr Database::getLastErrorDoc(Connection& connection) const { Document::Ptr errorDoc; - Poco::SharedPtr request = createQueryRequest("$cmd"); + Poco::SharedPtr request = createCommand(); request->setNumberToReturn(1); request->selector().add("getLastError", 1); @@ -420,7 +464,7 @@ std::string Database::getLastError(Connection& connection) const Poco::SharedPtr Database::createCountRequest(const std::string& collectionName) const { - Poco::SharedPtr request = createQueryRequest("$cmd"); + Poco::SharedPtr request = createCommand(); request->setNumberToReturn(1); request->selector().add("count", collectionName); return request; diff --git a/base/poco/MongoDB/src/DeleteRequest.cpp b/base/poco/MongoDB/src/DeleteRequest.cpp index 67a88c33302..ba75beb55fb 100644 --- a/base/poco/MongoDB/src/DeleteRequest.cpp +++ b/base/poco/MongoDB/src/DeleteRequest.cpp @@ -20,8 +20,8 @@ namespace MongoDB { DeleteRequest::DeleteRequest(const std::string& collectionName, DeleteRequest::Flags flags): - RequestMessage(MessageHeader::OP_DELETE), - _flags(flags), + RequestMessage(MessageHeader::OP_DELETE), + _flags(flags), _fullCollectionName(collectionName), _selector() { diff --git a/base/poco/MongoDB/src/Document.cpp b/base/poco/MongoDB/src/Document.cpp index 114fc993891..f7c5c9c5dc6 100644 --- a/base/poco/MongoDB/src/Document.cpp +++ b/base/poco/MongoDB/src/Document.cpp @@ -35,6 +35,14 @@ Document::~Document() } +Array& Document::addNewArray(const std::string& name) +{ + Array::Ptr newArray = new Array(); + add(name, newArray); + return *newArray; +} + + Element::Ptr Document::get(const std::string& name) const { Element::Ptr element; @@ -84,7 +92,7 @@ void Document::read(BinaryReader& reader) while (type != '\0') { Element::Ptr element; - + std::string name = BSONReader(reader).readCString(); switch (type) @@ -198,7 +206,7 @@ void Document::write(BinaryWriter& writer) else { std::stringstream sstream; - Poco::BinaryWriter tempWriter(sstream); + Poco::BinaryWriter tempWriter(sstream, BinaryWriter::LITTLE_ENDIAN_BYTE_ORDER); for (ElementSet::iterator it = _elements.begin(); it != _elements.end(); ++it) { tempWriter << static_cast((*it)->type()); @@ -207,7 +215,7 @@ void Document::write(BinaryWriter& writer) element->write(tempWriter); } tempWriter.flush(); - + Poco::Int32 len = static_cast(5 + sstream.tellp()); /* 5 = sizeof(len) + 0-byte */ writer << len; writer.writeRaw(sstream.str()); diff --git a/base/poco/MongoDB/src/Element.cpp b/base/poco/MongoDB/src/Element.cpp index 89629e0503e..f91ce264493 100644 --- a/base/poco/MongoDB/src/Element.cpp +++ b/base/poco/MongoDB/src/Element.cpp @@ -24,7 +24,7 @@ Element::Element(const std::string& name) : _name(name) } -Element::~Element() +Element::~Element() { } diff --git a/base/poco/MongoDB/src/GetMoreRequest.cpp b/base/poco/MongoDB/src/GetMoreRequest.cpp index f8a6b73c6ad..2c1f6909eb7 100644 --- a/base/poco/MongoDB/src/GetMoreRequest.cpp +++ b/base/poco/MongoDB/src/GetMoreRequest.cpp @@ -21,7 +21,7 @@ namespace MongoDB { GetMoreRequest::GetMoreRequest(const std::string& collectionName, Int64 cursorID): - RequestMessage(MessageHeader::OP_GET_MORE), + RequestMessage(MessageHeader::OP_GET_MORE), _fullCollectionName(collectionName), _numberToReturn(100), _cursorID(cursorID) diff --git a/base/poco/MongoDB/src/InsertRequest.cpp b/base/poco/MongoDB/src/InsertRequest.cpp index ec8dc9cf94a..65be5654b3e 100644 --- a/base/poco/MongoDB/src/InsertRequest.cpp +++ b/base/poco/MongoDB/src/InsertRequest.cpp @@ -20,7 +20,7 @@ namespace MongoDB { InsertRequest::InsertRequest(const std::string& collectionName, Flags flags): - RequestMessage(MessageHeader::OP_INSERT), + RequestMessage(MessageHeader::OP_INSERT), _flags(flags), _fullCollectionName(collectionName) { diff --git a/base/poco/MongoDB/src/KillCursorsRequest.cpp b/base/poco/MongoDB/src/KillCursorsRequest.cpp index 6baa0e0be8f..448002aa16a 100644 --- a/base/poco/MongoDB/src/KillCursorsRequest.cpp +++ b/base/poco/MongoDB/src/KillCursorsRequest.cpp @@ -37,7 +37,7 @@ void KillCursorsRequest::buildRequest(BinaryWriter& writer) for (std::vector::iterator it = _cursors.begin(); it != _cursors.end(); ++it) { writer << *it; - } + } } diff --git a/base/poco/MongoDB/src/Message.cpp b/base/poco/MongoDB/src/Message.cpp index c29d282d15a..7b1cb23bab6 100644 --- a/base/poco/MongoDB/src/Message.cpp +++ b/base/poco/MongoDB/src/Message.cpp @@ -19,7 +19,7 @@ namespace Poco { namespace MongoDB { -Message::Message(MessageHeader::OpCode opcode): +Message::Message(MessageHeader::OpCode opcode): _header(opcode) { } diff --git a/base/poco/MongoDB/src/MessageHeader.cpp b/base/poco/MongoDB/src/MessageHeader.cpp index 222121243db..b472bcec465 100644 --- a/base/poco/MongoDB/src/MessageHeader.cpp +++ b/base/poco/MongoDB/src/MessageHeader.cpp @@ -20,10 +20,10 @@ namespace Poco { namespace MongoDB { -MessageHeader::MessageHeader(OpCode opCode): - _messageLength(0), - _requestID(0), - _responseTo(0), +MessageHeader::MessageHeader(OpCode opCode): + _messageLength(0), + _requestID(0), + _responseTo(0), _opCode(opCode) { } @@ -42,7 +42,7 @@ void MessageHeader::read(BinaryReader& reader) Int32 opCode; reader >> opCode; - _opCode = (OpCode) opCode; + _opCode = static_cast(opCode); if (!reader.good()) { @@ -56,7 +56,7 @@ void MessageHeader::write(BinaryWriter& writer) writer << _messageLength; writer << _requestID; writer << _responseTo; - writer << (Int32) _opCode; + writer << static_cast(_opCode); } diff --git a/base/poco/MongoDB/src/ObjectId.cpp b/base/poco/MongoDB/src/ObjectId.cpp index 3065a2ffc30..0125c246c2d 100644 --- a/base/poco/MongoDB/src/ObjectId.cpp +++ b/base/poco/MongoDB/src/ObjectId.cpp @@ -32,7 +32,7 @@ ObjectId::ObjectId(const std::string& id) poco_assert_dbg(id.size() == 24); const char* p = id.c_str(); - for (std::size_t i = 0; i < 12; ++i) + for (std::size_t i = 0; i < 12; ++i) { _id[i] = fromHex(p); p += 2; diff --git a/base/poco/MongoDB/src/OpMsgCursor.cpp b/base/poco/MongoDB/src/OpMsgCursor.cpp new file mode 100644 index 00000000000..bc95851ae33 --- /dev/null +++ b/base/poco/MongoDB/src/OpMsgCursor.cpp @@ -0,0 +1,187 @@ +// +// OpMsgCursor.cpp +// +// Library: MongoDB +// Package: MongoDB +// Module: OpMsgCursor +// +// Copyright (c) 2022, Applied Informatics Software Engineering GmbH. +// and Contributors. +// +// SPDX-License-Identifier: BSL-1.0 +// + + +#include "Poco/MongoDB/OpMsgCursor.h" +#include "Poco/MongoDB/Array.h" + +// +// NOTE: +// +// MongoDB specification indicates that the flag MSG_EXHAUST_ALLOWED shall be +// used in the request when the receiver is ready to receive multiple messages +// without sending additional requests in between. Sender (MongoDB) indicates +// that more messages follow with flag MSG_MORE_TO_COME. +// +// It seems that this does not work properly. MSG_MORE_TO_COME is set and reading +// next messages sometimes works, however often the data is missing in response +// or the message header contains wrong message length and reading blocks. +// Opcode in the header is correct. +// +// Using MSG_EXHAUST_ALLOWED is therefore currently disabled. +// +// It seems that related JIRA ticket is: +// +// https://jira.mongodb.org/browse/SERVER-57297 +// +// https://github.com/mongodb/specifications/blob/master/source/message/OP_MSG.rst +// + +#define MONGODB_EXHAUST_ALLOWED_WORKS false + +namespace Poco { +namespace MongoDB { + + +static const std::string keyCursor {"cursor"}; +static const std::string keyFirstBatch {"firstBatch"}; +static const std::string keyNextBatch {"nextBatch"}; + +static Poco::Int64 cursorIdFromResponse(const MongoDB::Document& doc); + + +OpMsgCursor::OpMsgCursor(const std::string& db, const std::string& collection): +#if MONGODB_EXHAUST_ALLOWED_WORKS + _query(db, collection, OpMsgMessage::MSG_EXHAUST_ALLOWED) +#else + _query(db, collection) +#endif +{ +} + +OpMsgCursor::~OpMsgCursor() +{ + try + { + poco_assert_dbg(_cursorID == 0); + } + catch (...) + { + } +} + + +void OpMsgCursor::setEmptyFirstBatch(bool empty) +{ + _emptyFirstBatch = empty; +} + + +bool OpMsgCursor::emptyFirstBatch() const +{ + return _emptyFirstBatch; +} + + +void OpMsgCursor::setBatchSize(Int32 batchSize) +{ + _batchSize = batchSize; +} + + +Int32 OpMsgCursor::batchSize() const +{ + return _batchSize; +} + + +OpMsgMessage& OpMsgCursor::next(Connection& connection) +{ + if (_cursorID == 0) + { + _response.clear(); + + if (_emptyFirstBatch || _batchSize > 0) + { + Int32 bsize = _emptyFirstBatch ? 0 : _batchSize; + if (_query.commandName() == OpMsgMessage::CMD_FIND) + { + _query.body().add("batchSize", bsize); + } + else if (_query.commandName() == OpMsgMessage::CMD_AGGREGATE) + { + auto& cursorDoc = _query.body().addNewDocument("cursor"); + cursorDoc.add("batchSize", bsize); + } + } + + connection.sendRequest(_query, _response); + + const auto& rdoc = _response.body(); + _cursorID = cursorIdFromResponse(rdoc); + } + else + { +#if MONGODB_EXHAUST_ALLOWED_WORKS + std::cout << "Response flags: " << _response.flags() << std::endl; + if (_response.flags() & OpMsgMessage::MSG_MORE_TO_COME) + { + std::cout << "More to come. Reading more response: " << std::endl; + _response.clear(); + connection.readResponse(_response); + } + else +#endif + { + _response.clear(); + _query.setCursor(_cursorID, _batchSize); + connection.sendRequest(_query, _response); + } + } + + const auto& rdoc = _response.body(); + _cursorID = cursorIdFromResponse(rdoc); + + return _response; +} + + +void OpMsgCursor::kill(Connection& connection) +{ + _response.clear(); + if (_cursorID != 0) + { + _query.setCommandName(OpMsgMessage::CMD_KILL_CURSORS); + + MongoDB::Array::Ptr cursors = new MongoDB::Array(); + cursors->add(_cursorID); + _query.body().add("cursors", cursors); + + connection.sendRequest(_query, _response); + + const auto killed = _response.body().get("cursorsKilled", nullptr); + if (!killed || killed->size() != 1 || killed->get(0, -1) != _cursorID) + { + throw Poco::ProtocolException("Cursor not killed as expected: " + std::to_string(_cursorID)); + } + + _cursorID = 0; + _query.clear(); + _response.clear(); + } +} + + +Poco::Int64 cursorIdFromResponse(const MongoDB::Document& doc) +{ + Poco::Int64 id {0}; + auto cursorDoc = doc.get(keyCursor, nullptr); + if(cursorDoc) + { + id = cursorDoc->get("id", 0); + } + return id; +} + + +} } // Namespace Poco::MongoDB diff --git a/base/poco/MongoDB/src/OpMsgMessage.cpp b/base/poco/MongoDB/src/OpMsgMessage.cpp new file mode 100644 index 00000000000..2b55772ca59 --- /dev/null +++ b/base/poco/MongoDB/src/OpMsgMessage.cpp @@ -0,0 +1,412 @@ +// +// OpMsgMessage.cpp +// +// Library: MongoDB +// Package: MongoDB +// Module: OpMsgMessage +// +// Copyright (c) 2022, Applied Informatics Software Engineering GmbH. +// and Contributors. +// +// SPDX-License-Identifier: BSL-1.0 +// + +#include "Poco/MongoDB/OpMsgMessage.h" +#include "Poco/MongoDB/MessageHeader.h" +#include "Poco/MongoDB/Array.h" +#include "Poco/StreamCopier.h" +#include "Poco/Logger.h" + +#define POCO_MONGODB_DUMP false + +namespace Poco { +namespace MongoDB { + +// Query and write +const std::string OpMsgMessage::CMD_INSERT { "insert" }; +const std::string OpMsgMessage::CMD_DELETE { "delete" }; +const std::string OpMsgMessage::CMD_UPDATE { "update" }; +const std::string OpMsgMessage::CMD_FIND { "find" }; +const std::string OpMsgMessage::CMD_FIND_AND_MODIFY { "findAndModify" }; +const std::string OpMsgMessage::CMD_GET_MORE { "getMore" }; + +// Aggregation +const std::string OpMsgMessage::CMD_AGGREGATE { "aggregate" }; +const std::string OpMsgMessage::CMD_COUNT { "count" }; +const std::string OpMsgMessage::CMD_DISTINCT { "distinct" }; +const std::string OpMsgMessage::CMD_MAP_REDUCE { "mapReduce" }; + +// Replication and administration +const std::string OpMsgMessage::CMD_HELLO { "hello" }; +const std::string OpMsgMessage::CMD_REPL_SET_GET_STATUS { "replSetGetStatus" }; +const std::string OpMsgMessage::CMD_REPL_SET_GET_CONFIG { "replSetGetConfig" }; + +const std::string OpMsgMessage::CMD_CREATE { "create" }; +const std::string OpMsgMessage::CMD_CREATE_INDEXES { "createIndexes" }; +const std::string OpMsgMessage::CMD_DROP { "drop" }; +const std::string OpMsgMessage::CMD_DROP_DATABASE { "dropDatabase" }; +const std::string OpMsgMessage::CMD_KILL_CURSORS { "killCursors" }; +const std::string OpMsgMessage::CMD_LIST_DATABASES { "listDatabases" }; +const std::string OpMsgMessage::CMD_LIST_INDEXES { "listIndexes" }; + +// Diagnostic +const std::string OpMsgMessage::CMD_BUILD_INFO { "buildInfo" }; +const std::string OpMsgMessage::CMD_COLL_STATS { "collStats" }; +const std::string OpMsgMessage::CMD_DB_STATS { "dbStats" }; +const std::string OpMsgMessage::CMD_HOST_INFO { "hostInfo" }; + + +static const std::string& commandIdentifier(const std::string& command); + /// Commands have different names for the payload that is sent in a separate section + + +static const std::string keyCursor {"cursor"}; +static const std::string keyFirstBatch {"firstBatch"}; +static const std::string keyNextBatch {"nextBatch"}; + + +OpMsgMessage::OpMsgMessage() : + Message(MessageHeader::OP_MSG) +{ +} + + +OpMsgMessage::OpMsgMessage(const std::string& databaseName, const std::string& collectionName, UInt32 flags) : + Message(MessageHeader::OP_MSG), + _databaseName(databaseName), + _collectionName(collectionName), + _flags(flags) +{ +} + + +OpMsgMessage::~OpMsgMessage() +{ +} + +const std::string& OpMsgMessage::databaseName() const +{ + return _databaseName; +} + + +const std::string& OpMsgMessage::collectionName() const +{ + return _collectionName; +} + + +void OpMsgMessage::setCommandName(const std::string& command) +{ + _commandName = command; + _body.clear(); + + // IMPORTANT: Command name must be first + if (_collectionName.empty()) + { + // Collection is not specified. It is assumed that this particular command does + // not need it. + _body.add(_commandName, Int32(1)); + } + else + { + _body.add(_commandName, _collectionName); + } + _body.add("$db", _databaseName); +} + + +void OpMsgMessage::setCursor(Poco::Int64 cursorID, Poco::Int32 batchSize) +{ + _commandName = OpMsgMessage::CMD_GET_MORE; + _body.clear(); + + // IMPORTANT: Command name must be first + _body.add(_commandName, cursorID); + _body.add("$db", _databaseName); + _body.add("collection", _collectionName); + if (batchSize > 0) + { + _body.add("batchSize", batchSize); + } +} + + +const std::string& OpMsgMessage::commandName() const +{ + return _commandName; +} + + +void OpMsgMessage::setAcknowledgedRequest(bool ack) +{ + const auto& id = commandIdentifier(_commandName); + if (id.empty()) + return; + + _acknowledged = ack; + + auto writeConcern = _body.get("writeConcern", nullptr); + if (writeConcern) + writeConcern->remove("w"); + + if (ack) + { + _flags = _flags & (~MSG_MORE_TO_COME); + } + else + { + _flags = _flags | MSG_MORE_TO_COME; + if (!writeConcern) + _body.addNewDocument("writeConcern").add("w", 0); + else + writeConcern->add("w", 0); + } + +} + + +bool OpMsgMessage::acknowledgedRequest() const +{ + return _acknowledged; +} + + +UInt32 OpMsgMessage::flags() const +{ + return _flags; +} + + +Document& OpMsgMessage::body() +{ + return _body; +} + + +const Document& OpMsgMessage::body() const +{ + return _body; +} + + +Document::Vector& OpMsgMessage::documents() +{ + return _documents; +} + + +const Document::Vector& OpMsgMessage::documents() const +{ + return _documents; +} + + +bool OpMsgMessage::responseOk() const +{ + Poco::Int64 ok {false}; + if (_body.exists("ok")) + { + ok = _body.getInteger("ok"); + } + return (ok != 0); +} + + +void OpMsgMessage::clear() +{ + _flags = MSG_FLAGS_DEFAULT; + _commandName.clear(); + _body.clear(); + _documents.clear(); +} + + +void OpMsgMessage::send(std::ostream& ostr) +{ + BinaryWriter socketWriter(ostr, BinaryWriter::LITTLE_ENDIAN_BYTE_ORDER); + + // Serialise the body + std::stringstream ss; + BinaryWriter writer(ss, BinaryWriter::LITTLE_ENDIAN_BYTE_ORDER); + writer << _flags; + + writer << PAYLOAD_TYPE_0; + _body.write(writer); + + if (!_documents.empty()) + { + // Serialise attached documents + + std::stringstream ssdoc; + BinaryWriter wdoc(ssdoc, BinaryWriter::LITTLE_ENDIAN_BYTE_ORDER); + for (auto& doc: _documents) + { + doc->write(wdoc); + } + wdoc.flush(); + + const std::string& identifier = commandIdentifier(_commandName); + const Poco::Int32 size = static_cast(sizeof(size) + identifier.size() + 1 + ssdoc.tellp()); + writer << PAYLOAD_TYPE_1; + writer << size; + writer.writeCString(identifier.c_str()); + StreamCopier::copyStream(ssdoc, ss); + } + writer.flush(); + +#if POCO_MONGODB_DUMP + const std::string section = ss.str(); + std::string dump; + Logger::formatDump(dump, section.data(), section.length()); + std::cout << dump << std::endl; +#endif + + messageLength(static_cast(ss.tellp())); + + _header.write(socketWriter); + StreamCopier::copyStream(ss, ostr); + + ostr.flush(); +} + + +void OpMsgMessage::read(std::istream& istr) +{ + std::string message; + { + BinaryReader reader(istr, BinaryReader::LITTLE_ENDIAN_BYTE_ORDER); + _header.read(reader); + + poco_assert_dbg(_header.opCode() == _header.OP_MSG); + + const std::streamsize remainingSize {_header.getMessageLength() - _header.MSG_HEADER_SIZE }; + message.reserve(remainingSize); + +#if POCO_MONGODB_DUMP + std::cout + << "Message hdr: " << _header.getMessageLength() << " " << remainingSize << " " + << _header.opCode() << " " << _header.getRequestID() << " " << _header.responseTo() + << std::endl; +#endif + + reader.readRaw(remainingSize, message); + +#if POCO_MONGODB_DUMP + std::string dump; + Logger::formatDump(dump, message.data(), message.length()); + std::cout << dump << std::endl; +#endif + } + // Read complete message and then interpret it. + + std::istringstream msgss(message); + BinaryReader reader(msgss, BinaryReader::LITTLE_ENDIAN_BYTE_ORDER); + + Poco::UInt8 payloadType {0xFF}; + + reader >> _flags; + reader >> payloadType; + poco_assert_dbg(payloadType == PAYLOAD_TYPE_0); + + _body.read(reader); + + // Read next sections from the buffer + while (msgss.good()) + { + // NOTE: Not tested yet with database, because it returns everything in the body. + // Does MongoDB ever return documents as Payload type 1? + reader >> payloadType; + if (!msgss.good()) + { + break; + } + poco_assert_dbg(payloadType == PAYLOAD_TYPE_1); +#if POCO_MONGODB_DUMP + std::cout << "section payload: " << payloadType << std::endl; +#endif + + Poco::Int32 sectionSize {0}; + reader >> sectionSize; + poco_assert_dbg(sectionSize > 0); + +#if POCO_MONGODB_DUMP + std::cout << "section size: " << sectionSize << std::endl; +#endif + std::streamoff offset = sectionSize - sizeof(sectionSize); + std::streampos endOfSection = msgss.tellg() + offset; + + std::string identifier; + reader.readCString(identifier); +#if POCO_MONGODB_DUMP + std::cout << "section identifier: " << identifier << std::endl; +#endif + + // Loop to read documents from this section. + while (msgss.tellg() < endOfSection) + { +#if POCO_MONGODB_DUMP + std::cout << "section doc: " << msgss.tellg() << " " << endOfSection << std::endl; +#endif + Document::Ptr doc = new Document(); + doc->read(reader); + _documents.push_back(doc); + if (msgss.tellg() < 0) + { + break; + } + } + } + + // Extract documents from the cursor batch if they are there. + MongoDB::Array::Ptr batch; + auto curDoc = _body.get(keyCursor, nullptr); + if (curDoc) + { + batch = curDoc->get(keyFirstBatch, nullptr); + if (!batch) + { + batch = curDoc->get(keyNextBatch, nullptr); + } + } + if (batch) + { + for(std::size_t i = 0; i < batch->size(); i++) + { + const auto& d = batch->get(i, nullptr); + if (d) + { + _documents.push_back(d); + } + } + } + +} + +const std::string& commandIdentifier(const std::string& command) +{ + // Names of identifiers for commands that send bulk documents in the request + // The identifier is set in the section type 1. + static std::map identifiers { + { OpMsgMessage::CMD_INSERT, "documents" }, + { OpMsgMessage::CMD_DELETE, "deletes" }, + { OpMsgMessage::CMD_UPDATE, "updates" }, + + // Not sure if create index can send document section + { OpMsgMessage::CMD_CREATE_INDEXES, "indexes" } + }; + + const auto i = identifiers.find(command); + if (i != identifiers.end()) + { + return i->second; + } + + // This likely means that documents are incorrectly set for a command + // that does not send list of documents in section type 1. + static const std::string emptyIdentifier; + return emptyIdentifier; +} + + +} } // namespace Poco::MongoDB diff --git a/base/poco/MongoDB/src/QueryRequest.cpp b/base/poco/MongoDB/src/QueryRequest.cpp index 7044335ba30..6d7d23a8456 100644 --- a/base/poco/MongoDB/src/QueryRequest.cpp +++ b/base/poco/MongoDB/src/QueryRequest.cpp @@ -20,10 +20,10 @@ namespace MongoDB { QueryRequest::QueryRequest(const std::string& collectionName, QueryRequest::Flags flags): - RequestMessage(MessageHeader::OP_QUERY), - _flags(flags), + RequestMessage(MessageHeader::OP_QUERY), + _flags(flags), _fullCollectionName(collectionName), - _numberToSkip(0), + _numberToSkip(0), _numberToReturn(100), _selector(), _returnFieldSelector() diff --git a/base/poco/MongoDB/src/RegularExpression.cpp b/base/poco/MongoDB/src/RegularExpression.cpp index e95e7da82e1..5f7eb6bb51b 100644 --- a/base/poco/MongoDB/src/RegularExpression.cpp +++ b/base/poco/MongoDB/src/RegularExpression.cpp @@ -25,8 +25,8 @@ RegularExpression::RegularExpression() } -RegularExpression::RegularExpression(const std::string& pattern, const std::string& options): - _pattern(pattern), +RegularExpression::RegularExpression(const std::string& pattern, const std::string& options): + _pattern(pattern), _options(options) { } diff --git a/base/poco/MongoDB/src/ReplicaSet.cpp b/base/poco/MongoDB/src/ReplicaSet.cpp index b56fea49311..fce2f2bdada 100644 --- a/base/poco/MongoDB/src/ReplicaSet.cpp +++ b/base/poco/MongoDB/src/ReplicaSet.cpp @@ -21,7 +21,7 @@ namespace Poco { namespace MongoDB { -ReplicaSet::ReplicaSet(const std::vector &addresses): +ReplicaSet::ReplicaSet(const std::vector &addresses): _addresses(addresses) { } @@ -81,8 +81,8 @@ Connection::Ptr ReplicaSet::isMaster(const Net::SocketAddress& address) { conn = 0; } - - return 0; + + return 0; } diff --git a/base/poco/MongoDB/src/RequestMessage.cpp b/base/poco/MongoDB/src/RequestMessage.cpp index 6391d966198..999ed8a6ba1 100644 --- a/base/poco/MongoDB/src/RequestMessage.cpp +++ b/base/poco/MongoDB/src/RequestMessage.cpp @@ -21,7 +21,7 @@ namespace Poco { namespace MongoDB { -RequestMessage::RequestMessage(MessageHeader::OpCode opcode): +RequestMessage::RequestMessage(MessageHeader::OpCode opcode): Message(opcode) { } @@ -35,7 +35,7 @@ RequestMessage::~RequestMessage() void RequestMessage::send(std::ostream& ostr) { std::stringstream ss; - BinaryWriter requestWriter(ss); + BinaryWriter requestWriter(ss, BinaryWriter::LITTLE_ENDIAN_BYTE_ORDER); buildRequest(requestWriter); requestWriter.flush(); diff --git a/base/poco/MongoDB/src/ResponseMessage.cpp b/base/poco/MongoDB/src/ResponseMessage.cpp index 3254ace63e6..e8216767494 100644 --- a/base/poco/MongoDB/src/ResponseMessage.cpp +++ b/base/poco/MongoDB/src/ResponseMessage.cpp @@ -21,10 +21,20 @@ namespace MongoDB { ResponseMessage::ResponseMessage(): - Message(MessageHeader::OP_REPLY), - _responseFlags(0), - _cursorID(0), - _startingFrom(0), + Message(MessageHeader::OP_REPLY), + _responseFlags(0), + _cursorID(0), + _startingFrom(0), + _numberReturned(0) +{ +} + + +ResponseMessage::ResponseMessage(const Int64& cursorID): + Message(MessageHeader::OP_REPLY), + _responseFlags(0), + _cursorID(cursorID), + _startingFrom(0), _numberReturned(0) { } @@ -50,7 +60,7 @@ void ResponseMessage::read(std::istream& istr) clear(); BinaryReader reader(istr, BinaryReader::LITTLE_ENDIAN_BYTE_ORDER); - + _header.read(reader); reader >> _responseFlags; diff --git a/base/poco/MongoDB/src/UpdateRequest.cpp b/base/poco/MongoDB/src/UpdateRequest.cpp index 2af4621ff64..7477fc752d5 100644 --- a/base/poco/MongoDB/src/UpdateRequest.cpp +++ b/base/poco/MongoDB/src/UpdateRequest.cpp @@ -20,7 +20,7 @@ namespace MongoDB { UpdateRequest::UpdateRequest(const std::string& collectionName, UpdateRequest::Flags flags): - RequestMessage(MessageHeader::OP_UPDATE), + RequestMessage(MessageHeader::OP_UPDATE), _flags(flags), _fullCollectionName(collectionName), _selector(), diff --git a/docker/test/integration/runner/compose/docker_compose_mongo.yml b/docker/test/integration/runner/compose/docker_compose_mongo.yml index 9a6eae6ca8c..60361e9e98d 100644 --- a/docker/test/integration/runner/compose/docker_compose_mongo.yml +++ b/docker/test/integration/runner/compose/docker_compose_mongo.yml @@ -1,7 +1,7 @@ version: '2.3' services: mongo1: - image: mongo:5.0 + image: mongo:5.1 restart: always environment: MONGO_INITDB_ROOT_USERNAME: root diff --git a/docker/test/integration/runner/compose/docker_compose_mongo_secure.yml b/docker/test/integration/runner/compose/docker_compose_mongo_secure.yml index 193e5d26568..f5b0ffed130 100644 --- a/docker/test/integration/runner/compose/docker_compose_mongo_secure.yml +++ b/docker/test/integration/runner/compose/docker_compose_mongo_secure.yml @@ -1,7 +1,7 @@ version: '2.3' services: mongo1: - image: mongo:3.6 + image: mongo:3.5 restart: always environment: MONGO_INITDB_ROOT_USERNAME: root diff --git a/src/Dictionaries/MongoDBDictionarySource.cpp b/src/Dictionaries/MongoDBDictionarySource.cpp index b7e342f3c80..46910fa9f6a 100644 --- a/src/Dictionaries/MongoDBDictionarySource.cpp +++ b/src/Dictionaries/MongoDBDictionarySource.cpp @@ -170,7 +170,7 @@ MongoDBDictionarySource::~MongoDBDictionarySource() = default; QueryPipeline MongoDBDictionarySource::loadAll() { - return QueryPipeline(std::make_shared(connection, createCursor(db, collection, sample_block), sample_block, max_block_size)); + return QueryPipeline(std::make_shared(connection, db, collection, Poco::MongoDB::Document{}, sample_block, max_block_size)); } QueryPipeline MongoDBDictionarySource::loadIds(const std::vector & ids) @@ -178,7 +178,7 @@ QueryPipeline MongoDBDictionarySource::loadIds(const std::vector & ids) if (!dict_struct.id) throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "'id' is required for selective loading"); - auto cursor = createCursor(db, collection, sample_block); + Poco::MongoDB::Document query; /** NOTE: While building array, Poco::MongoDB requires passing of different unused element names, along with values. * In general, Poco::MongoDB is quite inefficient and bulky. @@ -188,9 +188,9 @@ QueryPipeline MongoDBDictionarySource::loadIds(const std::vector & ids) for (const UInt64 id : ids) ids_array->add(DB::toString(id), static_cast(id)); - cursor->query().selector().addNewDocument(dict_struct.id->name).add("$in", ids_array); + query.addNewDocument(dict_struct.id->name).add("$in", ids_array); - return QueryPipeline(std::make_shared(connection, std::move(cursor), sample_block, max_block_size)); + return QueryPipeline(std::make_shared(connection, db, collection, query, sample_block, max_block_size)); } @@ -199,8 +199,7 @@ QueryPipeline MongoDBDictionarySource::loadKeys(const Columns & key_columns, con if (!dict_struct.key) throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "'key' is required for selective loading"); - auto cursor = createCursor(db, collection, sample_block); - + Poco::MongoDB::Document query; Poco::MongoDB::Array::Ptr keys_array(new Poco::MongoDB::Array); for (const auto row_idx : requested_rows) @@ -254,9 +253,9 @@ QueryPipeline MongoDBDictionarySource::loadKeys(const Columns & key_columns, con } /// If more than one key we should use $or - cursor->query().selector().add("$or", keys_array); + query.add("$or", keys_array); - return QueryPipeline(std::make_shared(connection, std::move(cursor), sample_block, max_block_size)); + return QueryPipeline(std::make_shared(connection, db, collection, query, sample_block, max_block_size)); } std::string MongoDBDictionarySource::toString() const diff --git a/src/Dictionaries/MongoDBDictionarySource.h b/src/Dictionaries/MongoDBDictionarySource.h index fefcb1bff9f..6d93bc6c090 100644 --- a/src/Dictionaries/MongoDBDictionarySource.h +++ b/src/Dictionaries/MongoDBDictionarySource.h @@ -16,7 +16,6 @@ namespace Util namespace MongoDB { class Connection; - class Cursor; } } diff --git a/src/Processors/Sources/MongoDBSource.cpp b/src/Processors/Sources/MongoDBSource.cpp index 279a842143f..94b9cb7ad64 100644 --- a/src/Processors/Sources/MongoDBSource.cpp +++ b/src/Processors/Sources/MongoDBSource.cpp @@ -3,10 +3,12 @@ #include #include +#include +#include #include #include +#include #include -#include #include #include @@ -365,27 +367,79 @@ namespace } -std::unique_ptr createCursor(const std::string & database, const std::string & collection, const Block & sample_block_to_select) +bool isMongoDBWireProtocolOld(Poco::MongoDB::Connection & connection_) { - auto cursor = std::make_unique(database, collection); + Poco::MongoDB::Database db("config"); + Poco::MongoDB::Document::Ptr doc = db.queryServerHello(connection_); + auto _wireVersion = doc->getInteger("maxWireVersion"); + return _wireVersion < Poco::MongoDB::Database::WireVersion::VER_36; +} + + +MongoDBCursor::MongoDBCursor( + const std::string & database, + const std::string & collection, + const Block & sample_block_to_select, + const Poco::MongoDB::Document & query, + Poco::MongoDB::Connection & connection) + : is_wire_protocol_old(isMongoDBWireProtocolOld(connection)) +{ + Poco::MongoDB::Document projection; /// Looks like selecting _id column is implicit by default. if (!sample_block_to_select.has("_id")) - cursor->query().returnFieldSelector().add("_id", 0); + projection.add("_id", 0); for (const auto & column : sample_block_to_select) - cursor->query().returnFieldSelector().add(column.name, 1); - return cursor; + projection.add(column.name, 1); + + if (is_wire_protocol_old) + { + old_cursor = std::make_unique(database, collection); + old_cursor->query().selector() = query; + old_cursor->query().returnFieldSelector() = projection; + } + else + { + new_cursor = std::make_unique(database, collection); + new_cursor->query().setCommandName(Poco::MongoDB::OpMsgMessage::CMD_FIND); + new_cursor->query().body().addNewDocument("filter") = query; + new_cursor->query().body().addNewDocument("projection") = projection; + } } +Poco::MongoDB::Document::Vector MongoDBCursor::nextDocuments(Poco::MongoDB::Connection & connection) +{ + if (is_wire_protocol_old) + { + auto response = old_cursor->next(connection); + cursorID_ = response.cursorID(); + return std::move(response.documents()); + } + else + { + auto response = new_cursor->next(connection); + cursorID_ = new_cursor->cursorID(); + return std::move(response.documents()); + } +} + +Int64 MongoDBCursor::cursorID() +{ + return cursorID_; +} + + MongoDBSource::MongoDBSource( std::shared_ptr & connection_, - std::unique_ptr cursor_, + const String & database_name_, + const String & collection_name_, + const Poco::MongoDB::Document & query_, const Block & sample_block, UInt64 max_block_size_) : ISource(sample_block.cloneEmpty()) , connection(connection_) - , cursor{std::move(cursor_)} + , cursor(database_name_, collection_name_, sample_block, query_, *connection_) , max_block_size{max_block_size_} { description.init(sample_block); @@ -412,9 +466,9 @@ Chunk MongoDBSource::generate() size_t num_rows = 0; while (num_rows < max_block_size) { - Poco::MongoDB::ResponseMessage & response = cursor->next(*connection); + auto documents = cursor.nextDocuments(*connection); - for (auto & document : response.documents()) + for (auto & document : documents) { if (document->exists("ok") && document->exists("$err") && document->exists("code") && document->getInteger("ok") == 0) @@ -458,7 +512,7 @@ Chunk MongoDBSource::generate() } } - if (response.cursorID() == 0) + if (cursor.cursorID() == 0) { all_read = true; break; diff --git a/src/Processors/Sources/MongoDBSource.h b/src/Processors/Sources/MongoDBSource.h index d4681d2c05f..f816ccfd1c9 100644 --- a/src/Processors/Sources/MongoDBSource.h +++ b/src/Processors/Sources/MongoDBSource.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include @@ -14,7 +15,9 @@ namespace Poco namespace MongoDB { class Connection; + class Document; class Cursor; + class OpMsgCursor; } } @@ -30,7 +33,28 @@ struct MongoDBArrayInfo void authenticate(Poco::MongoDB::Connection & connection, const std::string & database, const std::string & user, const std::string & password); -std::unique_ptr createCursor(const std::string & database, const std::string & collection, const Block & sample_block_to_select); +bool isMongoDBWireProtocolOld(Poco::MongoDB::Connection & connection_); + +class MongoDBCursor +{ +public: + MongoDBCursor( + const std::string & database, + const std::string & collection, + const Block & sample_block_to_select, + const Poco::MongoDB::Document & query, + Poco::MongoDB::Connection & connection); + + Poco::MongoDB::Document::Vector nextDocuments(Poco::MongoDB::Connection & connection); + + Int64 cursorID(); + +private: + const bool is_wire_protocol_old; + std::unique_ptr old_cursor; + std::unique_ptr new_cursor; + Int64 cursorID_ = 0; +}; /// Converts MongoDB Cursor to a stream of Blocks class MongoDBSource final : public ISource @@ -38,7 +62,9 @@ class MongoDBSource final : public ISource public: MongoDBSource( std::shared_ptr & connection_, - std::unique_ptr cursor_, + const String & database_name_, + const String & collection_name_, + const Poco::MongoDB::Document & query_, const Block & sample_block, UInt64 max_block_size_); @@ -50,7 +76,7 @@ private: Chunk generate() override; std::shared_ptr connection; - std::unique_ptr cursor; + MongoDBCursor cursor; const UInt64 max_block_size; ExternalResultDescription description; bool all_read = false; diff --git a/src/Storages/StorageMongoDB.cpp b/src/Storages/StorageMongoDB.cpp index 63b8c2d00a1..2a1d7e80c07 100644 --- a/src/Storages/StorageMongoDB.cpp +++ b/src/Storages/StorageMongoDB.cpp @@ -99,6 +99,7 @@ public: , db_name(db_name_) , metadata_snapshot{metadata_snapshot_} , connection(connection_) + , is_wire_protocol_old(isMongoDBWireProtocolOld(*connection_)) { } @@ -107,7 +108,7 @@ public: void consume(Chunk chunk) override { Poco::MongoDB::Database db(db_name); - Poco::MongoDB::Document::Ptr index = new Poco::MongoDB::Document(); + Poco::MongoDB::Document::Vector documents; auto block = getHeader().cloneWithColumns(chunk.detachColumns()); @@ -118,20 +119,35 @@ public: const auto data_types = block.getDataTypes(); const auto data_names = block.getNames(); - std::vector row(num_cols); + documents.reserve(num_rows); + for (const auto i : collections::range(0, num_rows)) { + Poco::MongoDB::Document::Ptr document = new Poco::MongoDB::Document(); + for (const auto j : collections::range(0, num_cols)) { WriteBufferFromOwnString ostr; data_types[j]->getDefaultSerialization()->serializeText(*columns[j], i, ostr, FormatSettings{}); - row[j] = ostr.str(); - index->add(data_names[j], row[j]); + document->add(data_names[j], ostr.str()); } + + documents.push_back(std::move(document)); + } + + if (is_wire_protocol_old) + { + Poco::SharedPtr insert_request = db.createInsertRequest(collection_name); + insert_request->documents() = std::move(documents); + connection->sendRequest(*insert_request); + } + else + { + Poco::SharedPtr insert_request = db.createOpMsgMessage(collection_name); + insert_request->setCommandName(Poco::MongoDB::OpMsgMessage::CMD_INSERT); + insert_request->documents() = std::move(documents); + connection->sendRequest(*insert_request); } - Poco::SharedPtr insert_request = db.createInsertRequest(collection_name); - insert_request->documents().push_back(index); - connection->sendRequest(*insert_request); } private: @@ -139,6 +155,8 @@ private: String db_name; StorageMetadataPtr metadata_snapshot; std::shared_ptr connection; + + const bool is_wire_protocol_old; }; @@ -162,7 +180,7 @@ Pipe StorageMongoDB::read( sample_block.insert({ column_data.type, column_data.name }); } - return Pipe(std::make_shared(connection, createCursor(database_name, collection_name, sample_block), sample_block, max_block_size)); + return Pipe(std::make_shared(connection, database_name, collection_name, Poco::MongoDB::Document{}, sample_block, max_block_size)); } SinkToStoragePtr StorageMongoDB::write(const ASTPtr & /* query */, const StorageMetadataPtr & metadata_snapshot, ContextPtr /* context */) diff --git a/tests/integration/test_storage_mongodb/test.py b/tests/integration/test_storage_mongodb/test.py index 6ba5520704d..e6e77c64515 100644 --- a/tests/integration/test_storage_mongodb/test.py +++ b/tests/integration/test_storage_mongodb/test.py @@ -71,6 +71,39 @@ def test_simple_select(started_cluster): simple_mongo_table.drop() +def test_simple_select_from_view(started_cluster): + mongo_connection = get_mongo_connection(started_cluster) + db = mongo_connection["test"] + db.add_user("root", "clickhouse") + simple_mongo_table = db["simple_table"] + data = [] + for i in range(0, 100): + data.append({"key": i, "data": hex(i * i)}) + simple_mongo_table.insert_many(data) + simple_mongo_table_view = db.create_collection( + "simple_table_view", viewOn="simple_table" + ) + + node = started_cluster.instances["node"] + node.query( + "CREATE TABLE simple_mongo_table(key UInt64, data String) ENGINE = MongoDB('mongo2:27017', 'test', 'simple_table_view', 'root', 'clickhouse')" + ) + + assert node.query("SELECT COUNT() FROM simple_mongo_table") == "100\n" + assert ( + node.query("SELECT sum(key) FROM simple_mongo_table") + == str(sum(range(0, 100))) + "\n" + ) + + assert ( + node.query("SELECT data from simple_mongo_table where key = 42") + == hex(42 * 42) + "\n" + ) + node.query("DROP TABLE simple_mongo_table") + simple_mongo_table_view.drop() + simple_mongo_table.drop() + + @pytest.mark.parametrize("started_cluster", [False], indirect=["started_cluster"]) def test_arrays(started_cluster): mongo_connection = get_mongo_connection(started_cluster) @@ -411,13 +444,16 @@ def test_simple_insert_select(started_cluster): node.query( "CREATE TABLE simple_mongo_table(key UInt64, data String) ENGINE = MongoDB('mongo1:27017', 'test', 'simple_table', 'root', 'clickhouse')" ) - node.query("INSERT INTO simple_mongo_table SELECT 1, 'kek'") + node.query( + "INSERT INTO simple_mongo_table SELECT number, 'kek' || toString(number) FROM numbers(10)" + ) assert ( - node.query("SELECT data from simple_mongo_table where key = 1").strip() == "kek" + node.query("SELECT data from simple_mongo_table where key = 7").strip() + == "kek7" ) node.query("INSERT INTO simple_mongo_table(key) SELECT 12") - assert int(node.query("SELECT count() from simple_mongo_table")) == 2 + assert int(node.query("SELECT count() from simple_mongo_table")) == 11 assert ( node.query("SELECT data from simple_mongo_table where key = 12").strip() == "" ) From 9ea0575ff8c7ac04d46ec93d012debf01eda55c5 Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Mon, 22 May 2023 11:24:29 +0000 Subject: [PATCH 0103/1072] Update: rest tests which output is differ with enabled analyzer --- .../01655_plan_optimizations.reference | 55 ++++++++++++++++- .../0_stateless/01655_plan_optimizations.sh | 60 ++++++++++++++++--- 2 files changed, 107 insertions(+), 8 deletions(-) diff --git a/tests/queries/0_stateless/01655_plan_optimizations.reference b/tests/queries/0_stateless/01655_plan_optimizations.reference index 48d99647b43..9796d2e4f82 100644 --- a/tests/queries/0_stateless/01655_plan_optimizations.reference +++ b/tests/queries/0_stateless/01655_plan_optimizations.reference @@ -1,5 +1,4 @@ Too many optimizations applied to query plan -Too many optimizations applied to query plan > sipHash should be calculated after filtration FUNCTION sipHash64 Filter column: equals @@ -27,6 +26,11 @@ COLUMN Const(UInt8) -> notEquals(y, 0) Aggregating Filter Filter +> (analyzer) filter should be pushed down after aggregating, column after aggregation is const +COLUMN Const(UInt8) -> notEquals(y_1, 0_UInt8) +Aggregating +Filter +Filter 0 1 1 1 2 1 2 3 1 @@ -42,6 +46,11 @@ Filter column ALIAS notEquals(s, 4) :: 1 -> and(notEquals(y, 0), notEquals(s, 4)) Aggregating Filter column: notEquals(y, 0) +> (analyzer) one condition of filter should be pushed down after aggregating, other condition is aliased +Filter column +ALIAS notEquals(s_0, 4_UInt8) :: 0 -> and(notEquals(y_1, 0_UInt8), notEquals(s_0, 4_UInt8)) +Aggregating +Filter column: notEquals(y_1, 0_UInt8) 0 1 1 2 2 3 @@ -56,6 +65,11 @@ Filter column FUNCTION and(minus(s, 4) :: 1, 1 :: 3) -> and(notEquals(y, 0), minus(s, 4)) UInt8 : 2 Aggregating Filter column: notEquals(y, 0) +> (analyzer) one condition of filter should be pushed down after aggregating, other condition is casted +Filter column +FUNCTION and(minus(s_0, 4_UInt8) :: 0, 1 :: 3) -> and(notEquals(y_1, 0_UInt8), minus(s_0, 4_UInt8)) UInt8 : 2 +Aggregating +Filter column: notEquals(y_1, 0_UInt8) 0 1 1 2 2 3 @@ -70,6 +84,11 @@ Filter column FUNCTION and(minus(s, 8) :: 1, minus(s, 4) :: 2) -> and(notEquals(y, 0), minus(s, 8), minus(s, 4)) Aggregating Filter column: notEquals(y, 0) +> (analyzer) one condition of filter should be pushed down after aggregating, other two conditions are ANDed +Filter column +FUNCTION and(minus(s_0, 8_UInt8) :: 0, minus(s_0, 4_UInt8) :: 2) -> and(notEquals(y_1, 0_UInt8), minus(s_0, 8_UInt8), minus(s_0, 4_UInt8)) +Aggregating +Filter column: notEquals(y_1, 0_UInt8) 0 1 1 2 2 3 @@ -83,6 +102,11 @@ Filter column ALIAS notEquals(s, 8) :: 1 -> and(notEquals(y, 0), notEquals(s, 8), minus(y, 4)) Aggregating Filter column: and(notEquals(y, 0), minus(y, 4)) +> (analyzer) two conditions of filter should be pushed down after aggregating and ANDed, one condition is aliased +Filter column +ALIAS notEquals(s_0, 8_UInt8) :: 0 -> and(notEquals(y_1, 0_UInt8), notEquals(s_0, 8_UInt8), minus(y_1, 4_UInt8)) +Aggregating +Filter column: and(notEquals(y_1, 0_UInt8), minus(y_1, 4_UInt8)) 0 1 1 2 2 3 @@ -95,11 +119,19 @@ Filter column: and(notEquals(y, 0), minus(y, 4)) Filter column: and(notEquals(y, 2), notEquals(x, 0)) ARRAY JOIN x Filter column: notEquals(y, 2) +> (analyzer) filter is split, one part is filtered before ARRAY JOIN +Filter column: and(notEquals(y_1, 2_UInt8), notEquals(x_0, 0_UInt8)) +ARRAY JOIN x_0 +Filter column: notEquals(y_1, 2_UInt8) 1 3 > filter is pushed down before Distinct Distinct Distinct Filter column: notEquals(y, 2) +> (analyzer) filter is pushed down before Distinct +Distinct +Distinct +Filter column: notEquals(y_1, 2_UInt8) 0 0 0 1 1 0 @@ -108,12 +140,20 @@ Filter column: notEquals(y, 2) Sorting Sorting Filter column: and(notEquals(x, 0), notEquals(y, 0)) +> (analyzer) filter is pushed down before sorting steps +Sorting +Sorting +Filter column: and(notEquals(x_0, 0_UInt8), notEquals(y_1, 0_UInt8)) 1 2 1 1 > filter is pushed down before TOTALS HAVING and aggregating TotalsHaving Aggregating Filter column: notEquals(y, 2) +> (analyzer) filter is pushed down before TOTALS HAVING and aggregating +TotalsHaving +Aggregating +Filter column: notEquals(y_0, 2_UInt8) 0 12 1 15 3 10 @@ -129,12 +169,18 @@ Filter Join Filter column: notEquals(number, 1) Join +> (analyzer) one condition of filter is pushed down before LEFT JOIN +Join +Filter column: notEquals(l.number_0, 1_UInt8) 0 0 3 3 > one condition of filter is pushed down before INNER JOIN Join Filter column: notEquals(number, 1) Join +> (analyzer) one condition of filter is pushed down before INNER JOIN +Join +Filter column: notEquals(l.number_0, 1_UInt8) 3 3 > filter is pushed down before UNION Union @@ -149,5 +195,12 @@ FUNCTION sipHash64 Sorting Expression (Before ORDER BY) FUNCTION plus +> (analyzer) function calculation should be done after sorting and limit (if possible) +> Expression should be divided into two subexpressions and only one of them should be moved after Sorting +Expression ((Project names + (Before ORDER BY + (Projection + Change column names to column identifiers)) [lifted up part])) +FUNCTION sipHash64 +Sorting +Expression ((Before ORDER BY + (Projection + Change column names to column identifiers))) +FUNCTION plus > this query should be executed without throwing an exception 0 diff --git a/tests/queries/0_stateless/01655_plan_optimizations.sh b/tests/queries/0_stateless/01655_plan_optimizations.sh index 7c299f9cc26..d68c2c8b414 100755 --- a/tests/queries/0_stateless/01655_plan_optimizations.sh +++ b/tests/queries/0_stateless/01655_plan_optimizations.sh @@ -124,11 +124,17 @@ $CLICKHOUSE_CLIENT -q " settings enable_optimize_predicate_expression=0" echo "> filter is split, one part is filtered before ARRAY JOIN" -$CLICKHOUSE_CLIENT -q " +$CLICKHOUSE_CLIENT --allow_experimental_analyzer=0 -q " explain actions = 1 select x, y from ( select range(number) as x, number + 1 as y from numbers(3) ) array join x where y != 2 and x != 0" | grep -o "Filter column: and(notEquals(y, 2), notEquals(x, 0))\|ARRAY JOIN x\|Filter column: notEquals(y, 2)" +echo "> (analyzer) filter is split, one part is filtered before ARRAY JOIN" +$CLICKHOUSE_CLIENT --allow_experimental_analyzer=1 -q " + explain actions = 1 select x, y from ( + select range(number) as x, number + 1 as y from numbers(3) + ) array join x where y != 2 and x != 0" | + grep -o "Filter column: and(notEquals(y_1, 2_UInt8), notEquals(x_0, 0_UInt8))\|ARRAY JOIN x_0\|Filter column: notEquals(y_1, 2_UInt8)" $CLICKHOUSE_CLIENT -q " select x, y from ( select range(number) as x, number + 1 as y from numbers(3) @@ -148,12 +154,19 @@ $CLICKHOUSE_CLIENT -q " # settings enable_optimize_predicate_expression=0" echo "> filter is pushed down before Distinct" -$CLICKHOUSE_CLIENT -q " +$CLICKHOUSE_CLIENT --allow_experimental_analyzer=0 -q " explain actions = 1 select x, y from ( select distinct x, y from (select number % 2 as x, number % 3 as y from numbers(10)) ) where y != 2 settings enable_optimize_predicate_expression=0" | grep -o "Distinct\|Filter column: notEquals(y, 2)" +echo "> (analyzer) filter is pushed down before Distinct" +$CLICKHOUSE_CLIENT --allow_experimental_analyzer=1 -q " + explain actions = 1 select x, y from ( + select distinct x, y from (select number % 2 as x, number % 3 as y from numbers(10)) + ) where y != 2 + settings enable_optimize_predicate_expression=0" | + grep -o "Distinct\|Filter column: notEquals(y_1, 2_UInt8)" $CLICKHOUSE_CLIENT -q " select x, y from ( select distinct x, y from (select number % 2 as x, number % 3 as y from numbers(10)) @@ -161,12 +174,19 @@ $CLICKHOUSE_CLIENT -q " settings enable_optimize_predicate_expression=0" echo "> filter is pushed down before sorting steps" -$CLICKHOUSE_CLIENT --convert_query_to_cnf=0 -q " +$CLICKHOUSE_CLIENT --allow_experimental_analyzer=0 --convert_query_to_cnf=0 -q " explain actions = 1 select x, y from ( select number % 2 as x, number % 3 as y from numbers(6) order by y desc ) where x != 0 and y != 0 settings enable_optimize_predicate_expression = 0" | grep -o "Sorting\|Filter column: and(notEquals(x, 0), notEquals(y, 0))" +echo "> (analyzer) filter is pushed down before sorting steps" +$CLICKHOUSE_CLIENT --allow_experimental_analyzer=1 --convert_query_to_cnf=0 -q " + explain actions = 1 select x, y from ( + select number % 2 as x, number % 3 as y from numbers(6) order by y desc + ) where x != 0 and y != 0 + settings enable_optimize_predicate_expression = 0" | + grep -o "Sorting\|Filter column: and(notEquals(x_0, 0_UInt8), notEquals(y_1, 0_UInt8))" $CLICKHOUSE_CLIENT -q " select x, y from ( select number % 2 as x, number % 3 as y from numbers(6) order by y desc @@ -174,12 +194,19 @@ $CLICKHOUSE_CLIENT -q " settings enable_optimize_predicate_expression = 0" echo "> filter is pushed down before TOTALS HAVING and aggregating" -$CLICKHOUSE_CLIENT -q " +$CLICKHOUSE_CLIENT --allow_experimental_analyzer=0 -q " explain actions = 1 select * from ( select y, sum(x) from (select number as x, number % 4 as y from numbers(10)) group by y with totals ) where y != 2 settings enable_optimize_predicate_expression=0" | grep -o "TotalsHaving\|Aggregating\|Filter column: notEquals(y, 2)" +echo "> (analyzer) filter is pushed down before TOTALS HAVING and aggregating" +$CLICKHOUSE_CLIENT --allow_experimental_analyzer=1 -q " + explain actions = 1 select * from ( + select y, sum(x) from (select number as x, number % 4 as y from numbers(10)) group by y with totals + ) where y != 2 + settings enable_optimize_predicate_expression=0" | + grep -o "TotalsHaving\|Aggregating\|Filter column: notEquals(y_0, 2_UInt8)" $CLICKHOUSE_CLIENT -q " select * from ( select y, sum(x) from (select number as x, number % 4 as y from numbers(10)) group by y with totals @@ -197,24 +224,38 @@ $CLICKHOUSE_CLIENT -q " ) where number != 2 settings enable_optimize_predicate_expression=0" echo "> one condition of filter is pushed down before LEFT JOIN" -$CLICKHOUSE_CLIENT -q " +$CLICKHOUSE_CLIENT --allow_experimental_analyzer=0 -q " explain actions = 1 select number as a, r.b from numbers(4) as l any left join ( select number + 2 as b from numbers(3) ) as r on a = r.b where a != 1 and b != 2 settings enable_optimize_predicate_expression = 0" | grep -o "Join\|Filter column: notEquals(number, 1)" +echo "> (analyzer) one condition of filter is pushed down before LEFT JOIN" +$CLICKHOUSE_CLIENT --allow_experimental_analyzer=1 -q " + explain actions = 1 + select number as a, r.b from numbers(4) as l any left join ( + select number + 2 as b from numbers(3) + ) as r on a = r.b where a != 1 and b != 2 settings enable_optimize_predicate_expression = 0" | + grep -o "Join\|Filter column: notEquals(l.number_0, 1_UInt8)" $CLICKHOUSE_CLIENT -q " select number as a, r.b from numbers(4) as l any left join ( select number + 2 as b from numbers(3) ) as r on a = r.b where a != 1 and b != 2 settings enable_optimize_predicate_expression = 0" | sort echo "> one condition of filter is pushed down before INNER JOIN" -$CLICKHOUSE_CLIENT -q " +$CLICKHOUSE_CLIENT --allow_experimental_analyzer=0 -q " explain actions = 1 select number as a, r.b from numbers(4) as l any inner join ( select number + 2 as b from numbers(3) ) as r on a = r.b where a != 1 and b != 2 settings enable_optimize_predicate_expression = 0" | grep -o "Join\|Filter column: notEquals(number, 1)" +echo "> (analyzer) one condition of filter is pushed down before INNER JOIN" +$CLICKHOUSE_CLIENT --allow_experimental_analyzer=1 -q " + explain actions = 1 + select number as a, r.b from numbers(4) as l any inner join ( + select number + 2 as b from numbers(3) + ) as r on a = r.b where a != 1 and b != 2 settings enable_optimize_predicate_expression = 0" | + grep -o "Join\|Filter column: notEquals(l.number_0, 1_UInt8)" $CLICKHOUSE_CLIENT -q " select number as a, r.b from numbers(4) as l any inner join ( select number + 2 as b from numbers(3) @@ -233,7 +274,12 @@ $CLICKHOUSE_CLIENT -q " echo "> function calculation should be done after sorting and limit (if possible)" echo "> Expression should be divided into two subexpressions and only one of them should be moved after Sorting" -$CLICKHOUSE_CLIENT -q " +$CLICKHOUSE_CLIENT --allow_experimental_analyzer=0 -q " + explain actions = 1 select number as n, sipHash64(n) from numbers(100) order by number + 1 limit 5" | + sed 's/^ *//g' | grep -o "^ *\(Expression (.*Before ORDER BY.*)\|Sorting\|FUNCTION \w\+\)" +echo "> (analyzer) function calculation should be done after sorting and limit (if possible)" +echo "> Expression should be divided into two subexpressions and only one of them should be moved after Sorting" +$CLICKHOUSE_CLIENT --allow_experimental_analyzer=1 -q " explain actions = 1 select number as n, sipHash64(n) from numbers(100) order by number + 1 limit 5" | sed 's/^ *//g' | grep -o "^ *\(Expression (.*Before ORDER BY.*)\|Sorting\|FUNCTION \w\+\)" echo "> this query should be executed without throwing an exception" From 90f4b1777832a8e6c3a343671be091c727f3e065 Mon Sep 17 00:00:00 2001 From: Nikolay Degterinsky Date: Mon, 22 May 2023 15:45:18 +0000 Subject: [PATCH 0104/1072] Fix build & test --- .../test/integration/runner/compose/docker_compose_mongo.yml | 4 ++-- src/Processors/Sources/MongoDBSource.cpp | 2 +- src/Processors/Sources/MongoDBSource.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docker/test/integration/runner/compose/docker_compose_mongo.yml b/docker/test/integration/runner/compose/docker_compose_mongo.yml index 60361e9e98d..8cdcbc421e8 100644 --- a/docker/test/integration/runner/compose/docker_compose_mongo.yml +++ b/docker/test/integration/runner/compose/docker_compose_mongo.yml @@ -1,7 +1,7 @@ version: '2.3' services: mongo1: - image: mongo:5.1 + image: mongo:6.0 restart: always environment: MONGO_INITDB_ROOT_USERNAME: root @@ -11,7 +11,7 @@ services: command: --profile=2 --verbose mongo2: - image: mongo:5.0 + image: mongo:6.0 restart: always ports: - ${MONGO_NO_CRED_EXTERNAL_PORT:-27017}:${MONGO_NO_CRED_INTERNAL_PORT:-27017} diff --git a/src/Processors/Sources/MongoDBSource.cpp b/src/Processors/Sources/MongoDBSource.cpp index 94b9cb7ad64..74dfa13158c 100644 --- a/src/Processors/Sources/MongoDBSource.cpp +++ b/src/Processors/Sources/MongoDBSource.cpp @@ -424,7 +424,7 @@ Poco::MongoDB::Document::Vector MongoDBCursor::nextDocuments(Poco::MongoDB::Conn } } -Int64 MongoDBCursor::cursorID() +Int64 MongoDBCursor::cursorID() const { return cursorID_; } diff --git a/src/Processors/Sources/MongoDBSource.h b/src/Processors/Sources/MongoDBSource.h index f816ccfd1c9..2bc5481e20b 100644 --- a/src/Processors/Sources/MongoDBSource.h +++ b/src/Processors/Sources/MongoDBSource.h @@ -47,7 +47,7 @@ public: Poco::MongoDB::Document::Vector nextDocuments(Poco::MongoDB::Connection & connection); - Int64 cursorID(); + Int64 cursorID() const; private: const bool is_wire_protocol_old; From b8305503d89783b6700ae2c43f69b96798181b03 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Mon, 22 May 2023 19:07:18 +0200 Subject: [PATCH 0105/1072] more flexible cleanup thread scheduling --- base/base/interpolate.h | 5 + src/Storages/MergeTree/IMergeTreeDataPart.cpp | 10 +- src/Storages/MergeTree/IMergeTreeDataPart.h | 3 + src/Storages/MergeTree/MergeTreeData.cpp | 26 ++- src/Storages/MergeTree/MergeTreeData.h | 6 + src/Storages/MergeTree/MergeTreeSettings.h | 4 +- .../ReplicatedMergeTreeCleanupThread.cpp | 171 +++++++++++++++--- .../ReplicatedMergeTreeCleanupThread.h | 28 ++- .../MergeTree/SimpleMergeSelector.cpp | 8 +- src/Storages/StorageReplicatedMergeTree.cpp | 9 +- src/Storages/StorageReplicatedMergeTree.h | 4 +- tests/config/config.d/merge_tree.xml | 2 + .../test.py | 3 +- .../test_broken_part_during_merge/test.py | 2 +- .../test.py | 6 +- tests/integration/test_drop_replica/test.py | 15 +- tests/integration/test_jbod_balancer/test.py | 1 + tests/integration/test_jbod_ha/test.py | 1 + tests/integration/test_lost_part/test.py | 8 +- tests/integration/test_multiple_disks/test.py | 6 +- .../test_old_parts_finally_removed/test.py | 5 +- .../test_parts_delete_zookeeper/test.py | 2 +- .../integration/test_recovery_replica/test.py | 2 +- tests/integration/test_storage_nats/test.py | 3 +- .../integration/test_storage_rabbitmq/test.py | 6 +- tests/integration/test_system_metrics/test.py | 4 +- tests/integration/test_ttl_replicated/test.py | 3 +- ..._replace_partition_from_table_zookeeper.sh | 8 +- .../00652_replicated_mutations_zookeeper.sh | 6 +- ...ated_minimalistic_part_header_zookeeper.sh | 6 +- ...0953_zookeeper_suetin_deduplication_bug.sh | 2 +- .../00988_parallel_parts_removal.sql | 4 +- ...tem_parts_race_condition_zookeeper_long.sh | 10 +- ...tem_parts_race_condition_drop_zookeeper.sh | 3 +- ...034_move_partition_from_table_zookeeper.sh | 6 +- ...ent_move_partition_from_table_zookeeper.sh | 3 +- ...076_parallel_alter_replicated_zookeeper.sh | 3 +- ...9_parallel_alter_detach_table_zookeeper.sh | 5 +- .../01103_optimize_drop_race_zookeeper.sh | 4 +- .../0_stateless/01158_zookeeper_log_long.sql | 2 +- ...nactive_replica_cleanup_nodes_zookeeper.sh | 6 +- ...e_condition_rename_clear_zookeeper_long.sh | 4 +- .../01509_parallel_quorum_and_merge_long.sh | 3 +- ...nt_ttl_and_normal_merges_zookeeper_long.sh | 3 +- .../0_stateless/02067_lost_part_s3.sql | 12 +- .../02370_lost_part_intersecting_merges.sh | 2 +- .../02396_system_parts_race_condition_rm.sh | 8 +- ...397_system_parts_race_condition_drop_rm.sh | 4 +- .../02432_s3_parallel_parts_cleanup.sql | 4 +- .../02448_clone_replica_lost_part.sql | 6 +- ..._projection_and_mutation_work_together.sql | 6 +- .../02515_cleanup_async_insert_block_ids.sh | 2 +- 52 files changed, 353 insertions(+), 112 deletions(-) diff --git a/base/base/interpolate.h b/base/base/interpolate.h index 1d4fc0b6257..4c27f70c95b 100644 --- a/base/base/interpolate.h +++ b/base/base/interpolate.h @@ -11,3 +11,8 @@ constexpr double interpolateExponential(double min, double max, double ratio) assert(min > 0 && ratio >= 0 && ratio <= 1); return min * std::pow(max / min, ratio); } + +constexpr double interpolateLinear(double min, double max, double ratio) +{ + return std::lerp(min, max, ratio); +} diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index d27b03fff44..3d2b6ecc540 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -211,9 +211,9 @@ void IMergeTreeDataPart::MinMaxIndex::appendFiles(const MergeTreeData & data, St } -static void incrementStateMetric(MergeTreeDataPartState state) +void IMergeTreeDataPart::incrementStateMetric(MergeTreeDataPartState state_) const { - switch (state) + switch (state_) { case MergeTreeDataPartState::Temporary: CurrentMetrics::add(CurrentMetrics::PartsTemporary); @@ -227,6 +227,7 @@ static void incrementStateMetric(MergeTreeDataPartState state) CurrentMetrics::add(CurrentMetrics::PartsCommitted); return; case MergeTreeDataPartState::Outdated: + storage.total_outdated_parts_count.fetch_add(1, std::memory_order_relaxed); CurrentMetrics::add(CurrentMetrics::PartsOutdated); return; case MergeTreeDataPartState::Deleting: @@ -238,9 +239,9 @@ static void incrementStateMetric(MergeTreeDataPartState state) } } -static void decrementStateMetric(MergeTreeDataPartState state) +void IMergeTreeDataPart::decrementStateMetric(MergeTreeDataPartState state_) const { - switch (state) + switch (state_) { case MergeTreeDataPartState::Temporary: CurrentMetrics::sub(CurrentMetrics::PartsTemporary); @@ -254,6 +255,7 @@ static void decrementStateMetric(MergeTreeDataPartState state) CurrentMetrics::sub(CurrentMetrics::PartsCommitted); return; case MergeTreeDataPartState::Outdated: + storage.total_outdated_parts_count.fetch_sub(1, std::memory_order_relaxed); CurrentMetrics::sub(CurrentMetrics::PartsOutdated); return; case MergeTreeDataPartState::Deleting: diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index 388d96314c0..ecc1523b6c0 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -623,6 +623,9 @@ private: /// for this column with default parameters. CompressionCodecPtr detectDefaultCompressionCodec() const; + void incrementStateMetric(MergeTreeDataPartState state) const; + void decrementStateMetric(MergeTreeDataPartState state) const; + mutable MergeTreeDataPartState state{MergeTreeDataPartState::Temporary}; /// This ugly flag is needed for debug assertions only diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index b21f44baeb5..5cfc4c577dc 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -81,6 +81,7 @@ #include #include +#include #include #include @@ -4311,6 +4312,29 @@ size_t MergeTreeData::getActivePartsCount() const } +size_t MergeTreeData::getOutdatedPartsCount() const +{ + return total_outdated_parts_count.load(std::memory_order_relaxed); +} + +size_t MergeTreeData::getNumberOfOutdatedPartsWithExpiredRemovalTime() const +{ + size_t res = 0; + + auto time_now = time(nullptr); + + auto parts_lock = lockParts(); + auto outdated_parts_range = getDataPartsStateRange(DataPartState::Outdated); + for (const auto & part : outdated_parts_range) + { + auto part_remove_time = part->remove_time.load(std::memory_order_relaxed); + if (part_remove_time <= time_now && time_now - part_remove_time >= getSettings()->old_parts_lifetime.totalSeconds() && part.unique()) + ++res; + } + + return res; +} + std::pair MergeTreeData::getMaxPartsCountAndSizeForPartitionWithState(DataPartState state) const { auto lock = lockParts(); @@ -4519,7 +4543,7 @@ void MergeTreeData::delayMutationOrThrowIfNeeded(Poco::Event * until, const Cont size_t allowed_mutations_over_threshold = num_mutations_to_throw - num_mutations_to_delay; double delay_factor = std::min(static_cast(mutations_over_threshold) / allowed_mutations_over_threshold, 1.0); - size_t delay_milliseconds = static_cast(std::lerp(settings->min_delay_to_mutate_ms, settings->max_delay_to_mutate_ms, delay_factor)); + size_t delay_milliseconds = static_cast(interpolateLinear(settings->min_delay_to_mutate_ms, settings->max_delay_to_mutate_ms, delay_factor)); ProfileEvents::increment(ProfileEvents::DelayedMutations); ProfileEvents::increment(ProfileEvents::DelayedMutationsMilliseconds, delay_milliseconds); diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index 5488ce72631..4a71c24e6d3 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -532,6 +532,10 @@ public: size_t getActivePartsCount() const; + size_t getOutdatedPartsCount() const; + + size_t getNumberOfOutdatedPartsWithExpiredRemovalTime() const; + /// Returns a pair with: max number of parts in partition across partitions; sum size of parts inside that partition. /// (if there are multiple partitions with max number of parts, the sum size of parts is returned for arbitrary of them) std::pair getMaxPartsCountAndSizeForPartitionWithState(DataPartState state) const; @@ -1491,6 +1495,8 @@ private: std::atomic total_active_size_rows = 0; std::atomic total_active_size_parts = 0; + mutable std::atomic total_outdated_parts_count = 0; + // Record all query ids which access the table. It's guarded by `query_id_set_mutex` and is always mutable. mutable std::set query_id_set TSA_GUARDED_BY(query_id_set_mutex); mutable std::mutex query_id_set_mutex; diff --git a/src/Storages/MergeTree/MergeTreeSettings.h b/src/Storages/MergeTree/MergeTreeSettings.h index c9e81ce9103..78d703e795c 100644 --- a/src/Storages/MergeTree/MergeTreeSettings.h +++ b/src/Storages/MergeTree/MergeTreeSettings.h @@ -120,8 +120,10 @@ struct Settings; \ /** Check delay of replicas settings. */ \ M(UInt64, min_relative_delay_to_measure, 120, "Calculate relative replica delay only if absolute delay is not less that this value.", 0) \ - M(UInt64, cleanup_delay_period, 30, "Period to clean old queue logs, blocks hashes and parts.", 0) \ + M(UInt64, cleanup_delay_period, 30, "Minimum period to clean old queue logs, blocks hashes and parts.", 0) \ + M(UInt64, max_cleanup_delay_period, 300, "Maximum period to clean old queue logs, blocks hashes and parts.", 0) \ M(UInt64, cleanup_delay_period_random_add, 10, "Add uniformly distributed value from 0 to x seconds to cleanup_delay_period to avoid thundering herd effect and subsequent DoS of ZooKeeper in case of very large number of tables.", 0) \ + M(UInt64, cleanup_thread_preferred_points_per_iteration, 150, "Preferred batch size for background cleanup (points are abstract but 1 point is approximately equivalent to 1 inserted block).", 0) \ M(UInt64, min_relative_delay_to_close, 300, "Minimal delay from other replicas to close, stop serving requests and not return Ok during status check.", 0) \ M(UInt64, min_absolute_delay_to_close, 0, "Minimal absolute delay to close, stop serving requests and not return Ok during status check.", 0) \ M(UInt64, enable_vertical_merge_algorithm, 1, "Enable usage of Vertical merge algorithm.", 0) \ diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeCleanupThread.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeCleanupThread.cpp index 0409cadc1e9..35a860ebb42 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeCleanupThread.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeCleanupThread.cpp @@ -25,19 +25,22 @@ ReplicatedMergeTreeCleanupThread::ReplicatedMergeTreeCleanupThread(StorageReplic : storage(storage_) , log_name(storage.getStorageID().getFullTableName() + " (ReplicatedMergeTreeCleanupThread)") , log(&Poco::Logger::get(log_name)) + , sleep_ms(storage.getSettings()->cleanup_delay_period * 1000) { task = storage.getContext()->getSchedulePool().createTask(log_name, [this]{ run(); }); } void ReplicatedMergeTreeCleanupThread::run() { - auto storage_settings = storage.getSettings(); - const auto sleep_ms = storage_settings->cleanup_delay_period * 1000 - + std::uniform_int_distribution(0, storage_settings->cleanup_delay_period_random_add * 1000)(rng); + SCOPE_EXIT({ is_running.store(false, std::memory_order_relaxed); }); + is_running.store(true, std::memory_order_relaxed); + auto storage_settings = storage.getSettings(); + + Float32 cleanup_points = 0; try { - iterate(); + cleanup_points = iterate(); } catch (const Coordination::Exception & e) { @@ -51,39 +54,144 @@ void ReplicatedMergeTreeCleanupThread::run() tryLogCurrentException(log, __PRETTY_FUNCTION__); } + UInt64 prev_timestamp = prev_cleanup_timestamp_ms.load(std::memory_order_relaxed); + UInt64 now_ms = clock_gettime_ns_adjusted(prev_timestamp * 1'000'000) / 1'000'000; + + /// Do not adjust sleep_ms on the first run after starting the server + if (prev_timestamp && storage_settings->cleanup_thread_preferred_points_per_iteration) + { + /// We don't want to run the task too often when the table was barely changed and there's almost nothing to cleanup. + /// But we cannot simply sleep max_cleanup_delay_period (300s) when nothing was cleaned up and cleanup_delay_period (30s) + /// when we removed something, because inserting one part per 30s will lead to running cleanup each 30s just to remove one part. + /// So we need some interpolation based on preferred batch size. + auto expected_cleanup_points = storage_settings->cleanup_thread_preferred_points_per_iteration; + + /// How long should we sleep to remove cleanup_thread_preferred_points_per_iteration on the next iteration? + Float32 ratio = cleanup_points / expected_cleanup_points; + if (ratio == 0) + sleep_ms = storage_settings->max_cleanup_delay_period * 1000; + else + sleep_ms = static_cast(sleep_ms / ratio); + + if (sleep_ms < storage_settings->cleanup_delay_period * 1000) + sleep_ms = storage_settings->cleanup_delay_period * 1000; + if (storage_settings->max_cleanup_delay_period * 1000 < sleep_ms) + sleep_ms = storage_settings->max_cleanup_delay_period * 1000; + + UInt64 interval_ms = now_ms - prev_timestamp; + LOG_TRACE(log, "Scheduling next cleanup after {}ms (points: {}, interval: {}ms, ratio: {}, points per minute: {})", + sleep_ms, cleanup_points, interval_ms, ratio, cleanup_points / interval_ms * 60'000); + } + prev_cleanup_timestamp_ms.store(now_ms, std::memory_order_relaxed); + + sleep_ms += std::uniform_int_distribution(0, storage_settings->cleanup_delay_period_random_add * 1000)(rng); task->scheduleAfter(sleep_ms); } - -void ReplicatedMergeTreeCleanupThread::iterate() +void ReplicatedMergeTreeCleanupThread::wakeupEarlierIfNeeded() { - storage.clearOldPartsAndRemoveFromZK(); + /// It may happen that the tables was idle for a long time, but then a user started to aggressively insert (or mutate) data. + /// In this case, sleep_ms was set to the highest possible value, the task is not going to wake up soon, + /// but the number of objects to clean up is growing. We need to wakeup the task earlier. + auto storage_settings = storage.getSettings(); + if (!storage_settings->cleanup_thread_preferred_points_per_iteration) + return; + + /// The number of other objects (logs, blocks, etc) is usually correlated with the number of Outdated parts. + /// Do not wake up unless we have too many. + size_t number_of_outdated_objects = storage.getOutdatedPartsCount(); + if (number_of_outdated_objects < storage_settings->cleanup_thread_preferred_points_per_iteration * 2) + return; + + /// A race condition is possible here, but it's okay + if (is_running.load(std::memory_order_relaxed)) + return; + + /// Do not re-check all parts too often (avoid constantly calling getNumberOfOutdatedPartsWithExpiredRemovalTime()) + if (!wakeup_check_timer.compareAndRestart(storage_settings->cleanup_delay_period / 4)) + return; + + UInt64 prev_run_timestamp_ms = prev_cleanup_timestamp_ms.load(std::memory_order_relaxed); + UInt64 now_ms = clock_gettime_ns_adjusted(prev_run_timestamp_ms * 1'000'000) / 1'000'000; + if (!prev_run_timestamp_ms || now_ms <= prev_run_timestamp_ms) + return; + + /// Don't run it more often than cleanup_delay_period + UInt64 seconds_passed = (now_ms - prev_run_timestamp_ms) / 1000; + if (seconds_passed < storage_settings->cleanup_delay_period) + return; + + /// Do not count parts that cannot be removed anyway. Do not wake up unless we have too many. + number_of_outdated_objects = storage.getNumberOfOutdatedPartsWithExpiredRemovalTime(); + if (number_of_outdated_objects < storage_settings->cleanup_thread_preferred_points_per_iteration * 2) + return; + + LOG_TRACE(log, "Waking up cleanup thread because there are {} outdated objects and previous cleanup finished {}s ago", + number_of_outdated_objects, seconds_passed); + + wakeup(); +} + + +Float32 ReplicatedMergeTreeCleanupThread::iterate() +{ + size_t cleaned_logs = 0; + Float32 cleaned_blocks = 0; + size_t cleaned_other = 0; + size_t cleaned_part_like = 0; + size_t cleaned_parts = storage.clearOldPartsAndRemoveFromZK(); + + auto storage_settings = storage.getSettings(); { auto lock = storage.lockForShare(RWLockImpl::NO_QUERY, storage.getSettings()->lock_acquire_timeout_for_background_operations); /// Both use relative_data_path which changes during rename, so we /// do it under share lock - storage.clearOldWriteAheadLogs(); - storage.clearOldTemporaryDirectories(storage.getSettings()->temporary_directories_lifetime.totalSeconds()); + cleaned_other += storage.clearOldWriteAheadLogs(); + cleaned_part_like += storage.clearOldTemporaryDirectories(storage.getSettings()->temporary_directories_lifetime.totalSeconds()); if (storage.getSettings()->merge_tree_enable_clear_old_broken_detached) - storage.clearOldBrokenPartsFromDetachedDirectory(); + cleaned_part_like += storage.clearOldBrokenPartsFromDetachedDirectory(); } /// This is loose condition: no problem if we actually had lost leadership at this moment /// and two replicas will try to do cleanup simultaneously. if (storage.is_leader) { - clearOldLogs(); - auto storage_settings = storage.getSettings(); - clearOldBlocks("blocks", storage_settings->replicated_deduplication_window_seconds, storage_settings->replicated_deduplication_window, cached_block_stats_for_sync_inserts); - clearOldBlocks("async_blocks", storage_settings->replicated_deduplication_window_seconds_for_async_inserts, storage_settings->replicated_deduplication_window_for_async_inserts, cached_block_stats_for_async_inserts); - clearOldMutations(); - storage.clearEmptyParts(); + cleaned_logs = clearOldLogs(); + size_t normal_blocks = clearOldBlocks("blocks", storage_settings->replicated_deduplication_window_seconds, + storage_settings->replicated_deduplication_window, cached_block_stats_for_sync_inserts); + + size_t async_blocks = clearOldBlocks("async_blocks", + storage_settings->replicated_deduplication_window_seconds_for_async_inserts, + storage_settings->replicated_deduplication_window_for_async_inserts, + cached_block_stats_for_async_inserts); + + /// Many async blocks are transformed into one ordinary block + Float32 async_blocks_per_block = static_cast(storage_settings->replicated_deduplication_window) / + (storage_settings->replicated_deduplication_window_for_async_inserts + 1); + cleaned_blocks = (normal_blocks + async_blocks * async_blocks_per_block) / 2; + + cleaned_other += clearOldMutations(); + cleaned_part_like += storage.clearEmptyParts(); } + + /// We need to measure the number of removed objects somehow (for better scheduling), + /// but just summing the number of removed async blocks, logs, and empty parts does not make any sense. + /// So we are trying to (approximately) measure the number of inserted blocks/parts, so we will be able to compare apples to apples. + + /// Each inserted block produces 3 objects that have to be cleaned up: one block, one log entry and one part. + /// A few new parts get merged together producing one log entry and one part. + + /// Other objects (like mutations and WALs) are much more rare than Outdated parts (because mutations usually produce + /// many Outdated parts, and WALs usually contain many parts too). We count then as one part for simplicity. + + constexpr Float32 parts_number_amplification = 1.3f; /// Assuming we merge 4-5 parts each time + Float32 cleaned_inserted_parts = (cleaned_blocks + (cleaned_logs + cleaned_parts) / parts_number_amplification) / 3; + return cleaned_inserted_parts + cleaned_part_like + cleaned_other; } -void ReplicatedMergeTreeCleanupThread::clearOldLogs() +size_t ReplicatedMergeTreeCleanupThread::clearOldLogs() { auto zookeeper = storage.getZooKeeper(); auto storage_settings = storage.getSettings(); @@ -102,7 +210,7 @@ void ReplicatedMergeTreeCleanupThread::clearOldLogs() size_t min_replicated_logs_to_keep = static_cast(storage_settings->min_replicated_logs_to_keep * ratio); if (static_cast(children_count) < min_replicated_logs_to_keep) - return; + return 0; Strings replicas = zookeeper->getChildren(storage.zookeeper_path + "/replicas", &stat); @@ -114,7 +222,7 @@ void ReplicatedMergeTreeCleanupThread::clearOldLogs() Strings entries = zookeeper->getChildren(storage.zookeeper_path + "/log"); if (entries.empty()) - return; + return 0; ::sort(entries.begin(), entries.end()); @@ -227,7 +335,7 @@ void ReplicatedMergeTreeCleanupThread::clearOldLogs() entries.erase(std::lower_bound(entries.begin(), entries.end(), "log-" + padIndex(min_saved_log_pointer)), entries.end()); if (entries.empty()) - return; + return 0; markLostReplicas( host_versions_lost_replicas, @@ -268,6 +376,8 @@ void ReplicatedMergeTreeCleanupThread::clearOldLogs() if (i != 0) LOG_DEBUG(log, "Removed {} old log entries: {} - {}", i, entries[0], entries[i - 1]); + + return i; } @@ -323,7 +433,7 @@ struct ReplicatedMergeTreeCleanupThread::NodeWithStat } }; -void ReplicatedMergeTreeCleanupThread::clearOldBlocks(const String & blocks_dir_name, UInt64 window_seconds, UInt64 window_size, NodeCTimeAndVersionCache & cached_block_stats) +size_t ReplicatedMergeTreeCleanupThread::clearOldBlocks(const String & blocks_dir_name, UInt64 window_seconds, UInt64 window_size, NodeCTimeAndVersionCache & cached_block_stats) { auto zookeeper = storage.getZooKeeper(); @@ -331,7 +441,7 @@ void ReplicatedMergeTreeCleanupThread::clearOldBlocks(const String & blocks_dir_ getBlocksSortedByTime(blocks_dir_name, *zookeeper, timed_blocks, cached_block_stats); if (timed_blocks.empty()) - return; + return 0; /// Use ZooKeeper's first node (last according to time) timestamp as "current" time. Int64 current_time = timed_blocks.front().ctime; @@ -350,7 +460,7 @@ void ReplicatedMergeTreeCleanupThread::clearOldBlocks(const String & blocks_dir_ auto num_nodes_to_delete = timed_blocks.end() - first_outdated_block; if (!num_nodes_to_delete) - return; + return 0; auto last_outdated_block = timed_blocks.end() - 1; LOG_TRACE(log, "Will clear {} old blocks from {} (ctime {}) to {} (ctime {})", num_nodes_to_delete, @@ -388,6 +498,7 @@ void ReplicatedMergeTreeCleanupThread::clearOldBlocks(const String & blocks_dir_ } LOG_TRACE(log, "Cleared {} old blocks from ZooKeeper", num_nodes_to_delete); + return num_nodes_to_delete; } @@ -456,17 +567,17 @@ void ReplicatedMergeTreeCleanupThread::getBlocksSortedByTime(const String & bloc } -void ReplicatedMergeTreeCleanupThread::clearOldMutations() +size_t ReplicatedMergeTreeCleanupThread::clearOldMutations() { auto storage_settings = storage.getSettings(); if (!storage_settings->finished_mutations_to_keep) - return; + return 0; if (storage.queue.countFinishedMutations() <= storage_settings->finished_mutations_to_keep) { /// Not strictly necessary, but helps to avoid unnecessary ZooKeeper requests. /// If even this replica hasn't finished enough mutations yet, then we don't need to clean anything. - return; + return 0; } auto zookeeper = storage.getZooKeeper(); @@ -481,7 +592,7 @@ void ReplicatedMergeTreeCleanupThread::clearOldMutations() // No Need to check return value to delete mutations. zookeeper->tryGet(storage.zookeeper_path + "/replicas/" + replica + "/mutation_pointer", pointer); if (pointer.empty()) - return; /// One replica hasn't done anything yet so we can't delete any mutations. + return 0; /// One replica hasn't done anything yet so we can't delete any mutations. min_pointer = std::min(parse(pointer), min_pointer); } @@ -492,11 +603,11 @@ void ReplicatedMergeTreeCleanupThread::clearOldMutations() entries.erase(std::upper_bound(entries.begin(), entries.end(), padIndex(min_pointer)), entries.end()); /// Do not remove last `storage_settings->finished_mutations_to_keep` entries. if (entries.size() <= storage_settings->finished_mutations_to_keep) - return; + return 0; entries.erase(entries.end() - storage_settings->finished_mutations_to_keep, entries.end()); if (entries.empty()) - return; + return 0; Coordination::Requests ops; size_t batch_start_i = 0; @@ -526,6 +637,8 @@ void ReplicatedMergeTreeCleanupThread::clearOldMutations() ops.clear(); } } + + return entries.size(); } } diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeCleanupThread.h b/src/Storages/MergeTree/ReplicatedMergeTreeCleanupThread.h index 76b9ee4a575..57de7944970 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeCleanupThread.h +++ b/src/Storages/MergeTree/ReplicatedMergeTreeCleanupThread.h @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -31,6 +32,8 @@ public: void stop() { task->deactivate(); } + void wakeupEarlierIfNeeded(); + private: StorageReplicatedMergeTree & storage; String log_name; @@ -38,11 +41,20 @@ private: BackgroundSchedulePool::TaskHolder task; pcg64 rng{randomSeed()}; - void run(); - void iterate(); + UInt64 sleep_ms; - /// Remove old records from ZooKeeper. - void clearOldLogs(); + std::atomic prev_cleanup_timestamp_ms = 0; + std::atomic is_running = false; + + AtomicStopwatch wakeup_check_timer; + + void run(); + + /// Returns a number this is directly proportional to the number of cleaned up blocks + Float32 iterate(); + + /// Remove old records from ZooKeeper. Returns the number of removed logs + size_t clearOldLogs(); /// The replica is marked as "lost" if it is inactive and its log pointer /// is far behind and we are not going to keep logs for it. @@ -52,11 +64,11 @@ private: size_t replicas_count, const zkutil::ZooKeeperPtr & zookeeper); using NodeCTimeAndVersionCache = std::map>; - /// Remove old block hashes from ZooKeeper. This is done by the leader replica. - void clearOldBlocks(const String & blocks_dir_name, UInt64 window_seconds, UInt64 window_size, NodeCTimeAndVersionCache & cached_block_stats); + /// Remove old block hashes from ZooKeeper. This is done by the leader replica. Returns the number of removed blocks + size_t clearOldBlocks(const String & blocks_dir_name, UInt64 window_seconds, UInt64 window_size, NodeCTimeAndVersionCache & cached_block_stats); - /// Remove old mutations that are done from ZooKeeper. This is done by the leader replica. - void clearOldMutations(); + /// Remove old mutations that are done from ZooKeeper. This is done by the leader replica. Returns the number of removed mutations + size_t clearOldMutations(); NodeCTimeAndVersionCache cached_block_stats_for_sync_inserts; NodeCTimeAndVersionCache cached_block_stats_for_async_inserts; diff --git a/src/Storages/MergeTree/SimpleMergeSelector.cpp b/src/Storages/MergeTree/SimpleMergeSelector.cpp index af3373fd175..7e7539f71d5 100644 --- a/src/Storages/MergeTree/SimpleMergeSelector.cpp +++ b/src/Storages/MergeTree/SimpleMergeSelector.cpp @@ -28,7 +28,7 @@ struct Estimator { double difference = std::abs(log2(static_cast(sum_size) / size_prev_at_left)); if (difference < settings.heuristic_to_align_parts_max_absolute_difference_in_powers_of_two) - current_score *= std::lerp(settings.heuristic_to_align_parts_max_score_adjustment, 1, + current_score *= interpolateLinear(settings.heuristic_to_align_parts_max_score_adjustment, 1, difference / settings.heuristic_to_align_parts_max_absolute_difference_in_powers_of_two); } @@ -115,8 +115,8 @@ bool allow( // std::cerr << "size_normalized: " << size_normalized << "\n"; /// Calculate boundaries for age - double min_age_to_lower_base = std::lerp(settings.min_age_to_lower_base_at_min_size, settings.min_age_to_lower_base_at_max_size, size_normalized); - double max_age_to_lower_base = std::lerp(settings.max_age_to_lower_base_at_min_size, settings.max_age_to_lower_base_at_max_size, size_normalized); + double min_age_to_lower_base = interpolateLinear(settings.min_age_to_lower_base_at_min_size, settings.min_age_to_lower_base_at_max_size, size_normalized); + double max_age_to_lower_base = interpolateLinear(settings.max_age_to_lower_base_at_min_size, settings.max_age_to_lower_base_at_max_size, size_normalized); // std::cerr << "min_age_to_lower_base: " << min_age_to_lower_base << "\n"; // std::cerr << "max_age_to_lower_base: " << max_age_to_lower_base << "\n"; @@ -137,7 +137,7 @@ bool allow( // std::cerr << "combined_ratio: " << combined_ratio << "\n"; - double lowered_base = std::lerp(settings.base, 2.0, combined_ratio); + double lowered_base = interpolateLinear(settings.base, 2.0, combined_ratio); // std::cerr << "------- lowered_base: " << lowered_base << "\n"; diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index d9c8f09ccf1..2b948e1fd60 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -3147,6 +3147,8 @@ bool StorageReplicatedMergeTree::processQueueEntry(ReplicatedMergeTreeQueue::Sel bool StorageReplicatedMergeTree::scheduleDataProcessingJob(BackgroundJobsAssignee & assignee) { + cleanup_thread.wakeupEarlierIfNeeded(); + /// If replication queue is stopped exit immediately as we successfully executed the task if (queue.actions_blocker.isCancelled()) return false; @@ -6589,7 +6591,7 @@ bool StorageReplicatedMergeTree::hasLightweightDeletedMask() const return has_lightweight_delete_parts.load(std::memory_order_relaxed); } -void StorageReplicatedMergeTree::clearOldPartsAndRemoveFromZK() +size_t StorageReplicatedMergeTree::clearOldPartsAndRemoveFromZK() { auto table_lock = lockForShare( RWLockImpl::NO_QUERY, getSettings()->lock_acquire_timeout_for_background_operations); @@ -6598,8 +6600,9 @@ void StorageReplicatedMergeTree::clearOldPartsAndRemoveFromZK() /// Now these parts are in Deleting state. If we fail to remove some of them we must roll them back to Outdated state. /// Otherwise they will not be deleted. DataPartsVector parts = grabOldParts(); + size_t total_parts_to_remove = parts.size(); if (parts.empty()) - return; + return total_parts_to_remove; DataPartsVector parts_to_delete_only_from_filesystem; // Only duplicates DataPartsVector parts_to_delete_completely; // All parts except duplicates @@ -6707,6 +6710,8 @@ void StorageReplicatedMergeTree::clearOldPartsAndRemoveFromZK() /// Otherwise nobody will try to remove them again (see grabOldParts). delete_parts_from_fs_and_rollback_in_case_of_error(parts_to_remove_from_filesystem, "old"); } + + return total_parts_to_remove; } diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h index 29b6a4d6817..01b86dd1425 100644 --- a/src/Storages/StorageReplicatedMergeTree.h +++ b/src/Storages/StorageReplicatedMergeTree.h @@ -342,8 +342,8 @@ public: private: std::atomic_bool are_restoring_replica {false}; - /// Delete old parts from disk and from ZooKeeper. - void clearOldPartsAndRemoveFromZK(); + /// Delete old parts from disk and from ZooKeeper. Returns the number of removed parts + size_t clearOldPartsAndRemoveFromZK(); template friend class ReplicatedMergeTreeSinkImpl; diff --git a/tests/config/config.d/merge_tree.xml b/tests/config/config.d/merge_tree.xml index 43bdb6aa07b..5521e5ba515 100644 --- a/tests/config/config.d/merge_tree.xml +++ b/tests/config/config.d/merge_tree.xml @@ -1,5 +1,7 @@ 8 + 60 + 10 diff --git a/tests/integration/test_broken_detached_part_clean_up/test.py b/tests/integration/test_broken_detached_part_clean_up/test.py index 5b18fa34494..9a70ebe0d48 100644 --- a/tests/integration/test_broken_detached_part_clean_up/test.py +++ b/tests/integration/test_broken_detached_part_clean_up/test.py @@ -141,7 +141,8 @@ def test_remove_broken_detached_part_replicated_merge_tree(started_cluster): merge_tree_enable_clear_old_broken_detached=1, merge_tree_clear_old_broken_detached_parts_ttl_timeout_seconds=5, cleanup_delay_period=1, - cleanup_delay_period_random_add=0; + cleanup_delay_period_random_add=0, + cleanup_thread_preferred_points_per_iteration=0; """ ) diff --git a/tests/integration/test_broken_part_during_merge/test.py b/tests/integration/test_broken_part_during_merge/test.py index f4110844466..26962236869 100644 --- a/tests/integration/test_broken_part_during_merge/test.py +++ b/tests/integration/test_broken_part_during_merge/test.py @@ -25,7 +25,7 @@ def test_merge_and_part_corruption(started_cluster): """ CREATE TABLE replicated_mt(date Date, id UInt32, value Int32) ENGINE = ReplicatedMergeTree('/clickhouse/tables/replicated_mt', '{replica}') ORDER BY id - SETTINGS cleanup_delay_period=1, cleanup_delay_period_random_add=1; + SETTINGS cleanup_delay_period=1, cleanup_delay_period_random_add=1, cleanup_thread_preferred_points_per_iteration=0; """.format( replica=node1.name ) diff --git a/tests/integration/test_consistant_parts_after_move_partition/test.py b/tests/integration/test_consistant_parts_after_move_partition/test.py index 63a51472773..91fa884c093 100644 --- a/tests/integration/test_consistant_parts_after_move_partition/test.py +++ b/tests/integration/test_consistant_parts_after_move_partition/test.py @@ -14,11 +14,13 @@ def initialize_database(nodes, shard): CREATE TABLE `{database}`.src (p UInt64, d UInt64) ENGINE = ReplicatedMergeTree('/clickhouse/{database}/tables/test_consistent_shard1{shard}/replicated', '{replica}') ORDER BY d PARTITION BY p - SETTINGS min_replicated_logs_to_keep=3, max_replicated_logs_to_keep=5, cleanup_delay_period=0, cleanup_delay_period_random_add=0; + SETTINGS min_replicated_logs_to_keep=3, max_replicated_logs_to_keep=5, + cleanup_delay_period=0, cleanup_delay_period_random_add=0, cleanup_thread_preferred_points_per_iteration=0; CREATE TABLE `{database}`.dest (p UInt64, d UInt64) ENGINE = ReplicatedMergeTree('/clickhouse/{database}/tables/test_consistent_shard2{shard}/replicated', '{replica}') ORDER BY d PARTITION BY p - SETTINGS min_replicated_logs_to_keep=3, max_replicated_logs_to_keep=5, cleanup_delay_period=0, cleanup_delay_period_random_add=0; + SETTINGS min_replicated_logs_to_keep=3, max_replicated_logs_to_keep=5, + cleanup_delay_period=0, cleanup_delay_period_random_add=0, cleanup_thread_preferred_points_per_iteration=0; """.format( shard=shard, replica=node.name, database=CLICKHOUSE_DATABASE ) diff --git a/tests/integration/test_drop_replica/test.py b/tests/integration/test_drop_replica/test.py index e87edb0a578..0941e664982 100644 --- a/tests/integration/test_drop_replica/test.py +++ b/tests/integration/test_drop_replica/test.py @@ -11,7 +11,8 @@ def fill_nodes(nodes, shard): CREATE DATABASE test; CREATE TABLE test.test_table(date Date, id UInt32) - ENGINE = ReplicatedMergeTree('/clickhouse/tables/test/{shard}/replicated/test_table', '{replica}') ORDER BY id PARTITION BY toYYYYMM(date) SETTINGS min_replicated_logs_to_keep=3, max_replicated_logs_to_keep=5, cleanup_delay_period=0, cleanup_delay_period_random_add=0; + ENGINE = ReplicatedMergeTree('/clickhouse/tables/test/{shard}/replicated/test_table', '{replica}') ORDER BY id PARTITION BY toYYYYMM(date) + SETTINGS min_replicated_logs_to_keep=3, max_replicated_logs_to_keep=5, cleanup_delay_period=0, cleanup_delay_period_random_add=0, cleanup_thread_preferred_points_per_iteration=0; """.format( shard=shard, replica=node.name ) @@ -22,7 +23,8 @@ def fill_nodes(nodes, shard): CREATE DATABASE test1; CREATE TABLE test1.test_table(date Date, id UInt32) - ENGINE = ReplicatedMergeTree('/clickhouse/tables/test1/{shard}/replicated/test_table', '{replica}') ORDER BY id PARTITION BY toYYYYMM(date) SETTINGS min_replicated_logs_to_keep=3, max_replicated_logs_to_keep=5, cleanup_delay_period=0, cleanup_delay_period_random_add=0; + ENGINE = ReplicatedMergeTree('/clickhouse/tables/test1/{shard}/replicated/test_table', '{replica}') ORDER BY id PARTITION BY toYYYYMM(date) + SETTINGS min_replicated_logs_to_keep=3, max_replicated_logs_to_keep=5, cleanup_delay_period=0, cleanup_delay_period_random_add=0, cleanup_thread_preferred_points_per_iteration=0; """.format( shard=shard, replica=node.name ) @@ -33,7 +35,8 @@ def fill_nodes(nodes, shard): CREATE DATABASE test2; CREATE TABLE test2.test_table(date Date, id UInt32) - ENGINE = ReplicatedMergeTree('/clickhouse/tables/test2/{shard}/replicated/test_table', '{replica}') ORDER BY id PARTITION BY toYYYYMM(date) SETTINGS min_replicated_logs_to_keep=3, max_replicated_logs_to_keep=5, cleanup_delay_period=0, cleanup_delay_period_random_add=0; + ENGINE = ReplicatedMergeTree('/clickhouse/tables/test2/{shard}/replicated/test_table', '{replica}') ORDER BY id PARTITION BY toYYYYMM(date) + SETTINGS min_replicated_logs_to_keep=3, max_replicated_logs_to_keep=5, cleanup_delay_period=0, cleanup_delay_period_random_add=0, cleanup_thread_preferred_points_per_iteration=0; """.format( shard=shard, replica=node.name ) @@ -44,7 +47,8 @@ def fill_nodes(nodes, shard): CREATE DATABASE test3; CREATE TABLE test3.test_table(date Date, id UInt32) - ENGINE = ReplicatedMergeTree('/clickhouse/tables/test3/{shard}/replicated/test_table', '{replica}') ORDER BY id PARTITION BY toYYYYMM(date) SETTINGS min_replicated_logs_to_keep=3, max_replicated_logs_to_keep=5, cleanup_delay_period=0, cleanup_delay_period_random_add=0; + ENGINE = ReplicatedMergeTree('/clickhouse/tables/test3/{shard}/replicated/test_table', '{replica}') ORDER BY id PARTITION BY toYYYYMM(date) + SETTINGS min_replicated_logs_to_keep=3, max_replicated_logs_to_keep=5, cleanup_delay_period=0, cleanup_delay_period_random_add=0, cleanup_thread_preferred_points_per_iteration=0; """.format( shard=shard, replica=node.name ) @@ -55,7 +59,8 @@ def fill_nodes(nodes, shard): CREATE DATABASE test4; CREATE TABLE test4.test_table(date Date, id UInt32) - ENGINE = ReplicatedMergeTree('/clickhouse/tables/test4/{shard}/replicated/test_table', '{replica}') ORDER BY id PARTITION BY toYYYYMM(date) SETTINGS min_replicated_logs_to_keep=3, max_replicated_logs_to_keep=5, cleanup_delay_period=0, cleanup_delay_period_random_add=0; + ENGINE = ReplicatedMergeTree('/clickhouse/tables/test4/{shard}/replicated/test_table', '{replica}') ORDER BY id PARTITION BY toYYYYMM(date) + SETTINGS min_replicated_logs_to_keep=3, max_replicated_logs_to_keep=5, cleanup_delay_period=0, cleanup_delay_period_random_add=0, cleanup_thread_preferred_points_per_iteration=0; """.format( shard=shard, replica=node.name ) diff --git a/tests/integration/test_jbod_balancer/test.py b/tests/integration/test_jbod_balancer/test.py index df34a075d5a..4797eec5381 100644 --- a/tests/integration/test_jbod_balancer/test.py +++ b/tests/integration/test_jbod_balancer/test.py @@ -134,6 +134,7 @@ def test_replicated_balanced_merge_fetch(start_cluster): old_parts_lifetime = 1, cleanup_delay_period = 1, cleanup_delay_period_random_add = 2, + cleanup_thread_preferred_points_per_iteration=0, min_bytes_to_rebalance_partition_over_jbod = 1024, max_bytes_to_merge_at_max_space_in_pool = 4096 """.format( diff --git a/tests/integration/test_jbod_ha/test.py b/tests/integration/test_jbod_ha/test.py index 5cbb5989ff3..033d751912a 100644 --- a/tests/integration/test_jbod_ha/test.py +++ b/tests/integration/test_jbod_ha/test.py @@ -58,6 +58,7 @@ def test_jbod_ha(start_cluster): old_parts_lifetime = 1, cleanup_delay_period = 1, cleanup_delay_period_random_add = 2, + cleanup_thread_preferred_points_per_iteration=0, max_bytes_to_merge_at_max_space_in_pool = 4096 """.format( i diff --git a/tests/integration/test_lost_part/test.py b/tests/integration/test_lost_part/test.py index dd4c2105d55..44cd19fd1fb 100644 --- a/tests/integration/test_lost_part/test.py +++ b/tests/integration/test_lost_part/test.py @@ -42,7 +42,7 @@ def test_lost_part_same_replica(start_cluster): for node in [node1, node2]: node.query( f"CREATE TABLE mt0 (id UInt64, date Date) ENGINE ReplicatedMergeTree('/clickhouse/tables/t', '{node.name}') ORDER BY tuple() PARTITION BY date " - "SETTINGS cleanup_delay_period=1, cleanup_delay_period_random_add=1" + "SETTINGS cleanup_delay_period=1, cleanup_delay_period_random_add=1, cleanup_thread_preferred_points_per_iteration=0" ) node1.query("SYSTEM STOP MERGES mt0") @@ -109,7 +109,7 @@ def test_lost_part_other_replica(start_cluster): for node in [node1, node2]: node.query( f"CREATE TABLE mt1 (id UInt64) ENGINE ReplicatedMergeTree('/clickhouse/tables/t1', '{node.name}') ORDER BY tuple() " - "SETTINGS cleanup_delay_period=1, cleanup_delay_period_random_add=1" + "SETTINGS cleanup_delay_period=1, cleanup_delay_period_random_add=1, cleanup_thread_preferred_points_per_iteration=0" ) node1.query("SYSTEM STOP MERGES mt1") @@ -178,7 +178,7 @@ def test_lost_part_mutation(start_cluster): for node in [node1, node2]: node.query( f"CREATE TABLE mt2 (id UInt64) ENGINE ReplicatedMergeTree('/clickhouse/tables/t2', '{node.name}') ORDER BY tuple() " - "SETTINGS cleanup_delay_period=1, cleanup_delay_period_random_add=1" + "SETTINGS cleanup_delay_period=1, cleanup_delay_period_random_add=1, cleanup_thread_preferred_points_per_iteration=0" ) node1.query("SYSTEM STOP MERGES mt2") @@ -241,7 +241,7 @@ def test_lost_last_part(start_cluster): for node in [node1, node2]: node.query( f"CREATE TABLE mt3 (id UInt64, p String) ENGINE ReplicatedMergeTree('/clickhouse/tables/t3', '{node.name}') " - "ORDER BY tuple() PARTITION BY p SETTINGS cleanup_delay_period=1, cleanup_delay_period_random_add=1" + "ORDER BY tuple() PARTITION BY p SETTINGS cleanup_delay_period=1, cleanup_delay_period_random_add=1, cleanup_thread_preferred_points_per_iteration=0" ) node1.query("SYSTEM STOP MERGES mt3") diff --git a/tests/integration/test_multiple_disks/test.py b/tests/integration/test_multiple_disks/test.py index 0e51df017b2..54e7f6dd8ee 100644 --- a/tests/integration/test_multiple_disks/test.py +++ b/tests/integration/test_multiple_disks/test.py @@ -1528,7 +1528,8 @@ def test_simple_replication_and_moves(start_cluster): s1 String ) ENGINE = ReplicatedMergeTree('/clickhouse/replicated_table_for_moves', '{}') ORDER BY tuple() - SETTINGS storage_policy='moving_jbod_with_external', old_parts_lifetime=1, cleanup_delay_period=1, cleanup_delay_period_random_add=2 + SETTINGS storage_policy='moving_jbod_with_external', old_parts_lifetime=1, + cleanup_delay_period=1, cleanup_delay_period_random_add=2, cleanup_thread_preferred_points_per_iteration=0 """.format( i + 1 ) @@ -1609,7 +1610,8 @@ def test_download_appropriate_disk(start_cluster): s1 String ) ENGINE = ReplicatedMergeTree('/clickhouse/replicated_table_for_download', '{}') ORDER BY tuple() - SETTINGS storage_policy='moving_jbod_with_external', old_parts_lifetime=1, cleanup_delay_period=1, cleanup_delay_period_random_add=2 + SETTINGS storage_policy='moving_jbod_with_external', old_parts_lifetime=1, + cleanup_delay_period=1, cleanup_delay_period_random_add=2, cleanup_thread_preferred_points_per_iteration=0 """.format( i + 1 ) diff --git a/tests/integration/test_old_parts_finally_removed/test.py b/tests/integration/test_old_parts_finally_removed/test.py index 5347d433419..cbd701588d5 100644 --- a/tests/integration/test_old_parts_finally_removed/test.py +++ b/tests/integration/test_old_parts_finally_removed/test.py @@ -27,7 +27,8 @@ def started_cluster(): def test_part_finally_removed(started_cluster): node1.query( - "CREATE TABLE drop_outdated_part (Key UInt64) ENGINE = ReplicatedMergeTree('/table/d', '1') ORDER BY tuple() SETTINGS old_parts_lifetime=10, cleanup_delay_period=10, cleanup_delay_period_random_add=1" + "CREATE TABLE drop_outdated_part (Key UInt64) ENGINE = ReplicatedMergeTree('/table/d', '1') ORDER BY tuple() " + "SETTINGS old_parts_lifetime=10, cleanup_delay_period=10, cleanup_delay_period_random_add=1, cleanup_thread_preferred_points_per_iteration=0" ) node1.query("INSERT INTO drop_outdated_part VALUES (1)") @@ -44,7 +45,7 @@ def test_part_finally_removed(started_cluster): ) node1.query( - "ALTER TABLE drop_outdated_part MODIFY SETTING old_parts_lifetime=1, cleanup_delay_period=1, cleanup_delay_period_random_add=1" + "ALTER TABLE drop_outdated_part MODIFY SETTING old_parts_lifetime=1, cleanup_delay_period=1, cleanup_delay_period_random_add=1, cleanup_thread_preferred_points_per_iteration=0" ) for i in range(60): diff --git a/tests/integration/test_parts_delete_zookeeper/test.py b/tests/integration/test_parts_delete_zookeeper/test.py index a78aefa4595..9fd07e7b65d 100644 --- a/tests/integration/test_parts_delete_zookeeper/test.py +++ b/tests/integration/test_parts_delete_zookeeper/test.py @@ -21,7 +21,7 @@ def start_cluster(): CREATE DATABASE test; CREATE TABLE test_table(date Date, id UInt32) ENGINE = ReplicatedMergeTree('/clickhouse/tables/test/replicated', 'node1') - ORDER BY id PARTITION BY toYYYYMM(date) SETTINGS old_parts_lifetime=4, cleanup_delay_period=1; + ORDER BY id PARTITION BY toYYYYMM(date) SETTINGS old_parts_lifetime=4, cleanup_delay_period=1, cleanup_thread_preferred_points_per_iteration=0; """ ) diff --git a/tests/integration/test_recovery_replica/test.py b/tests/integration/test_recovery_replica/test.py index 0a63da4db22..582e018f5d2 100644 --- a/tests/integration/test_recovery_replica/test.py +++ b/tests/integration/test_recovery_replica/test.py @@ -4,7 +4,7 @@ import pytest from helpers.cluster import ClickHouseCluster from helpers.test_tools import assert_eq_with_retry -SETTINGS = "SETTINGS min_replicated_logs_to_keep=3, max_replicated_logs_to_keep=5, cleanup_delay_period=0, cleanup_delay_period_random_add=0" +SETTINGS = "SETTINGS min_replicated_logs_to_keep=3, max_replicated_logs_to_keep=5, cleanup_delay_period=0, cleanup_delay_period_random_add=0, cleanup_thread_preferred_points_per_iteration=0" def fill_nodes(nodes): diff --git a/tests/integration/test_storage_nats/test.py b/tests/integration/test_storage_nats/test.py index 1d7e046864b..4d7e4cf813d 100644 --- a/tests/integration/test_storage_nats/test.py +++ b/tests/integration/test_storage_nats/test.py @@ -931,7 +931,8 @@ def test_nats_overloaded_insert(nats_cluster): CREATE TABLE test.view_overload (key UInt64, value UInt64) ENGINE = MergeTree ORDER BY key - SETTINGS old_parts_lifetime=5, cleanup_delay_period=2, cleanup_delay_period_random_add=3; + SETTINGS old_parts_lifetime=5, cleanup_delay_period=2, cleanup_delay_period_random_add=3, + cleanup_thread_preferred_points_per_iteration=0; CREATE MATERIALIZED VIEW test.consumer_overload TO test.view_overload AS SELECT * FROM test.nats_consume; """ diff --git a/tests/integration/test_storage_rabbitmq/test.py b/tests/integration/test_storage_rabbitmq/test.py index 4e1e28373e3..b4dcf86e0ba 100644 --- a/tests/integration/test_storage_rabbitmq/test.py +++ b/tests/integration/test_storage_rabbitmq/test.py @@ -642,7 +642,8 @@ def test_rabbitmq_sharding_between_queues_publish(rabbitmq_cluster): CREATE TABLE test.view (key UInt64, value UInt64, channel_id String) ENGINE = MergeTree ORDER BY key - SETTINGS old_parts_lifetime=5, cleanup_delay_period=2, cleanup_delay_period_random_add=3; + SETTINGS old_parts_lifetime=5, cleanup_delay_period=2, cleanup_delay_period_random_add=3, + cleanup_thread_preferred_points_per_iteration=0; CREATE MATERIALIZED VIEW test.consumer TO test.view AS SELECT *, _channel_id AS channel_id FROM test.rabbitmq; """ @@ -1116,7 +1117,8 @@ def test_rabbitmq_direct_exchange(rabbitmq_cluster): CREATE TABLE test.destination(key UInt64, value UInt64) ENGINE = MergeTree() ORDER BY key - SETTINGS old_parts_lifetime=5, cleanup_delay_period=2, cleanup_delay_period_random_add=3; + SETTINGS old_parts_lifetime=5, cleanup_delay_period=2, cleanup_delay_period_random_add=3, + cleanup_thread_preferred_points_per_iteration=0; """ ) diff --git a/tests/integration/test_system_metrics/test.py b/tests/integration/test_system_metrics/test.py index 9ebe198a109..338622b824e 100644 --- a/tests/integration/test_system_metrics/test.py +++ b/tests/integration/test_system_metrics/test.py @@ -13,7 +13,9 @@ def fill_nodes(nodes, shard): CREATE DATABASE test; CREATE TABLE test.test_table(date Date, id UInt32) - ENGINE = ReplicatedMergeTree('/clickhouse/tables/test{shard}/replicated', '{replica}') ORDER BY id PARTITION BY toYYYYMM(date) SETTINGS min_replicated_logs_to_keep=3, max_replicated_logs_to_keep=5, cleanup_delay_period=0, cleanup_delay_period_random_add=0; + ENGINE = ReplicatedMergeTree('/clickhouse/tables/test{shard}/replicated', '{replica}') ORDER BY id PARTITION BY toYYYYMM(date) + SETTINGS min_replicated_logs_to_keep=3, max_replicated_logs_to_keep=5, + cleanup_delay_period=0, cleanup_delay_period_random_add=0, cleanup_thread_preferred_points_per_iteration=0; """.format( shard=shard, replica=node.name ) diff --git a/tests/integration/test_ttl_replicated/test.py b/tests/integration/test_ttl_replicated/test.py index a3e7d6e4b8b..4ea4472b812 100644 --- a/tests/integration/test_ttl_replicated/test.py +++ b/tests/integration/test_ttl_replicated/test.py @@ -422,7 +422,8 @@ def test_ttl_empty_parts(started_cluster): ENGINE = ReplicatedMergeTree('/clickhouse/tables/test/test_ttl_empty_parts', '{replica}') ORDER BY id SETTINGS max_bytes_to_merge_at_min_space_in_pool = 1, max_bytes_to_merge_at_max_space_in_pool = 1, - cleanup_delay_period = 1, cleanup_delay_period_random_add = 0, old_parts_lifetime = 1 + cleanup_delay_period = 1, cleanup_delay_period_random_add = 0, + cleanup_thread_preferred_points_per_iteration=0, old_parts_lifetime = 1 """.format( replica=node.name diff --git a/tests/queries/0_stateless/00626_replace_partition_from_table_zookeeper.sh b/tests/queries/0_stateless/00626_replace_partition_from_table_zookeeper.sh index a0a3416e406..399511db701 100755 --- a/tests/queries/0_stateless/00626_replace_partition_from_table_zookeeper.sh +++ b/tests/queries/0_stateless/00626_replace_partition_from_table_zookeeper.sh @@ -36,8 +36,12 @@ $CLICKHOUSE_CLIENT --query="DROP TABLE IF EXISTS dst_r1;" $CLICKHOUSE_CLIENT --query="DROP TABLE IF EXISTS dst_r2;" $CLICKHOUSE_CLIENT --query="CREATE TABLE src (p UInt64, k String, d UInt64) ENGINE = MergeTree PARTITION BY p ORDER BY k;" -$CLICKHOUSE_CLIENT --query="CREATE TABLE dst_r1 (p UInt64, k String, d UInt64) ENGINE = ReplicatedMergeTree('/clickhouse/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/dst_1', '1') PARTITION BY p ORDER BY k SETTINGS old_parts_lifetime=1, cleanup_delay_period=1, cleanup_delay_period_random_add=0;" -$CLICKHOUSE_CLIENT --query="CREATE TABLE dst_r2 (p UInt64, k String, d UInt64) ENGINE = ReplicatedMergeTree('/clickhouse/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/dst_1', '2') PARTITION BY p ORDER BY k SETTINGS old_parts_lifetime=1, cleanup_delay_period=1, cleanup_delay_period_random_add=0;" +$CLICKHOUSE_CLIENT --query="CREATE TABLE dst_r1 (p UInt64, k String, d UInt64) +ENGINE = ReplicatedMergeTree('/clickhouse/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/dst_1', '1') PARTITION BY p ORDER BY k +SETTINGS old_parts_lifetime=1, cleanup_delay_period=1, cleanup_delay_period_random_add=0, cleanup_thread_preferred_points_per_iteration=0;" +$CLICKHOUSE_CLIENT --query="CREATE TABLE dst_r2 (p UInt64, k String, d UInt64) +ENGINE = ReplicatedMergeTree('/clickhouse/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/dst_1', '2') PARTITION BY p ORDER BY k +SETTINGS old_parts_lifetime=1, cleanup_delay_period=1, cleanup_delay_period_random_add=0, cleanup_thread_preferred_points_per_iteration=0;" $CLICKHOUSE_CLIENT --query="INSERT INTO src VALUES (0, '0', 1);" $CLICKHOUSE_CLIENT --query="INSERT INTO src VALUES (1, '0', 1);" diff --git a/tests/queries/0_stateless/00652_replicated_mutations_zookeeper.sh b/tests/queries/0_stateless/00652_replicated_mutations_zookeeper.sh index 1f5bcbdc0d0..d8b1bdec328 100755 --- a/tests/queries/0_stateless/00652_replicated_mutations_zookeeper.sh +++ b/tests/queries/0_stateless/00652_replicated_mutations_zookeeper.sh @@ -56,11 +56,13 @@ ${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS mutations_cleaner_r2 SYNC" ${CLICKHOUSE_CLIENT} --query="CREATE TABLE mutations_cleaner_r1(x UInt32) ENGINE ReplicatedMergeTree('/clickhouse/tables/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/mutations_cleaner', 'r1') ORDER BY x SETTINGS \ finished_mutations_to_keep = 2, cleanup_delay_period = 1, - cleanup_delay_period_random_add = 0" + cleanup_delay_period_random_add = 0, + cleanup_thread_preferred_points_per_iteration=0" ${CLICKHOUSE_CLIENT} --query="CREATE TABLE mutations_cleaner_r2(x UInt32) ENGINE ReplicatedMergeTree('/clickhouse/tables/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/mutations_cleaner', 'r2') ORDER BY x SETTINGS \ finished_mutations_to_keep = 2, cleanup_delay_period = 1, - cleanup_delay_period_random_add = 0" + cleanup_delay_period_random_add = 0, + cleanup_thread_preferred_points_per_iteration=0" # Insert some data ${CLICKHOUSE_CLIENT} --insert_keeper_fault_injection_probability=0 --query="INSERT INTO mutations_cleaner_r1(x) VALUES (1), (2), (3), (4), (5)" diff --git a/tests/queries/0_stateless/00814_replicated_minimalistic_part_header_zookeeper.sh b/tests/queries/0_stateless/00814_replicated_minimalistic_part_header_zookeeper.sh index 5fc3fa460e6..bab2304cec2 100755 --- a/tests/queries/0_stateless/00814_replicated_minimalistic_part_header_zookeeper.sh +++ b/tests/queries/0_stateless/00814_replicated_minimalistic_part_header_zookeeper.sh @@ -20,13 +20,15 @@ CREATE TABLE part_header_r1(x UInt32, y UInt32) SETTINGS use_minimalistic_part_header_in_zookeeper = 0, old_parts_lifetime = 1, cleanup_delay_period = 0, - cleanup_delay_period_random_add = 0; + cleanup_delay_period_random_add = 0, + cleanup_thread_preferred_points_per_iteration=0; CREATE TABLE part_header_r2(x UInt32, y UInt32) ENGINE ReplicatedMergeTree('/clickhouse/tables/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/test_00814/part_header/{shard}', '2{replica}') ORDER BY x SETTINGS use_minimalistic_part_header_in_zookeeper = 1, old_parts_lifetime = 1, cleanup_delay_period = 0, - cleanup_delay_period_random_add = 0; + cleanup_delay_period_random_add = 0, + cleanup_thread_preferred_points_per_iteration=0; SELECT '*** Test fetches ***'; INSERT INTO part_header_r1 VALUES (1, 1); diff --git a/tests/queries/0_stateless/00953_zookeeper_suetin_deduplication_bug.sh b/tests/queries/0_stateless/00953_zookeeper_suetin_deduplication_bug.sh index c713c7c4926..ad0146b9d99 100755 --- a/tests/queries/0_stateless/00953_zookeeper_suetin_deduplication_bug.sh +++ b/tests/queries/0_stateless/00953_zookeeper_suetin_deduplication_bug.sh @@ -22,7 +22,7 @@ CREATE TABLE elog ( ENGINE = ReplicatedMergeTree('/clickhouse/tables/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/elog/{shard}', '{replica}') PARTITION BY date ORDER BY (engine_id) -SETTINGS replicated_deduplication_window = 2, cleanup_delay_period=4, cleanup_delay_period_random_add=0;" +SETTINGS replicated_deduplication_window = 2, cleanup_delay_period=4, cleanup_delay_period_random_add=0, cleanup_thread_preferred_points_per_iteration=0;" $CLICKHOUSE_CLIENT --query="INSERT INTO elog VALUES (toDate('2018-10-01'), 1, 'hello')" $CLICKHOUSE_CLIENT --query="INSERT INTO elog VALUES (toDate('2018-10-01'), 2, 'hello')" diff --git a/tests/queries/0_stateless/00988_parallel_parts_removal.sql b/tests/queries/0_stateless/00988_parallel_parts_removal.sql index bff9bbe6d8d..5bd31ba1baa 100644 --- a/tests/queries/0_stateless/00988_parallel_parts_removal.sql +++ b/tests/queries/0_stateless/00988_parallel_parts_removal.sql @@ -1,6 +1,8 @@ DROP TABLE IF EXISTS mt; -CREATE TABLE mt (x UInt64) ENGINE = MergeTree ORDER BY x SETTINGS max_part_removal_threads = 16, cleanup_delay_period = 1, cleanup_delay_period_random_add = 0, old_parts_lifetime = 1, parts_to_delay_insert = 100000, parts_to_throw_insert = 100000; +CREATE TABLE mt (x UInt64) ENGINE = MergeTree ORDER BY x + SETTINGS max_part_removal_threads = 16, cleanup_delay_period = 1, cleanup_delay_period_random_add = 0, + cleanup_thread_preferred_points_per_iteration=0, old_parts_lifetime = 1, parts_to_delay_insert = 100000, parts_to_throw_insert = 100000; SYSTEM STOP MERGES mt; diff --git a/tests/queries/0_stateless/00992_system_parts_race_condition_zookeeper_long.sh b/tests/queries/0_stateless/00992_system_parts_race_condition_zookeeper_long.sh index 5b1c50262bf..e0b7ab29924 100755 --- a/tests/queries/0_stateless/00992_system_parts_race_condition_zookeeper_long.sh +++ b/tests/queries/0_stateless/00992_system_parts_race_condition_zookeeper_long.sh @@ -13,8 +13,14 @@ $CLICKHOUSE_CLIENT -n -q " DROP TABLE IF EXISTS alter_table0; DROP TABLE IF EXISTS alter_table1; - CREATE TABLE alter_table0 (a UInt8, b Int16, c Float32, d String, e Array(UInt8), f Nullable(UUID), g Tuple(UInt8, UInt16)) ENGINE = ReplicatedMergeTree('/clickhouse/tables/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/alter_table', 'r1') ORDER BY a PARTITION BY b % 10 SETTINGS old_parts_lifetime = 1, cleanup_delay_period = 1, cleanup_delay_period_random_add = 0, replicated_max_mutations_in_one_entry = $(($RANDOM / 50 + 100)); - CREATE TABLE alter_table1 (a UInt8, b Int16, c Float32, d String, e Array(UInt8), f Nullable(UUID), g Tuple(UInt8, UInt16)) ENGINE = ReplicatedMergeTree('/clickhouse/tables/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/alter_table', 'r2') ORDER BY a PARTITION BY b % 10 SETTINGS old_parts_lifetime = 1, cleanup_delay_period = 1, cleanup_delay_period_random_add = 0, replicated_max_mutations_in_one_entry = $(($RANDOM / 50 + 200)); + CREATE TABLE alter_table0 (a UInt8, b Int16, c Float32, d String, e Array(UInt8), f Nullable(UUID), g Tuple(UInt8, UInt16)) + ENGINE = ReplicatedMergeTree('/clickhouse/tables/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/alter_table', 'r1') ORDER BY a PARTITION BY b % 10 + SETTINGS old_parts_lifetime = 1, cleanup_delay_period = 1, cleanup_delay_period_random_add = 0, + cleanup_thread_preferred_points_per_iteration=0, replicated_max_mutations_in_one_entry = $(($RANDOM / 50 + 100)); + CREATE TABLE alter_table1 (a UInt8, b Int16, c Float32, d String, e Array(UInt8), f Nullable(UUID), g Tuple(UInt8, UInt16)) + ENGINE = ReplicatedMergeTree('/clickhouse/tables/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/alter_table', 'r2') ORDER BY a PARTITION BY b % 10 + SETTINGS old_parts_lifetime = 1, cleanup_delay_period = 1, cleanup_delay_period_random_add = 0, + cleanup_thread_preferred_points_per_iteration=0, replicated_max_mutations_in_one_entry = $(($RANDOM / 50 + 200)); " function thread1() diff --git a/tests/queries/0_stateless/00993_system_parts_race_condition_drop_zookeeper.sh b/tests/queries/0_stateless/00993_system_parts_race_condition_drop_zookeeper.sh index f4f38ad9c83..811681794a5 100755 --- a/tests/queries/0_stateless/00993_system_parts_race_condition_drop_zookeeper.sh +++ b/tests/queries/0_stateless/00993_system_parts_race_condition_drop_zookeeper.sh @@ -58,7 +58,8 @@ function thread6() $CLICKHOUSE_CLIENT -n -q "DROP TABLE IF EXISTS alter_table_$REPLICA; CREATE TABLE alter_table_$REPLICA (a UInt8, b Int16, c Float32, d String, e Array(UInt8), f Nullable(UUID), g Tuple(UInt8, UInt16)) ENGINE = ReplicatedMergeTree('/clickhouse/tables/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/alter_table', 'r_$REPLICA') ORDER BY a PARTITION BY b % 10 - SETTINGS old_parts_lifetime = 1, cleanup_delay_period = 0, cleanup_delay_period_random_add = 0, replicated_max_mutations_in_one_entry = $(($RANDOM / 50));"; + SETTINGS old_parts_lifetime = 1, cleanup_delay_period = 0, cleanup_delay_period_random_add = 0, + cleanup_thread_preferred_points_per_iteration=0, replicated_max_mutations_in_one_entry = $(($RANDOM / 50));"; sleep 0.$RANDOM; done } diff --git a/tests/queries/0_stateless/01034_move_partition_from_table_zookeeper.sh b/tests/queries/0_stateless/01034_move_partition_from_table_zookeeper.sh index 5e9e69d999d..e0a84323dbd 100755 --- a/tests/queries/0_stateless/01034_move_partition_from_table_zookeeper.sh +++ b/tests/queries/0_stateless/01034_move_partition_from_table_zookeeper.sh @@ -28,7 +28,8 @@ $CLICKHOUSE_CLIENT --query="DROP TABLE IF EXISTS src;" $CLICKHOUSE_CLIENT --query="DROP TABLE IF EXISTS dst;" $CLICKHOUSE_CLIENT --query="CREATE TABLE src (p UInt64, k String, d UInt64) ENGINE = ReplicatedMergeTree('/clickhouse/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/src1', '1') PARTITION BY p ORDER BY k;" -$CLICKHOUSE_CLIENT --query="CREATE TABLE dst (p UInt64, k String, d UInt64) ENGINE = ReplicatedMergeTree('/clickhouse/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/dst1', '1') PARTITION BY p ORDER BY k SETTINGS old_parts_lifetime=1, cleanup_delay_period=1, cleanup_delay_period_random_add=0;" +$CLICKHOUSE_CLIENT --query="CREATE TABLE dst (p UInt64, k String, d UInt64) ENGINE = ReplicatedMergeTree('/clickhouse/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/dst1', '1') PARTITION BY p ORDER BY k +SETTINGS old_parts_lifetime=1, cleanup_delay_period=1, cleanup_delay_period_random_add=0, cleanup_thread_preferred_points_per_iteration=0;" $CLICKHOUSE_CLIENT --query="INSERT INTO src VALUES (0, '0', 1);" $CLICKHOUSE_CLIENT --query="INSERT INTO src VALUES (1, '0', 1);" @@ -58,7 +59,8 @@ $CLICKHOUSE_CLIENT --query="DROP TABLE dst;" $CLICKHOUSE_CLIENT --query="SELECT 'MOVE incompatible schema missing column';" $CLICKHOUSE_CLIENT --query="CREATE TABLE src (p UInt64, k String, d UInt64) ENGINE = ReplicatedMergeTree('/clickhouse/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/src2', '1') PARTITION BY p ORDER BY (d, p);" -$CLICKHOUSE_CLIENT --query="CREATE TABLE dst (p UInt64, d UInt64) ENGINE = ReplicatedMergeTree('/clickhouse/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/dst2', '1') PARTITION BY p ORDER BY (d, p) SETTINGS old_parts_lifetime=1, cleanup_delay_period=1, cleanup_delay_period_random_add=0;" +$CLICKHOUSE_CLIENT --query="CREATE TABLE dst (p UInt64, d UInt64) ENGINE = ReplicatedMergeTree('/clickhouse/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/dst2', '1') PARTITION BY p ORDER BY (d, p) +SETTINGS old_parts_lifetime=1, cleanup_delay_period=1, cleanup_delay_period_random_add=0, cleanup_thread_preferred_points_per_iteration=0;" $CLICKHOUSE_CLIENT --query="INSERT INTO src VALUES (0, '0', 1);" $CLICKHOUSE_CLIENT --query="INSERT INTO src VALUES (1, '0', 1);" diff --git a/tests/queries/0_stateless/01035_concurrent_move_partition_from_table_zookeeper.sh b/tests/queries/0_stateless/01035_concurrent_move_partition_from_table_zookeeper.sh index 8ef03be02b6..06a460f3600 100755 --- a/tests/queries/0_stateless/01035_concurrent_move_partition_from_table_zookeeper.sh +++ b/tests/queries/0_stateless/01035_concurrent_move_partition_from_table_zookeeper.sh @@ -11,7 +11,8 @@ $CLICKHOUSE_CLIENT --query="DROP TABLE IF EXISTS src;" $CLICKHOUSE_CLIENT --query="DROP TABLE IF EXISTS dst;" $CLICKHOUSE_CLIENT --query="CREATE TABLE src (p UInt64, k String) ENGINE = ReplicatedMergeTree('/clickhouse/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/src', '1') PARTITION BY p ORDER BY k;" -$CLICKHOUSE_CLIENT --query="CREATE TABLE dst (p UInt64, k String) ENGINE = ReplicatedMergeTree('/clickhouse/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/dst', '1') PARTITION BY p ORDER BY k SETTINGS old_parts_lifetime=1, cleanup_delay_period=1, cleanup_delay_period_random_add=0;" +$CLICKHOUSE_CLIENT --query="CREATE TABLE dst (p UInt64, k String) ENGINE = ReplicatedMergeTree('/clickhouse/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/dst', '1') PARTITION BY p ORDER BY k +SETTINGS old_parts_lifetime=1, cleanup_delay_period=1, cleanup_delay_period_random_add=0, cleanup_thread_preferred_points_per_iteration=0;" function thread1() { diff --git a/tests/queries/0_stateless/01076_parallel_alter_replicated_zookeeper.sh b/tests/queries/0_stateless/01076_parallel_alter_replicated_zookeeper.sh index 7f53bf2a627..5f69427c0cd 100755 --- a/tests/queries/0_stateless/01076_parallel_alter_replicated_zookeeper.sh +++ b/tests/queries/0_stateless/01076_parallel_alter_replicated_zookeeper.sh @@ -31,7 +31,8 @@ for i in $(seq $REPLICAS); do max_replicated_merges_in_queue = 1000, temporary_directories_lifetime = 10, cleanup_delay_period = 3, - cleanup_delay_period_random_add = 0" + cleanup_delay_period_random_add = 0, + cleanup_thread_preferred_points_per_iteration=0" done $CLICKHOUSE_CLIENT --query "INSERT INTO concurrent_mutate_mt_1 SELECT number, number + 10, toString(number) from numbers(10)" diff --git a/tests/queries/0_stateless/01079_parallel_alter_detach_table_zookeeper.sh b/tests/queries/0_stateless/01079_parallel_alter_detach_table_zookeeper.sh index aec27792603..e508b77a0c2 100755 --- a/tests/queries/0_stateless/01079_parallel_alter_detach_table_zookeeper.sh +++ b/tests/queries/0_stateless/01079_parallel_alter_detach_table_zookeeper.sh @@ -12,7 +12,10 @@ for i in $(seq $REPLICAS); do done for i in $(seq $REPLICAS); do - $CLICKHOUSE_CLIENT --query "CREATE TABLE concurrent_alter_detach_$i (key UInt64, value1 UInt8, value2 UInt8) ENGINE = ReplicatedMergeTree('/clickhouse/tables/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/concurrent_alter_detach', '$i') ORDER BY key SETTINGS max_replicated_mutations_in_queue=1000, number_of_free_entries_in_pool_to_execute_mutation=0,max_replicated_merges_in_queue=1000,temporary_directories_lifetime=10,cleanup_delay_period=3,cleanup_delay_period_random_add=0" + $CLICKHOUSE_CLIENT --query "CREATE TABLE concurrent_alter_detach_$i (key UInt64, value1 UInt8, value2 UInt8) + ENGINE = ReplicatedMergeTree('/clickhouse/tables/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/concurrent_alter_detach', '$i') ORDER BY key + SETTINGS max_replicated_mutations_in_queue=1000, number_of_free_entries_in_pool_to_execute_mutation=0,max_replicated_merges_in_queue=1000, + temporary_directories_lifetime=10,cleanup_delay_period=3,cleanup_delay_period_random_add=0,cleanup_thread_preferred_points_per_iteration=0" done $CLICKHOUSE_CLIENT --query "INSERT INTO concurrent_alter_detach_1 SELECT number, number + 10, number from numbers(10)" diff --git a/tests/queries/0_stateless/01103_optimize_drop_race_zookeeper.sh b/tests/queries/0_stateless/01103_optimize_drop_race_zookeeper.sh index 95f8dfc0377..3461283b5ea 100755 --- a/tests/queries/0_stateless/01103_optimize_drop_race_zookeeper.sh +++ b/tests/queries/0_stateless/01103_optimize_drop_race_zookeeper.sh @@ -27,7 +27,9 @@ function thread3() { while true; do $CLICKHOUSE_CLIENT -n -q "DROP TABLE IF EXISTS concurrent_optimize_table; - CREATE TABLE concurrent_optimize_table (a UInt8, b Int16, c Float32, d String, e Array(UInt8), f Nullable(UUID), g Tuple(UInt8, UInt16)) ENGINE = ReplicatedMergeTree('/clickhouse/tables/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/concurrent_optimize_table', '1') ORDER BY a PARTITION BY b % 10 SETTINGS old_parts_lifetime = 1, cleanup_delay_period = 0, cleanup_delay_period_random_add = 0;"; + CREATE TABLE concurrent_optimize_table (a UInt8, b Int16, c Float32, d String, e Array(UInt8), f Nullable(UUID), g Tuple(UInt8, UInt16)) + ENGINE = ReplicatedMergeTree('/clickhouse/tables/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/concurrent_optimize_table', '1') ORDER BY a PARTITION BY b % 10 + SETTINGS old_parts_lifetime = 1, cleanup_delay_period = 0, cleanup_delay_period_random_add = 0, cleanup_thread_preferred_points_per_iteration=0;"; sleep 0.$RANDOM; sleep 0.$RANDOM; sleep 0.$RANDOM; diff --git a/tests/queries/0_stateless/01158_zookeeper_log_long.sql b/tests/queries/0_stateless/01158_zookeeper_log_long.sql index 45771494af6..9b5ae7ad7c6 100644 --- a/tests/queries/0_stateless/01158_zookeeper_log_long.sql +++ b/tests/queries/0_stateless/01158_zookeeper_log_long.sql @@ -6,7 +6,7 @@ SET insert_keeper_fault_injection_probability=0; -- disable fault injection; par drop table if exists rmt sync; -- cleanup code will perform extra Exists -- (so the .reference will not match) -create table rmt (n int) engine=ReplicatedMergeTree('/test/01158/{database}/rmt', '1') order by n settings cleanup_delay_period=86400, replicated_can_become_leader=0; +create table rmt (n int) engine=ReplicatedMergeTree('/test/01158/{database}/rmt', '1') order by n settings cleanup_delay_period=86400, max_cleanup_delay_period=86400, replicated_can_become_leader=0; system sync replica rmt; insert into rmt values (1); insert into rmt values (1); diff --git a/tests/queries/0_stateless/01396_inactive_replica_cleanup_nodes_zookeeper.sh b/tests/queries/0_stateless/01396_inactive_replica_cleanup_nodes_zookeeper.sh index 411705e0469..2d761df998e 100755 --- a/tests/queries/0_stateless/01396_inactive_replica_cleanup_nodes_zookeeper.sh +++ b/tests/queries/0_stateless/01396_inactive_replica_cleanup_nodes_zookeeper.sh @@ -13,8 +13,10 @@ SCALE=5000 $CLICKHOUSE_CLIENT -n --query " DROP TABLE IF EXISTS r1; DROP TABLE IF EXISTS r2; - CREATE TABLE r1 (x UInt64) ENGINE = ReplicatedMergeTree('/clickhouse/tables/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/{shard}', '1{replica}') ORDER BY x SETTINGS old_parts_lifetime = 1, cleanup_delay_period = 0, cleanup_delay_period_random_add = 1, parts_to_throw_insert = 100000, max_replicated_logs_to_keep = 10; - CREATE TABLE r2 (x UInt64) ENGINE = ReplicatedMergeTree('/clickhouse/tables/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/{shard}', '2{replica}') ORDER BY x SETTINGS old_parts_lifetime = 1, cleanup_delay_period = 0, cleanup_delay_period_random_add = 1, parts_to_throw_insert = 100000, max_replicated_logs_to_keep = 10; + CREATE TABLE r1 (x UInt64) ENGINE = ReplicatedMergeTree('/clickhouse/tables/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/{shard}', '1{replica}') ORDER BY x + SETTINGS old_parts_lifetime = 1, cleanup_delay_period = 0, cleanup_delay_period_random_add = 1, cleanup_thread_preferred_points_per_iteration=0, parts_to_throw_insert = 100000, max_replicated_logs_to_keep = 10; + CREATE TABLE r2 (x UInt64) ENGINE = ReplicatedMergeTree('/clickhouse/tables/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/{shard}', '2{replica}') ORDER BY x + SETTINGS old_parts_lifetime = 1, cleanup_delay_period = 0, cleanup_delay_period_random_add = 1, cleanup_thread_preferred_points_per_iteration=0, parts_to_throw_insert = 100000, max_replicated_logs_to_keep = 10; DETACH TABLE r2; " diff --git a/tests/queries/0_stateless/01508_race_condition_rename_clear_zookeeper_long.sh b/tests/queries/0_stateless/01508_race_condition_rename_clear_zookeeper_long.sh index 80318ba67fb..c3c87eeaf8b 100755 --- a/tests/queries/0_stateless/01508_race_condition_rename_clear_zookeeper_long.sh +++ b/tests/queries/0_stateless/01508_race_condition_rename_clear_zookeeper_long.sh @@ -8,7 +8,9 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) $CLICKHOUSE_CLIENT --query "DROP TABLE IF EXISTS table_for_renames0" $CLICKHOUSE_CLIENT --query "DROP TABLE IF EXISTS table_for_renames50" -$CLICKHOUSE_CLIENT --query "CREATE TABLE table_for_renames0 (value UInt64, data String) ENGINE ReplicatedMergeTree('/clickhouse/tables/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/concurrent_rename', '1') ORDER BY tuple() SETTINGS cleanup_delay_period = 1, cleanup_delay_period_random_add = 0" +$CLICKHOUSE_CLIENT --query "CREATE TABLE table_for_renames0 (value UInt64, data String) +ENGINE ReplicatedMergeTree('/clickhouse/tables/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/concurrent_rename', '1') ORDER BY tuple() +SETTINGS cleanup_delay_period = 1, cleanup_delay_period_random_add = 0, cleanup_thread_preferred_points_per_iteration=0" $CLICKHOUSE_CLIENT --query "INSERT INTO table_for_renames0 SELECT number, toString(number) FROM numbers(1000)" diff --git a/tests/queries/0_stateless/01509_parallel_quorum_and_merge_long.sh b/tests/queries/0_stateless/01509_parallel_quorum_and_merge_long.sh index 445706e35bf..bf88ad0e0b2 100755 --- a/tests/queries/0_stateless/01509_parallel_quorum_and_merge_long.sh +++ b/tests/queries/0_stateless/01509_parallel_quorum_and_merge_long.sh @@ -13,7 +13,8 @@ $CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS parallel_q1 SYNC" $CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS parallel_q2 SYNC" -$CLICKHOUSE_CLIENT -q "CREATE TABLE parallel_q1 (x UInt64) ENGINE=ReplicatedMergeTree('/clickhouse/tables/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/parallel_q', 'r1') ORDER BY tuple() SETTINGS old_parts_lifetime = 1, cleanup_delay_period = 0, cleanup_delay_period_random_add = 0" +$CLICKHOUSE_CLIENT -q "CREATE TABLE parallel_q1 (x UInt64) ENGINE=ReplicatedMergeTree('/clickhouse/tables/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/parallel_q', 'r1') ORDER BY tuple() +SETTINGS old_parts_lifetime = 1, cleanup_delay_period = 0, cleanup_delay_period_random_add = 0, cleanup_thread_preferred_points_per_iteration=0" $CLICKHOUSE_CLIENT -q "CREATE TABLE parallel_q2 (x UInt64) ENGINE=ReplicatedMergeTree('/clickhouse/tables/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/parallel_q', 'r2') ORDER BY tuple() SETTINGS always_fetch_merged_part = 1" diff --git a/tests/queries/0_stateless/01921_concurrent_ttl_and_normal_merges_zookeeper_long.sh b/tests/queries/0_stateless/01921_concurrent_ttl_and_normal_merges_zookeeper_long.sh index a3682a3a74b..5e1600a0673 100755 --- a/tests/queries/0_stateless/01921_concurrent_ttl_and_normal_merges_zookeeper_long.sh +++ b/tests/queries/0_stateless/01921_concurrent_ttl_and_normal_merges_zookeeper_long.sh @@ -24,7 +24,8 @@ for i in $(seq 1 $NUM_REPLICAS); do ENGINE ReplicatedMergeTree('/test/01921_concurrent_ttl_and_normal_merges/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/ttl_table', '$i') ORDER BY tuple() TTL key + INTERVAL 1 SECOND - SETTINGS merge_with_ttl_timeout=1, max_replicated_merges_with_ttl_in_queue=100, max_number_of_merges_with_ttl_in_pool=100, cleanup_delay_period=1, cleanup_delay_period_random_add=0;" + SETTINGS merge_with_ttl_timeout=1, max_replicated_merges_with_ttl_in_queue=100, max_number_of_merges_with_ttl_in_pool=100, + cleanup_delay_period=1, cleanup_delay_period_random_add=0, cleanup_thread_preferred_points_per_iteration=0;" done function optimize_thread diff --git a/tests/queries/0_stateless/02067_lost_part_s3.sql b/tests/queries/0_stateless/02067_lost_part_s3.sql index 12afdcd4421..7df15ab33c4 100644 --- a/tests/queries/0_stateless/02067_lost_part_s3.sql +++ b/tests/queries/0_stateless/02067_lost_part_s3.sql @@ -4,11 +4,17 @@ DROP TABLE IF EXISTS partslost_0; DROP TABLE IF EXISTS partslost_1; DROP TABLE IF EXISTS partslost_2; -CREATE TABLE partslost_0 (x String) ENGINE=ReplicatedMergeTree('/clickhouse/table/{database}_02067_lost/partslost', '0') ORDER BY tuple() SETTINGS min_rows_for_wide_part = 0, min_bytes_for_wide_part = 0, old_parts_lifetime = 1, cleanup_delay_period = 1, cleanup_delay_period_random_add = 1; +CREATE TABLE partslost_0 (x String) ENGINE=ReplicatedMergeTree('/clickhouse/table/{database}_02067_lost/partslost', '0') ORDER BY tuple() + SETTINGS min_rows_for_wide_part = 0, min_bytes_for_wide_part = 0, old_parts_lifetime = 1, + cleanup_delay_period = 1, cleanup_delay_period_random_add = 1, cleanup_thread_preferred_points_per_iteration=0; -CREATE TABLE partslost_1 (x String) ENGINE=ReplicatedMergeTree('/clickhouse/table/{database}_02067_lost/partslost', '1') ORDER BY tuple() SETTINGS min_rows_for_wide_part = 0, min_bytes_for_wide_part = 0, old_parts_lifetime = 1, cleanup_delay_period = 1, cleanup_delay_period_random_add = 1; +CREATE TABLE partslost_1 (x String) ENGINE=ReplicatedMergeTree('/clickhouse/table/{database}_02067_lost/partslost', '1') ORDER BY tuple() + SETTINGS min_rows_for_wide_part = 0, min_bytes_for_wide_part = 0, old_parts_lifetime = 1, + cleanup_delay_period = 1, cleanup_delay_period_random_add = 1, cleanup_thread_preferred_points_per_iteration=0; -CREATE TABLE partslost_2 (x String) ENGINE=ReplicatedMergeTree('/clickhouse/table/{database}_02067_lost/partslost', '2') ORDER BY tuple() SETTINGS min_rows_for_wide_part = 0, min_bytes_for_wide_part = 0, old_parts_lifetime = 1, cleanup_delay_period = 1, cleanup_delay_period_random_add = 1; +CREATE TABLE partslost_2 (x String) ENGINE=ReplicatedMergeTree('/clickhouse/table/{database}_02067_lost/partslost', '2') ORDER BY tuple() + SETTINGS min_rows_for_wide_part = 0, min_bytes_for_wide_part = 0, old_parts_lifetime = 1, + cleanup_delay_period = 1, cleanup_delay_period_random_add = 1, cleanup_thread_preferred_points_per_iteration=0; INSERT INTO partslost_0 SELECT toString(number) AS x from system.numbers LIMIT 10000; diff --git a/tests/queries/0_stateless/02370_lost_part_intersecting_merges.sh b/tests/queries/0_stateless/02370_lost_part_intersecting_merges.sh index bc297cbb963..e34163d0502 100755 --- a/tests/queries/0_stateless/02370_lost_part_intersecting_merges.sh +++ b/tests/queries/0_stateless/02370_lost_part_intersecting_merges.sh @@ -9,7 +9,7 @@ $CLICKHOUSE_CLIENT -q "drop table if exists rmt1 sync;" $CLICKHOUSE_CLIENT -q "drop table if exists rmt2 sync;" $CLICKHOUSE_CLIENT -q "create table rmt1 (n int) engine=ReplicatedMergeTree('/test/02369/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/{database}', '1') order by n - settings cleanup_delay_period=0, cleanup_delay_period_random_add=0, old_parts_lifetime=0" + settings cleanup_delay_period=0, cleanup_delay_period_random_add=0, cleanup_thread_preferred_points_per_iteration=0, old_parts_lifetime=0" $CLICKHOUSE_CLIENT -q "create table rmt2 (n int) engine=ReplicatedMergeTree('/test/02369/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/{database}', '2') order by n" $CLICKHOUSE_CLIENT -q "system stop replicated sends rmt2" diff --git a/tests/queries/0_stateless/02396_system_parts_race_condition_rm.sh b/tests/queries/0_stateless/02396_system_parts_race_condition_rm.sh index 5df1a9ba095..e31a091ff45 100755 --- a/tests/queries/0_stateless/02396_system_parts_race_condition_rm.sh +++ b/tests/queries/0_stateless/02396_system_parts_race_condition_rm.sh @@ -15,8 +15,12 @@ $CLICKHOUSE_CLIENT -n -q " DROP TABLE IF EXISTS alter_table0; DROP TABLE IF EXISTS alter_table1; - CREATE TABLE alter_table0 (a UInt8, b Int16, c Float32, d String, e Array(UInt8), f Nullable(UUID), g Tuple(UInt8, UInt16)) ENGINE = ReplicatedMergeTree('/clickhouse/tables/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/alter_table', 'r1') ORDER BY a PARTITION BY b % 10 SETTINGS old_parts_lifetime = 1, cleanup_delay_period = 1, cleanup_delay_period_random_add = 0; - CREATE TABLE alter_table1 (a UInt8, b Int16, c Float32, d String, e Array(UInt8), f Nullable(UUID), g Tuple(UInt8, UInt16)) ENGINE = ReplicatedMergeTree('/clickhouse/tables/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/alter_table', 'r2') ORDER BY a PARTITION BY b % 10 SETTINGS old_parts_lifetime = 1, cleanup_delay_period = 1, cleanup_delay_period_random_add = 0 + CREATE TABLE alter_table0 (a UInt8, b Int16, c Float32, d String, e Array(UInt8), f Nullable(UUID), g Tuple(UInt8, UInt16)) + ENGINE = ReplicatedMergeTree('/clickhouse/tables/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/alter_table', 'r1') ORDER BY a PARTITION BY b % 10 + SETTINGS old_parts_lifetime = 1, cleanup_delay_period = 1, cleanup_delay_period_random_add = 0, cleanup_thread_preferred_points_per_iteration=0; + CREATE TABLE alter_table1 (a UInt8, b Int16, c Float32, d String, e Array(UInt8), f Nullable(UUID), g Tuple(UInt8, UInt16)) + ENGINE = ReplicatedMergeTree('/clickhouse/tables/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/alter_table', 'r2') ORDER BY a PARTITION BY b % 10 + SETTINGS old_parts_lifetime = 1, cleanup_delay_period = 1, cleanup_delay_period_random_add = 0, cleanup_thread_preferred_points_per_iteration=0 " function thread1() diff --git a/tests/queries/0_stateless/02397_system_parts_race_condition_drop_rm.sh b/tests/queries/0_stateless/02397_system_parts_race_condition_drop_rm.sh index 548179b94c9..39e513f6be4 100755 --- a/tests/queries/0_stateless/02397_system_parts_race_condition_drop_rm.sh +++ b/tests/queries/0_stateless/02397_system_parts_race_condition_drop_rm.sh @@ -58,7 +58,9 @@ function thread6() while true; do REPLICA=$(($RANDOM % 10)) $CLICKHOUSE_CLIENT -n -q "DROP TABLE IF EXISTS alter_table_$REPLICA; - CREATE TABLE alter_table_$REPLICA (a UInt8, b Int16, c Float32, d String, e Array(UInt8), f Nullable(UUID), g Tuple(UInt8, UInt16)) ENGINE = ReplicatedMergeTree('/clickhouse/tables/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/alter_table', 'r_$REPLICA') ORDER BY a PARTITION BY b % 10 SETTINGS old_parts_lifetime = 1, cleanup_delay_period = 0, cleanup_delay_period_random_add = 0;"; + CREATE TABLE alter_table_$REPLICA (a UInt8, b Int16, c Float32, d String, e Array(UInt8), f Nullable(UUID), g Tuple(UInt8, UInt16)) + ENGINE = ReplicatedMergeTree('/clickhouse/tables/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/alter_table', 'r_$REPLICA') ORDER BY a PARTITION BY b % 10 + SETTINGS old_parts_lifetime = 1, cleanup_delay_period = 0, cleanup_delay_period_random_add = 0, cleanup_thread_preferred_points_per_iteration=0;"; sleep 0.$RANDOM; done } diff --git a/tests/queries/0_stateless/02432_s3_parallel_parts_cleanup.sql b/tests/queries/0_stateless/02432_s3_parallel_parts_cleanup.sql index 88fb2cdf9b1..bab4bf7881c 100644 --- a/tests/queries/0_stateless/02432_s3_parallel_parts_cleanup.sql +++ b/tests/queries/0_stateless/02432_s3_parallel_parts_cleanup.sql @@ -8,7 +8,7 @@ drop table if exists rmt2; -- Disable compact parts, because we need hardlinks in mutations. create table rmt (n int, m int, k int) engine=ReplicatedMergeTree('/test/02432/{database}', '1') order by tuple() settings storage_policy = 's3_cache', allow_remote_fs_zero_copy_replication=1, - max_part_removal_threads=10, concurrent_part_removal_threshold=1, cleanup_delay_period=1, cleanup_delay_period_random_add=1, + max_part_removal_threads=10, concurrent_part_removal_threshold=1, cleanup_delay_period=1, cleanup_delay_period_random_add=1, cleanup_thread_preferred_points_per_iteration=0, max_replicated_merges_in_queue=0, max_replicated_mutations_in_queue=0, min_bytes_for_wide_part=0, min_rows_for_wide_part=0; insert into rmt(n, m) values (1, 42); @@ -38,7 +38,7 @@ select count(), sum(n), sum(m) from rmt; -- New table can assign merges/mutations and can remove old parts create table rmt2 (n int, m int, k String) engine=ReplicatedMergeTree('/test/02432/{database}', '2') order by tuple() settings storage_policy = 's3_cache', allow_remote_fs_zero_copy_replication=1, - max_part_removal_threads=10, concurrent_part_removal_threshold=1, cleanup_delay_period=1, cleanup_delay_period_random_add=1, + max_part_removal_threads=10, concurrent_part_removal_threshold=1, cleanup_delay_period=1, cleanup_delay_period_random_add=1, cleanup_thread_preferred_points_per_iteration=0, min_bytes_for_wide_part=0, min_rows_for_wide_part=0, max_replicated_merges_in_queue=1, old_parts_lifetime=0; diff --git a/tests/queries/0_stateless/02448_clone_replica_lost_part.sql b/tests/queries/0_stateless/02448_clone_replica_lost_part.sql index 4befe952a14..44303a1c532 100644 --- a/tests/queries/0_stateless/02448_clone_replica_lost_part.sql +++ b/tests/queries/0_stateless/02448_clone_replica_lost_part.sql @@ -5,9 +5,11 @@ SET insert_keeper_fault_injection_probability=0; -- disable fault injection; par drop table if exists rmt1; drop table if exists rmt2; create table rmt1 (n int) engine=ReplicatedMergeTree('/test/02448/{database}/rmt', '1') order by tuple() - settings min_replicated_logs_to_keep=1, max_replicated_logs_to_keep=2, cleanup_delay_period=0, cleanup_delay_period_random_add=1, old_parts_lifetime=0, max_parts_to_merge_at_once=4; + settings min_replicated_logs_to_keep=1, max_replicated_logs_to_keep=2, cleanup_delay_period=0, cleanup_delay_period_random_add=1, + cleanup_thread_preferred_points_per_iteration=0, old_parts_lifetime=0, max_parts_to_merge_at_once=4; create table rmt2 (n int) engine=ReplicatedMergeTree('/test/02448/{database}/rmt', '2') order by tuple() - settings min_replicated_logs_to_keep=1, max_replicated_logs_to_keep=2, cleanup_delay_period=0, cleanup_delay_period_random_add=1, old_parts_lifetime=0, max_parts_to_merge_at_once=4; + settings min_replicated_logs_to_keep=1, max_replicated_logs_to_keep=2, cleanup_delay_period=0, cleanup_delay_period_random_add=1, + cleanup_thread_preferred_points_per_iteration=0, old_parts_lifetime=0, max_parts_to_merge_at_once=4; -- insert part only on one replica system stop replicated sends rmt1; diff --git a/tests/queries/0_stateless/02494_zero_copy_and_projection_and_mutation_work_together.sql b/tests/queries/0_stateless/02494_zero_copy_and_projection_and_mutation_work_together.sql index 98427874160..b4504a55643 100644 --- a/tests/queries/0_stateless/02494_zero_copy_and_projection_and_mutation_work_together.sql +++ b/tests/queries/0_stateless/02494_zero_copy_and_projection_and_mutation_work_together.sql @@ -24,7 +24,8 @@ CREATE TABLE wikistat1 ) ENGINE = ReplicatedMergeTree('/clickhouse/tables/{database}/02494_zero_copy_and_projection', '1') ORDER BY (path, time) -SETTINGS old_parts_lifetime = 1, cleanup_delay_period = 0, cleanup_delay_period_random_add = 0, allow_remote_fs_zero_copy_replication=1, min_bytes_for_wide_part=0; +SETTINGS old_parts_lifetime = 1, cleanup_delay_period = 0, cleanup_delay_period_random_add = 0, + cleanup_thread_preferred_points_per_iteration=0, allow_remote_fs_zero_copy_replication=1, min_bytes_for_wide_part=0; CREATE TABLE wikistat2 ( @@ -49,7 +50,8 @@ CREATE TABLE wikistat2 ) ENGINE = ReplicatedMergeTree('/clickhouse/tables/{database}/02494_zero_copy_and_projection', '2') ORDER BY (path, time) -SETTINGS old_parts_lifetime = 1, cleanup_delay_period = 0, cleanup_delay_period_random_add = 0, allow_remote_fs_zero_copy_replication=1, min_bytes_for_wide_part=0; +SETTINGS old_parts_lifetime = 1, cleanup_delay_period = 0, cleanup_delay_period_random_add = 0, + cleanup_thread_preferred_points_per_iteration=0, allow_remote_fs_zero_copy_replication=1, min_bytes_for_wide_part=0; INSERT INTO wikistat1 SELECT toDateTime('2020-10-01 00:00:00'), 'hello', 'world', '/data/path', 10 from numbers(100); diff --git a/tests/queries/0_stateless/02515_cleanup_async_insert_block_ids.sh b/tests/queries/0_stateless/02515_cleanup_async_insert_block_ids.sh index 458a5e95faa..bc6e7eeb214 100755 --- a/tests/queries/0_stateless/02515_cleanup_async_insert_block_ids.sh +++ b/tests/queries/0_stateless/02515_cleanup_async_insert_block_ids.sh @@ -13,7 +13,7 @@ $CLICKHOUSE_CLIENT -n --query " CREATE TABLE t_async_insert_cleanup ( KeyID UInt32 ) Engine = ReplicatedMergeTree('/clickhouse/tables/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/t_async_insert_cleanup', '{replica}') - ORDER BY (KeyID) SETTINGS cleanup_delay_period = 1, cleanup_delay_period_random_add = 1, replicated_deduplication_window_for_async_inserts=10 + ORDER BY (KeyID) SETTINGS cleanup_delay_period = 1, cleanup_delay_period_random_add = 1, cleanup_thread_preferred_points_per_iteration=0, replicated_deduplication_window_for_async_inserts=10 " for i in {1..100}; do From 646eeb63a4cc720b05ff9de48364be32a6936d94 Mon Sep 17 00:00:00 2001 From: avogar Date: Mon, 22 May 2023 19:46:05 +0000 Subject: [PATCH 0106/1072] Fix build --- src/Functions/FunctionGenerateRandomStructure.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Functions/FunctionGenerateRandomStructure.cpp b/src/Functions/FunctionGenerateRandomStructure.cpp index 16dac4f5112..f85b2596530 100644 --- a/src/Functions/FunctionGenerateRandomStructure.cpp +++ b/src/Functions/FunctionGenerateRandomStructure.cpp @@ -424,7 +424,7 @@ String FunctionGenerateRandomStructure::generateRandomStructure(size_t seed, con REGISTER_FUNCTION(GenerateRandomStructure) { - factory.registerFunction( + factory.registerFunction(FunctionDocumentation { .description=R"( Generates a random table structure. From dbf08b25fb8f569a33dc3a8b05862af9e61eb72a Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Tue, 23 May 2023 01:25:17 +0200 Subject: [PATCH 0107/1072] better scheduling of merge selecting task --- src/Storages/MergeTree/MergeTreeSettings.cpp | 24 +++ src/Storages/MergeTree/MergeTreeSettings.h | 4 +- .../ReplicatedMergeTreeCleanupThread.cpp | 2 +- src/Storages/StorageMergeTree.cpp | 3 +- src/Storages/StorageReplicatedMergeTree.cpp | 198 +++++++++++------- src/Storages/StorageReplicatedMergeTree.h | 2 + .../test.py | 3 +- .../test_merge_tree_empty_parts/test.py | 2 +- 8 files changed, 157 insertions(+), 81 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeSettings.cpp b/src/Storages/MergeTree/MergeTreeSettings.cpp index 479e50fdebb..6df841059b9 100644 --- a/src/Storages/MergeTree/MergeTreeSettings.cpp +++ b/src/Storages/MergeTree/MergeTreeSettings.cpp @@ -175,5 +175,29 @@ void MergeTreeSettings::sanityCheck(size_t background_pool_tasks) const min_bytes_to_rebalance_partition_over_jbod, max_bytes_to_merge_at_max_space_in_pool / 1024); } + + if (max_cleanup_delay_period < cleanup_delay_period) + { + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "The value of max_cleanup_delay_period setting ({}) must be greater than the value of cleanup_delay_period setting ({})", + max_cleanup_delay_period, cleanup_delay_period); + } + + if (max_merge_selecting_sleep_ms < merge_selecting_sleep_ms) + { + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "The value of max_merge_selecting_sleep_ms setting ({}) must be greater than the value of merge_selecting_sleep_ms setting ({})", + max_merge_selecting_sleep_ms, merge_selecting_sleep_ms); + } + + if (merge_selecting_sleep_slowdown_factor < 1.f) + { + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "The value of merge_selecting_sleep_slowdown_factor setting ({}) cannot be less than 1.0", + merge_selecting_sleep_slowdown_factor); + } } } diff --git a/src/Storages/MergeTree/MergeTreeSettings.h b/src/Storages/MergeTree/MergeTreeSettings.h index 78d703e795c..56860342038 100644 --- a/src/Storages/MergeTree/MergeTreeSettings.h +++ b/src/Storages/MergeTree/MergeTreeSettings.h @@ -57,7 +57,9 @@ struct Settings; M(Bool, fsync_part_directory, false, "Do fsync for part directory after all part operations (writes, renames, etc.).", 0) \ M(UInt64, non_replicated_deduplication_window, 0, "How many last blocks of hashes should be kept on disk (0 - disabled).", 0) \ M(UInt64, max_parts_to_merge_at_once, 100, "Max amount of parts which can be merged at once (0 - disabled). Doesn't affect OPTIMIZE FINAL query.", 0) \ - M(UInt64, merge_selecting_sleep_ms, 5000, "Sleep time for merge selecting when no part selected, a lower setting will trigger selecting tasks in background_schedule_pool frequently which result in large amount of requests to zookeeper in large-scale clusters", 0) \ + M(UInt64, merge_selecting_sleep_ms, 5000, "Maximum sleep time for merge selecting, a lower setting will trigger selecting tasks in background_schedule_pool frequently which result in large amount of requests to zookeeper in large-scale clusters", 0) \ + M(UInt64, max_merge_selecting_sleep_ms, 60000, "Maximum sleep time for merge selecting, a lower setting will trigger selecting tasks in background_schedule_pool frequently which result in large amount of requests to zookeeper in large-scale clusters", 0) \ + M(Float, merge_selecting_sleep_slowdown_factor, 1.2f, "The sleep time for merge selecting task is multiplied by this factor when there's nothing to merge and divided when a merge was assigned", 0) \ M(UInt64, merge_tree_clear_old_temporary_directories_interval_seconds, 60, "The period of executing the clear old temporary directories operation in background.", 0) \ M(UInt64, merge_tree_clear_old_parts_interval_seconds, 1, "The period of executing the clear old parts operation in background.", 0) \ M(UInt64, merge_tree_clear_old_broken_detached_parts_ttl_timeout_seconds, 1ULL * 3600 * 24 * 30, "Remove old broken detached parts in the background if they remained intouched for a specified by this setting period of time.", 0) \ diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeCleanupThread.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeCleanupThread.cpp index 35a860ebb42..bcc4dc749fb 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeCleanupThread.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeCleanupThread.cpp @@ -108,7 +108,7 @@ void ReplicatedMergeTreeCleanupThread::wakeupEarlierIfNeeded() return; /// Do not re-check all parts too often (avoid constantly calling getNumberOfOutdatedPartsWithExpiredRemovalTime()) - if (!wakeup_check_timer.compareAndRestart(storage_settings->cleanup_delay_period / 4)) + if (!wakeup_check_timer.compareAndRestart(storage_settings->cleanup_delay_period / 4.0)) return; UInt64 prev_run_timestamp_ms = prev_cleanup_timestamp_ms.load(std::memory_order_relaxed); diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index 2c19d3ba122..cb8b78b4e0a 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -1298,8 +1298,7 @@ bool StorageMergeTree::scheduleDataProcessingJob(BackgroundJobsAssignee & assign /// which is equal or more fresh than commands themselves. In extremely rare case it can happen that we will have alter /// in between we took snapshot above and selected commands. That is why we take new snapshot here. auto task = std::make_shared(*this, getInMemoryMetadataPtr(), mutate_entry, shared_lock, common_assignee_trigger); - assignee.scheduleMergeMutateTask(task); - return true; + return assignee.scheduleMergeMutateTask(task); } if (has_mutations) { diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 0698ab7bf38..a6152c22148 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -4,6 +4,7 @@ #include #include +#include #include #include #include @@ -324,6 +325,7 @@ StorageReplicatedMergeTree::StorageReplicatedMergeTree( /// Will be activated if we will achieve leader state. merge_selecting_task->deactivate(); + merge_selecting_sleep_ms = getSettings()->merge_selecting_sleep_ms; mutations_finalizing_task = getContext()->getSchedulePool().createTask( getStorageID().getFullTableName() + " (StorageReplicatedMergeTree::mutationsFinalizingTask)", [this] { mutationsFinalizingTask(); }); @@ -414,6 +416,19 @@ StorageReplicatedMergeTree::StorageReplicatedMergeTree( loadDataParts(skip_sanity_checks); + if (attach) + { + /// Provide better initial value of merge_selecting_sleep_ms on server startup + auto settings = getSettings(); + size_t max_parts_in_partition = getMaxPartsCountAndSizeForPartition().first; + if (settings->parts_to_delay_insert && max_parts_in_partition < settings->parts_to_delay_insert) + { + Float64 ratio = 1.0 - static_cast(max_parts_in_partition) / settings->parts_to_delay_insert; + merge_selecting_sleep_ms = static_cast(interpolateLinear(settings->merge_selecting_sleep_ms, + settings->max_merge_selecting_sleep_ms, ratio)); + } + } + if (!current_zookeeper) { if (!attach) @@ -3237,7 +3252,15 @@ void StorageReplicatedMergeTree::mergeSelectingTask() const bool cleanup = (storage_settings_ptr->clean_deleted_rows != CleanDeletedRows::Never); CreateMergeEntryResult create_result = CreateMergeEntryResult::Other; - try + enum class AttemptStatus + { + EntryCreated, + NeedRetry, + Limited, + CannotSelect, + }; + + auto try_assign_merge = [&]() -> AttemptStatus { /// We must select parts for merge under merge_selecting_mutex because other threads /// (OPTIMIZE queries) can assign new merges. @@ -3259,108 +3282,133 @@ void StorageReplicatedMergeTree::mergeSelectingTask() "Current background tasks memory usage: {}.", formatReadableSizeWithBinarySuffix(background_memory_tracker.getSoftLimit()), formatReadableSizeWithBinarySuffix(background_memory_tracker.get())); + return AttemptStatus::Limited; } - else if (merges_and_mutations_sum >= storage_settings_ptr->max_replicated_merges_in_queue) + + if (merges_and_mutations_sum >= storage_settings_ptr->max_replicated_merges_in_queue) { LOG_TRACE(log, "Number of queued merges ({}) and part mutations ({})" " is greater than max_replicated_merges_in_queue ({}), so won't select new parts to merge or mutate.", merges_and_mutations_queued.merges, merges_and_mutations_queued.mutations, storage_settings_ptr->max_replicated_merges_in_queue); + return AttemptStatus::Limited; } - else + + UInt64 max_source_parts_size_for_merge = merger_mutator.getMaxSourcePartsSizeForMerge( + storage_settings_ptr->max_replicated_merges_in_queue, merges_and_mutations_sum); + + UInt64 max_source_part_size_for_mutation = merger_mutator.getMaxSourcePartSizeForMutation(); + + bool merge_with_ttl_allowed = merges_and_mutations_queued.merges_with_ttl < storage_settings_ptr->max_replicated_merges_with_ttl_in_queue && + getTotalMergesWithTTLInMergeList() < storage_settings_ptr->max_number_of_merges_with_ttl_in_pool; + + auto future_merged_part = std::make_shared(); + if (storage_settings.get()->assign_part_uuids) + future_merged_part->uuid = UUIDHelpers::generateV4(); + + bool can_assign_merge = max_source_parts_size_for_merge > 0; + PartitionIdsHint partitions_to_merge_in; + if (can_assign_merge) { - UInt64 max_source_parts_size_for_merge = merger_mutator.getMaxSourcePartsSizeForMerge( - storage_settings_ptr->max_replicated_merges_in_queue, merges_and_mutations_sum); + auto lightweight_merge_pred = LocalMergePredicate(queue); + partitions_to_merge_in = merger_mutator.getPartitionsThatMayBeMerged( + max_source_parts_size_for_merge, lightweight_merge_pred, merge_with_ttl_allowed, NO_TRANSACTION_PTR); + if (partitions_to_merge_in.empty()) + can_assign_merge = false; + else + merge_pred.emplace(queue.getMergePredicate(zookeeper, partitions_to_merge_in)); + } - UInt64 max_source_part_size_for_mutation = merger_mutator.getMaxSourcePartSizeForMutation(); + if (can_assign_merge && + merger_mutator.selectPartsToMerge(future_merged_part, false, max_source_parts_size_for_merge, *merge_pred, + merge_with_ttl_allowed, NO_TRANSACTION_PTR, nullptr, &partitions_to_merge_in) == SelectPartsDecision::SELECTED) + { + create_result = createLogEntryToMergeParts( + zookeeper, + future_merged_part->parts, + future_merged_part->name, + future_merged_part->uuid, + future_merged_part->part_format, + deduplicate, + deduplicate_by_columns, + cleanup, + nullptr, + merge_pred->getVersion(), + future_merged_part->merge_type); - bool merge_with_ttl_allowed = merges_and_mutations_queued.merges_with_ttl < storage_settings_ptr->max_replicated_merges_with_ttl_in_queue && - getTotalMergesWithTTLInMergeList() < storage_settings_ptr->max_number_of_merges_with_ttl_in_pool; - auto future_merged_part = std::make_shared(); - if (storage_settings.get()->assign_part_uuids) - future_merged_part->uuid = UUIDHelpers::generateV4(); + if (create_result == CreateMergeEntryResult::Ok) + return AttemptStatus::EntryCreated; + if (create_result == CreateMergeEntryResult::LogUpdated) + return AttemptStatus::NeedRetry; + } - bool can_assign_merge = max_source_parts_size_for_merge > 0; - PartitionIdsHint partitions_to_merge_in; - if (can_assign_merge) + /// If there are many mutations in queue, it may happen, that we cannot enqueue enough merges to merge all new parts + if (max_source_part_size_for_mutation == 0 || merges_and_mutations_queued.mutations >= storage_settings_ptr->max_replicated_mutations_in_queue) + return AttemptStatus::Limited; + + if (queue.countMutations() > 0) + { + /// We don't need the list of committing blocks to choose a part to mutate + if (!merge_pred) + merge_pred.emplace(queue.getMergePredicate(zookeeper, PartitionIdsHint{})); + + /// Choose a part to mutate. + DataPartsVector data_parts = getDataPartsVectorForInternalUsage(); + for (const auto & part : data_parts) { - auto lightweight_merge_pred = LocalMergePredicate(queue); - partitions_to_merge_in = merger_mutator.getPartitionsThatMayBeMerged( - max_source_parts_size_for_merge, lightweight_merge_pred, merge_with_ttl_allowed, NO_TRANSACTION_PTR); - if (partitions_to_merge_in.empty()) - can_assign_merge = false; - else - merge_pred.emplace(queue.getMergePredicate(zookeeper, partitions_to_merge_in)); - } + if (part->getBytesOnDisk() > max_source_part_size_for_mutation) + continue; - if (can_assign_merge && - merger_mutator.selectPartsToMerge(future_merged_part, false, max_source_parts_size_for_merge, *merge_pred, - merge_with_ttl_allowed, NO_TRANSACTION_PTR, nullptr, &partitions_to_merge_in) == SelectPartsDecision::SELECTED) - { - create_result = createLogEntryToMergeParts( - zookeeper, - future_merged_part->parts, - future_merged_part->name, + std::optional> desired_mutation_version = merge_pred->getDesiredMutationVersion(part); + if (!desired_mutation_version) + continue; + + create_result = createLogEntryToMutatePart( + *part, future_merged_part->uuid, - future_merged_part->part_format, - deduplicate, - deduplicate_by_columns, - cleanup, - nullptr, - merge_pred->getVersion(), - future_merged_part->merge_type); - } - /// If there are many mutations in queue, it may happen, that we cannot enqueue enough merges to merge all new parts - else if (max_source_part_size_for_mutation > 0 && queue.countMutations() > 0 - && merges_and_mutations_queued.mutations < storage_settings_ptr->max_replicated_mutations_in_queue) - { - /// We don't need the list of committing blocks to choose a part to mutate - if (!merge_pred) - merge_pred.emplace(queue.getMergePredicate(zookeeper, PartitionIdsHint{})); + desired_mutation_version->first, + desired_mutation_version->second, + merge_pred->getVersion()); - /// Choose a part to mutate. - DataPartsVector data_parts = getDataPartsVectorForInternalUsage(); - for (const auto & part : data_parts) - { - if (part->getBytesOnDisk() > max_source_part_size_for_mutation) - continue; - - std::optional> desired_mutation_version = merge_pred->getDesiredMutationVersion(part); - if (!desired_mutation_version) - continue; - - create_result = createLogEntryToMutatePart( - *part, - future_merged_part->uuid, - desired_mutation_version->first, - desired_mutation_version->second, - merge_pred->getVersion()); - - if (create_result == CreateMergeEntryResult::Ok || - create_result == CreateMergeEntryResult::LogUpdated) - break; - } + if (create_result == CreateMergeEntryResult::Ok) + return AttemptStatus::EntryCreated; + if (create_result == CreateMergeEntryResult::LogUpdated) + return AttemptStatus::NeedRetry; } } + + return AttemptStatus::CannotSelect; + }; + + AttemptStatus result = AttemptStatus::CannotSelect; + try + { + result = try_assign_merge(); } catch (...) { tryLogCurrentException(log, __PRETTY_FUNCTION__); } - if (!is_leader) - return; - if (create_result != CreateMergeEntryResult::Ok - && create_result != CreateMergeEntryResult::LogUpdated) - { - merge_selecting_task->scheduleAfter(storage_settings_ptr->merge_selecting_sleep_ms); - } + if (result == AttemptStatus::EntryCreated || result == AttemptStatus::NeedRetry) + merge_selecting_sleep_ms = static_cast(merge_selecting_sleep_ms / storage_settings_ptr->merge_selecting_sleep_slowdown_factor); + else if (result == AttemptStatus::CannotSelect) + merge_selecting_sleep_ms = static_cast(merge_selecting_sleep_ms * storage_settings_ptr->merge_selecting_sleep_slowdown_factor); + + if (merge_selecting_sleep_ms < storage_settings_ptr->merge_selecting_sleep_ms) + merge_selecting_sleep_ms = storage_settings_ptr->merge_selecting_sleep_ms; + if (merge_selecting_sleep_ms > storage_settings_ptr->max_merge_selecting_sleep_ms) + merge_selecting_sleep_ms = storage_settings_ptr->max_merge_selecting_sleep_ms; + + if (result == AttemptStatus::EntryCreated) + merge_selecting_task->schedule(); else { - merge_selecting_task->schedule(); + LOG_TRACE(log, "Scheduling next merge selecting task after {}ms", merge_selecting_sleep_ms); + merge_selecting_task->scheduleAfter(merge_selecting_sleep_ms); } } diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h index 01b86dd1425..5d877e4b7fa 100644 --- a/src/Storages/StorageReplicatedMergeTree.h +++ b/src/Storages/StorageReplicatedMergeTree.h @@ -456,6 +456,8 @@ private: /// It is acquired for each iteration of the selection of parts to merge or each OPTIMIZE query. std::mutex merge_selecting_mutex; + UInt64 merge_selecting_sleep_ms; + /// A task that marks finished mutations as done. BackgroundSchedulePool::TaskHolder mutations_finalizing_task; diff --git a/tests/integration/test_consistent_parts_after_clone_replica/test.py b/tests/integration/test_consistent_parts_after_clone_replica/test.py index 0c907340090..2771a874d68 100644 --- a/tests/integration/test_consistent_parts_after_clone_replica/test.py +++ b/tests/integration/test_consistent_parts_after_clone_replica/test.py @@ -13,7 +13,8 @@ def fill_nodes(nodes, shard): CREATE TABLE test_table(date Date, id UInt32) ENGINE = ReplicatedMergeTree('/clickhouse/tables/test{shard}/replicated', '{replica}') ORDER BY id PARTITION BY toYYYYMM(date) - SETTINGS min_replicated_logs_to_keep=3, max_replicated_logs_to_keep=5, cleanup_delay_period=0, cleanup_delay_period_random_add=0; + SETTINGS min_replicated_logs_to_keep=3, max_replicated_logs_to_keep=5, cleanup_delay_period=0, + cleanup_delay_period_random_add=0, cleanup_thread_preferred_points_per_iteration=0; """.format( shard=shard, replica=node.name ) diff --git a/tests/integration/test_merge_tree_empty_parts/test.py b/tests/integration/test_merge_tree_empty_parts/test.py index 0f611408a67..212c0577c13 100644 --- a/tests/integration/test_merge_tree_empty_parts/test.py +++ b/tests/integration/test_merge_tree_empty_parts/test.py @@ -27,7 +27,7 @@ def test_empty_parts_alter_delete(started_cluster): "CREATE TABLE empty_parts_delete (d Date, key UInt64, value String) " "ENGINE = ReplicatedMergeTree('/clickhouse/tables/empty_parts_delete', 'r1') " "PARTITION BY toYYYYMM(d) ORDER BY key " - "SETTINGS old_parts_lifetime = 1" + "SETTINGS old_parts_lifetime = 1, cleanup_delay_period=0, cleanup_thread_preferred_points_per_iteration=0" ) node1.query("INSERT INTO empty_parts_delete VALUES (toDate('2020-10-10'), 1, 'a')") From c9aa3042b50ae1b691149ec9012c1521b01705ac Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Tue, 23 May 2023 02:28:23 +0200 Subject: [PATCH 0108/1072] fix --- .../02427_mutate_and_zero_copy_replication_zookeeper.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02427_mutate_and_zero_copy_replication_zookeeper.sql b/tests/queries/0_stateless/02427_mutate_and_zero_copy_replication_zookeeper.sql index 9b0a52b8dbd..e7e0f2f6c59 100644 --- a/tests/queries/0_stateless/02427_mutate_and_zero_copy_replication_zookeeper.sql +++ b/tests/queries/0_stateless/02427_mutate_and_zero_copy_replication_zookeeper.sql @@ -9,7 +9,7 @@ CREATE TABLE mutate_and_zero_copy_replication1 ) ENGINE ReplicatedMergeTree('/clickhouse/tables/{database}/test_02427_mutate_and_zero_copy_replication/alter', '1') ORDER BY tuple() -SETTINGS old_parts_lifetime=0, cleanup_delay_period=300, cleanup_delay_period_random_add=300, min_bytes_for_wide_part = 0; +SETTINGS old_parts_lifetime=0, cleanup_delay_period=300, max_cleanup_delay_period=300, cleanup_delay_period_random_add=300, min_bytes_for_wide_part = 0; CREATE TABLE mutate_and_zero_copy_replication2 ( From c0bc75eacd0624e38d2a1581e19906778ea8b676 Mon Sep 17 00:00:00 2001 From: avogar Date: Tue, 23 May 2023 09:50:34 +0000 Subject: [PATCH 0109/1072] Try to fix test --- .../02586_generate_random_structure.reference | 26 ++++++++++--------- .../02586_generate_random_structure.sql | 10 +++---- 2 files changed, 19 insertions(+), 17 deletions(-) diff --git a/tests/queries/0_stateless/02586_generate_random_structure.reference b/tests/queries/0_stateless/02586_generate_random_structure.reference index e6e2c73ad87..d2929fb4564 100644 --- a/tests/queries/0_stateless/02586_generate_random_structure.reference +++ b/tests/queries/0_stateless/02586_generate_random_structure.reference @@ -2,16 +2,18 @@ c1 String, c2 UInt256, c3 String, c4 Decimal128(8), c5 UInt128 String Const(String) ` 90465455320735604871982424534384518837533904778028808627865442405232847164685 5& -303477100882544888461471906106.82821046 75820566154622566322847299106656624693 -c1 Int128 -c2 Decimal(76, 55) -c3 Int256 -c4 UInt32 -c5 UInt256 -c6 Float64 -c7 Map(DateTime, Int128) -c8 IPv6 -c9 Date32 --77422512305044606600216318673365695785 -178276798036269625488.0220515892112491429818466547307797481494678716313923193 36368120999598076422295038178490120194013353260138466872851513353522727275230 2299358810 12701207658267404852859640589581641341858007661085134086312689265075880787713 -9.78063876538428e-227 {'2063-09-16 00:40:36':127575633389498667752072479581409788016,'2052-11-08 23:07:13':-53938349319031918752329205601430421582,'2104-07-17 19:59:35':84394139582225600743319968813775553298,'2034-11-07 21:10:22':151550220355687100498925996413330909596} 328a:eccb:530f:23c3:275d:7eec:2b1b:9c29 2112-05-13 --77422512305044606600216318673365695785 -178276798036269625488.0220515892112491429818466547307797481494678716313923193 36368120999598076422295038178490120194013353260138466872851513353522727275230 2299358810 12701207658267404852859640589581641341858007661085134086312689265075880787713 -9.78063876538428e-227 {'2063-09-16 00:40:36':127575633389498667752072479581409788016,'2052-11-08 23:07:13':-53938349319031918752329205601430421582,'2104-07-17 19:59:35':84394139582225600743319968813775553298,'2034-11-07 21:10:22':151550220355687100498925996413330909596} 328a:eccb:530f:23c3:275d:7eec:2b1b:9c29 2112-05-13 --77422512305044606600216318673365695785 -178276798036269625488.0220515892112491429818466547307797481494678716313923193 36368120999598076422295038178490120194013353260138466872851513353522727275230 2299358810 12701207658267404852859640589581641341858007661085134086312689265075880787713 -9.78063876538428e-227 {'2063-09-16 00:40:36':166979754159728572703419507823025932071} 8eff:8d3c:5a2c:fa5f:b2bf:2b0e:ff23:beb2 2143-03-03 +c1 FixedString(125) +c2 IPv4 +c3.e1 Array(Enum16(\'e1V3\' = -24827, \'e1V14\' = -24479, \'e1V8\' = -22478, \'e1V10\' = -13735, \'e1V15\' = -12641, \'e1V11\' = -10191, \'e1V0\' = -8579, \'e1V7\' = -8104, \'e1V6\' = 712, \'e1V12\' = 5683, \'e1V13\' = 13678, \'e1V9\' = 19740, \'e1V5\' = 23066, \'e1V2\' = 23292, \'e1V4\' = 23736, \'e1V1\' = 31672)) +c3.e2 Array(Map(Int8, Int32)) +c3.e3 Array(Decimal(76, 64)) +c3.e4 Array(Int32) +c3.e5 Array(Nullable(Int64)) +c3.e6 Array(Int256) +c4 FixedString(183) +c5 IPv4 +c6 UInt256 +Tb#yV[>M*ܨ(OR8V1n)H}C\'I7tqnV)䳆qLPoRg<{3iH_m!q\'G 127.48.9.45 ['e1V10','e1V0','e1V10','e1V14','e1V10','e1V14'] [{-13:777622572,102:-1122882357,62:1647813163,-94:2094022166},{-32:1448633509},{},{},{34:1536340393,19:-2049677851,74:65643868,-46:-1990799930,97:-531041081,46:-2634833,14:1581632600,89:-771229823,-105:1238603584},{47:1458809010,109:1640682510,86:1945730198,85:1505847247,35:-35189402}] [153363749503.3642648494826450951141750747382772821825909005880434540971999557,79828591186.7378041015337066268618633118713347614941338787453473118807106292,81672688565.9633830721322966111551266731935181670389237071708068971548883315,573768486971.1812413548839655834002608768736215115033958693122764224003897029,-393925092368.4893467278351090742501814120269109477445490969167853713051140487,46027399426.0865278566391382610843315130162915324295037009704113636499519839] [755855942,1804001770,-78103159,-866181765,731736602,-79599206] [5253556148991564114,4681434929596395351,-7302160004580855709,-3686747220178471318,6288582051009949273,646864891160092871] [17035203905051045016266537043565487029724162173062647021612805252288722534904,-42105881403933504641593145676742477006499618886131028341247993701618141933523,45346626822580305846120377917274679004279343244238782744860626882886217433843,-3660165069803677989574889324494857545543653453780976182221584349306428201647,-23316760935816288837287058499520670431785615691220162210524162590241529297823,6184785563808848524970564618169964412151721224362412457508264894603779018817] ڡ|A"x>rwzZ:j8tZD"Tu2h!WIytPa|\'yofFO\0Ֆ6\fIrESacW<~e lT>P3})w%4@_2N"ІXp$^ҘͰ\04@n\b\r4H 16.177.117.209 7882774382721411359365561736453116698030365959050344381263687375357052837130 +Tb#yV[>M*ܨ(OR8V1n)H}C\'I7tqnV)䳆qLPoRg<{3iH_m!q\'G 127.48.9.45 ['e1V10','e1V0','e1V10','e1V14','e1V10','e1V14'] [{-13:777622572,102:-1122882357,62:1647813163,-94:2094022166},{-32:1448633509},{},{},{34:1536340393,19:-2049677851,74:65643868,-46:-1990799930,97:-531041081,46:-2634833,14:1581632600,89:-771229823,-105:1238603584},{47:1458809010,109:1640682510,86:1945730198,85:1505847247,35:-35189402}] [153363749503.3642648494826450951141750747382772821825909005880434540971999557,79828591186.7378041015337066268618633118713347614941338787453473118807106292,81672688565.9633830721322966111551266731935181670389237071708068971548883315,573768486971.1812413548839655834002608768736215115033958693122764224003897029,-393925092368.4893467278351090742501814120269109477445490969167853713051140487,46027399426.0865278566391382610843315130162915324295037009704113636499519839] [755855942,1804001770,-78103159,-866181765,731736602,-79599206] [5253556148991564114,4681434929596395351,-7302160004580855709,-3686747220178471318,6288582051009949273,646864891160092871] [17035203905051045016266537043565487029724162173062647021612805252288722534904,-42105881403933504641593145676742477006499618886131028341247993701618141933523,45346626822580305846120377917274679004279343244238782744860626882886217433843,-3660165069803677989574889324494857545543653453780976182221584349306428201647,-23316760935816288837287058499520670431785615691220162210524162590241529297823,6184785563808848524970564618169964412151721224362412457508264894603779018817] ڡ|A"x>rwzZ:j8tZD"Tu2h!WIytPa|\'yofFO\0Ֆ6\fIrESacW<~e lT>P3})w%4@_2N"ІXp$^ҘͰ\04@n\b\r4H 16.177.117.209 7882774382721411359365561736453116698030365959050344381263687375357052837130 +Tb#yV[>M*ܨ(OR8V1n)H}C\'I7tqnV)䳆qLPoRg<{3iH_m!q\'G 127.48.9.45 ['e1V10'] [{}] [825002272867.1157788721157301271303736024856710948164507982705676578804195475] [1865150610] [7514464811443271056] [33504961604882608369857530219353040639899064613284394558131808339620328539033] ڡ|A"x>rwzZ:j8tZD"Tu2h!WIytPa|\'yofFO\0Ֆ6\fIrESacW<~e lT>P3})w%4@_2N"ІXp$^ҘͰ\04@n\b\r4H 16.177.117.209 7882774382721411359365561736453116698030365959050344381263687375357052837130 c1 LowCardinality(Nullable(UInt64)), c2 Date32, c3 LowCardinality(Nullable(Float64)), c4 Int256, c5 Date32 diff --git a/tests/queries/0_stateless/02586_generate_random_structure.sql b/tests/queries/0_stateless/02586_generate_random_structure.sql index b9cec1a436a..e2e8409b35c 100644 --- a/tests/queries/0_stateless/02586_generate_random_structure.sql +++ b/tests/queries/0_stateless/02586_generate_random_structure.sql @@ -9,11 +9,11 @@ select generateRandomStructure(5, '42'); -- {serverError ILLEGAL_TYPE_OF_ARGUMEN select generateRandomStructure(materialize(5), 42); -- {serverError ILLEGAL_COLUMN} select generateRandomStructure(5, materialize(42)); -- {serverError ILLEGAL_COLUMN} -desc generateRandom(11); -select * from generateRandom(11) limit 1; -select * from generateRandom(11, 2) limit 1; -select * from generateRandom(11, 2, 2) limit 1; -select * from generateRandom(11, 2, 2, 2) limit 1; -- {serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH} +desc generateRandom(10000000); +select * from generateRandom(10000000) limit 1; +select * from generateRandom(10000000, 2) limit 1; +select * from generateRandom(10000000, 2, 2) limit 1; +select * from generateRandom(10000000, 2, 2, 2) limit 1; -- {serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH} set allow_suspicious_low_cardinality_types=1; select generateRandomStructure(5, 4); From 0fb9e63f76323ae60520df37e9a947c420664de9 Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Tue, 23 May 2023 10:00:00 +0000 Subject: [PATCH 0110/1072] Fix and update broken_tests --- tests/broken_tests.txt | 2 -- tests/queries/0_stateless/01655_plan_optimizations.reference | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/broken_tests.txt b/tests/broken_tests.txt index 0b4efacba0b..fc60b820f93 100644 --- a/tests/broken_tests.txt +++ b/tests/broken_tests.txt @@ -59,7 +59,6 @@ 01615_random_one_shard_insertion 01624_soft_constraints 01651_bugs_from_15889 -01655_plan_optimizations 01656_test_query_log_factories_info 01681_bloom_filter_nullable_column 01700_system_zookeeper_path_in @@ -101,7 +100,6 @@ 02354_annoy 02366_union_decimal_conversion 02375_rocksdb_with_filters -02377_optimize_sorting_by_input_stream_properties_explain 02382_join_and_filtering_set 02402_merge_engine_with_view 02404_memory_bound_merging diff --git a/tests/queries/0_stateless/01655_plan_optimizations.reference b/tests/queries/0_stateless/01655_plan_optimizations.reference index 9796d2e4f82..34ea2bc20a3 100644 --- a/tests/queries/0_stateless/01655_plan_optimizations.reference +++ b/tests/queries/0_stateless/01655_plan_optimizations.reference @@ -1,4 +1,5 @@ Too many optimizations applied to query plan +Too many optimizations applied to query plan > sipHash should be calculated after filtration FUNCTION sipHash64 Filter column: equals From 84a97ca04a0f22becab1459bb1e557fe1a6104a8 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Tue, 23 May 2023 12:18:41 +0200 Subject: [PATCH 0111/1072] fix --- .../02427_mutate_and_zero_copy_replication_zookeeper.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02427_mutate_and_zero_copy_replication_zookeeper.sql b/tests/queries/0_stateless/02427_mutate_and_zero_copy_replication_zookeeper.sql index e7e0f2f6c59..e3c8583ccf4 100644 --- a/tests/queries/0_stateless/02427_mutate_and_zero_copy_replication_zookeeper.sql +++ b/tests/queries/0_stateless/02427_mutate_and_zero_copy_replication_zookeeper.sql @@ -19,7 +19,7 @@ CREATE TABLE mutate_and_zero_copy_replication2 ) ENGINE ReplicatedMergeTree('/clickhouse/tables/{database}/test_02427_mutate_and_zero_copy_replication/alter', '2') ORDER BY tuple() -SETTINGS old_parts_lifetime=0, cleanup_delay_period=300, cleanup_delay_period_random_add=300; +SETTINGS old_parts_lifetime=0, cleanup_delay_period=300, max_cleanup_delay_period=300, cleanup_delay_period_random_add=300; INSERT INTO mutate_and_zero_copy_replication1 VALUES (1, '1', 1.0); From 3c1aeaaa793f080e20005bfa42a18746f969ad39 Mon Sep 17 00:00:00 2001 From: avogar Date: Tue, 23 May 2023 11:39:40 +0000 Subject: [PATCH 0112/1072] Change default value of handshake_timeout to 10 sec, fix possible use-after-free --- docs/en/operations/settings/settings.md | 2 +- src/Client/Connection.cpp | 4 ++++ src/Core/Settings.h | 2 +- src/IO/TimeoutSetter.cpp | 21 +++++++++++++------ src/IO/TimeoutSetter.h | 6 +++++- .../test_reload_clusters_config/test.py | 2 +- .../test.py | 4 ++-- 7 files changed, 29 insertions(+), 12 deletions(-) diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index b6ade78b1e1..0810b642039 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -1054,7 +1054,7 @@ Default value: 10, 300, 300. Timeout in milliseconds for receiving Hello packet from replicas during handshake. -Default value: 300000. +Default value: 10000. ## cancel_http_readonly_queries_on_client_close {#cancel-http-readonly-queries-on-client-close} diff --git a/src/Client/Connection.cpp b/src/Client/Connection.cpp index 09145bcdf1b..35423012424 100644 --- a/src/Client/Connection.cpp +++ b/src/Client/Connection.cpp @@ -361,6 +361,10 @@ void Connection::receiveHello(const Poco::Timespan & handshake_timeout) receiveException()->rethrow(); else { + /// Reset timeout_setter before disconnect, + /// because after disconnect socket will be invalid. + timeout_setter.reset(); + /// Close connection, to not stay in unsynchronised state. disconnect(); throwUnexpectedPacket(packet_type, "Hello or Exception"); diff --git a/src/Core/Settings.h b/src/Core/Settings.h index ba3a28af614..03cf32e5aaa 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -55,7 +55,7 @@ class IColumn; M(UInt64, max_query_size, DBMS_DEFAULT_MAX_QUERY_SIZE, "The maximum number of bytes of a query string parsed by the SQL parser. Data in the VALUES clause of INSERT queries is processed by a separate stream parser (that consumes O(1) RAM) and not affected by this restriction.", 0) \ M(UInt64, interactive_delay, 100000, "The interval in microseconds to check if the request is cancelled, and to send progress info.", 0) \ M(Seconds, connect_timeout, DBMS_DEFAULT_CONNECT_TIMEOUT_SEC, "Connection timeout if there are no replicas.", 0) \ - M(Milliseconds, handshake_timeout_ms, DBMS_DEFAULT_RECEIVE_TIMEOUT_SEC * 1000, "Timeout for receiving HELLO packet from replicas.", 0) \ + M(Milliseconds, handshake_timeout_ms, 10000, "Timeout for receiving HELLO packet from replicas.", 0) \ M(Milliseconds, connect_timeout_with_failover_ms, 1000, "Connection timeout for selecting first healthy replica.", 0) \ M(Milliseconds, connect_timeout_with_failover_secure_ms, 1000, "Connection timeout for selecting first healthy replica (for secure connections).", 0) \ M(Seconds, receive_timeout, DBMS_DEFAULT_RECEIVE_TIMEOUT_SEC, "Timeout for receiving data from network, in seconds. If no bytes were received in this interval, exception is thrown. If you set this setting on client, the 'send_timeout' for the socket will be also set on the corresponding connection end on the server.", 0) \ diff --git a/src/IO/TimeoutSetter.cpp b/src/IO/TimeoutSetter.cpp index ed21383ccd4..b8b7a814703 100644 --- a/src/IO/TimeoutSetter.cpp +++ b/src/IO/TimeoutSetter.cpp @@ -29,14 +29,12 @@ TimeoutSetter::TimeoutSetter(Poco::Net::StreamSocket & socket_, Poco::Timespan t TimeoutSetter::~TimeoutSetter() { + if (was_reset) + return; + try { - bool connected = socket.impl()->initialized(); - if (!connected) - return; - - socket.setSendTimeout(old_send_timeout); - socket.setReceiveTimeout(old_receive_timeout); + reset(); } catch (...) { @@ -44,4 +42,15 @@ TimeoutSetter::~TimeoutSetter() } } +void TimeoutSetter::reset() +{ + bool connected = socket.impl()->initialized(); + if (!connected) + return; + + socket.setSendTimeout(old_send_timeout); + socket.setReceiveTimeout(old_receive_timeout); + was_reset = true; +} + } diff --git a/src/IO/TimeoutSetter.h b/src/IO/TimeoutSetter.h index 31c37ea07af..3479986d7fe 100644 --- a/src/IO/TimeoutSetter.h +++ b/src/IO/TimeoutSetter.h @@ -6,7 +6,7 @@ namespace DB { -/// Temporarily overrides socket send/receive timeouts and reset them back into destructor +/// Temporarily overrides socket send/receive timeouts and reset them back into destructor (or manually by calling reset method) /// If "limit_max_timeout" is true, timeouts could be only decreased (maxed by previous value). struct TimeoutSetter { @@ -19,6 +19,9 @@ struct TimeoutSetter ~TimeoutSetter(); + /// Reset timeouts back. + void reset(); + Poco::Net::StreamSocket & socket; Poco::Timespan send_timeout; @@ -26,5 +29,6 @@ struct TimeoutSetter Poco::Timespan old_send_timeout; Poco::Timespan old_receive_timeout; + bool was_reset = false; }; } diff --git a/tests/integration/test_reload_clusters_config/test.py b/tests/integration/test_reload_clusters_config/test.py index a52871890e9..73ca4a01f34 100644 --- a/tests/integration/test_reload_clusters_config/test.py +++ b/tests/integration/test_reload_clusters_config/test.py @@ -169,7 +169,7 @@ test_config3 = """ def send_repeated_query(table, count=5): for i in range(count): node.query_and_get_error( - "SELECT count() FROM {} SETTINGS receive_timeout=1".format(table) + "SELECT count() FROM {} SETTINGS receive_timeout=1, handshake_timeout_ms=1".format(table) ) diff --git a/tests/integration/test_system_clusters_actual_information/test.py b/tests/integration/test_system_clusters_actual_information/test.py index 0658d0c7576..e90a6cdeb3f 100644 --- a/tests/integration/test_system_clusters_actual_information/test.py +++ b/tests/integration/test_system_clusters_actual_information/test.py @@ -40,8 +40,8 @@ def test(started_cluster): cluster.pause_container("node_1") node.query("SYSTEM RELOAD CONFIG") - node.query_and_get_error( - "SELECT count() FROM distributed SETTINGS receive_timeout=1" + error = node.query_and_get_error( + "SELECT count() FROM distributed SETTINGS receive_timeout=1, handshake_timeout_ms=1" ) result = node.query( From b82ff979d014ef63f5661f83c1cf29309fe340be Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Tue, 23 May 2023 23:10:34 +0800 Subject: [PATCH 0113/1072] Fix invalid index analysis for date related keys --- src/Storages/MergeTree/KeyCondition.cpp | 35 +++++++++++++++++-- .../02764_index_analysis_fix.reference | 1 + .../0_stateless/02764_index_analysis_fix.sql | 9 +++++ 3 files changed, 42 insertions(+), 3 deletions(-) create mode 100644 tests/queries/0_stateless/02764_index_analysis_fix.reference create mode 100644 tests/queries/0_stateless/02764_index_analysis_fix.sql diff --git a/src/Storages/MergeTree/KeyCondition.cpp b/src/Storages/MergeTree/KeyCondition.cpp index b8ef2152a99..dea2091f115 100644 --- a/src/Storages/MergeTree/KeyCondition.cpp +++ b/src/Storages/MergeTree/KeyCondition.cpp @@ -943,6 +943,19 @@ static FieldRef applyFunction(const FunctionBasePtr & func, const DataTypePtr & return {field.columns, field.row_idx, result_idx}; } +static std::set date_time_parsing_functions = { + "toDate", + "toDate32", + "toDateTime", + "toDateTime64", + "ParseDateTimeBestEffort", + "ParseDateTimeBestEffortUS", + "ParseDateTime32BestEffort", + "ParseDateTime64BestEffort", + "parseDateTime", + "parseDateTimeInJodaSyntax", +}; + /** The key functional expression constraint may be inferred from a plain column in the expression. * For example, if the key contains `toStartOfHour(Timestamp)` and query contains `WHERE Timestamp >= now()`, * it can be assumed that if `toStartOfHour()` is monotonic on [now(), inf), the `toStartOfHour(Timestamp) >= toStartOfHour(now())` @@ -1026,10 +1039,23 @@ bool KeyCondition::transformConstantWithValidFunctions( if (func->type != ActionsDAG::ActionType::FUNCTION) continue; + const auto & func_name = func->function_base->getName(); + auto func_base = func->function_base; + if (date_time_parsing_functions.contains(func_name)) + { + auto func_or_null = FunctionFactory::instance().get(func_name + "OrNull", context); + ColumnsWithTypeAndName arguments; + int i = 0; + for (const auto & type : func->function_base->getArgumentTypes()) + arguments.push_back({nullptr, type, fmt::format("_{}", i++)}); + + func_base = func_or_null->build(arguments); + } + if (func->children.size() == 1) { std::tie(const_value, const_type) - = applyFunctionForFieldOfUnknownType(func->function_base, const_type, const_value); + = applyFunctionForFieldOfUnknownType(func_base, const_type, const_value); } else if (func->children.size() == 2) { @@ -1040,7 +1066,7 @@ bool KeyCondition::transformConstantWithValidFunctions( auto left_arg_type = left->result_type; auto left_arg_value = (*left->column)[0]; std::tie(const_value, const_type) = applyBinaryFunctionForFieldOfUnknownType( - FunctionFactory::instance().get(func->function_base->getName(), context), + FunctionFactory::instance().get(func_base->getName(), context), left_arg_type, left_arg_value, const_type, const_value); } else @@ -1048,10 +1074,13 @@ bool KeyCondition::transformConstantWithValidFunctions( auto right_arg_type = right->result_type; auto right_arg_value = (*right->column)[0]; std::tie(const_value, const_type) = applyBinaryFunctionForFieldOfUnknownType( - FunctionFactory::instance().get(func->function_base->getName(), context), + FunctionFactory::instance().get(func_base->getName(), context), const_type, const_value, right_arg_type, right_arg_value); } } + + if (const_value.isNull()) + return false; } out_key_column_num = it->second; diff --git a/tests/queries/0_stateless/02764_index_analysis_fix.reference b/tests/queries/0_stateless/02764_index_analysis_fix.reference new file mode 100644 index 00000000000..8eeacf99fa8 --- /dev/null +++ b/tests/queries/0_stateless/02764_index_analysis_fix.reference @@ -0,0 +1 @@ +2022-10-01 10:10:10 diff --git a/tests/queries/0_stateless/02764_index_analysis_fix.sql b/tests/queries/0_stateless/02764_index_analysis_fix.sql new file mode 100644 index 00000000000..541a3444ef3 --- /dev/null +++ b/tests/queries/0_stateless/02764_index_analysis_fix.sql @@ -0,0 +1,9 @@ +drop table if exists x; + +create table x (dt String) engine MergeTree partition by toYYYYMM(toDate(dt)) order by tuple(); + +insert into x values ('2022-10-01 10:10:10'); + +select * from x where dt like '2022-10-01%'; + +drop table x; From f3b4959e059640a9b786f421b3fe42f9a1fb4be6 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Tue, 23 May 2023 19:37:35 +0200 Subject: [PATCH 0114/1072] fix --- src/Storages/StorageReplicatedMergeTree.cpp | 7 +++++-- tests/integration/test_merge_tree_empty_parts/test.py | 2 +- .../queries/0_stateless/02448_clone_replica_lost_part.sql | 1 + 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index a6152c22148..fc90ff550c7 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -3393,10 +3393,13 @@ void StorageReplicatedMergeTree::mergeSelectingTask() } + Float32 new_sleep_ms = merge_selecting_sleep_ms; if (result == AttemptStatus::EntryCreated || result == AttemptStatus::NeedRetry) - merge_selecting_sleep_ms = static_cast(merge_selecting_sleep_ms / storage_settings_ptr->merge_selecting_sleep_slowdown_factor); + new_sleep_ms /= storage_settings_ptr->merge_selecting_sleep_slowdown_factor; else if (result == AttemptStatus::CannotSelect) - merge_selecting_sleep_ms = static_cast(merge_selecting_sleep_ms * storage_settings_ptr->merge_selecting_sleep_slowdown_factor); + new_sleep_ms *= storage_settings_ptr->merge_selecting_sleep_slowdown_factor; + new_sleep_ms *= std::uniform_real_distribution(1.f, 1.1f)(thread_local_rng); + merge_selecting_sleep_ms = static_cast(new_sleep_ms); if (merge_selecting_sleep_ms < storage_settings_ptr->merge_selecting_sleep_ms) merge_selecting_sleep_ms = storage_settings_ptr->merge_selecting_sleep_ms; diff --git a/tests/integration/test_merge_tree_empty_parts/test.py b/tests/integration/test_merge_tree_empty_parts/test.py index 212c0577c13..c6a96f3ed1b 100644 --- a/tests/integration/test_merge_tree_empty_parts/test.py +++ b/tests/integration/test_merge_tree_empty_parts/test.py @@ -48,7 +48,7 @@ def test_empty_parts_summing(started_cluster): "CREATE TABLE empty_parts_summing (d Date, key UInt64, value Int64) " "ENGINE = ReplicatedSummingMergeTree('/clickhouse/tables/empty_parts_summing', 'r1') " "PARTITION BY toYYYYMM(d) ORDER BY key " - "SETTINGS old_parts_lifetime = 1" + "SETTINGS old_parts_lifetime = 1, cleanup_delay_period=0, cleanup_thread_preferred_points_per_iteration=0" ) node1.query("INSERT INTO empty_parts_summing VALUES (toDate('2020-10-10'), 1, 1)") diff --git a/tests/queries/0_stateless/02448_clone_replica_lost_part.sql b/tests/queries/0_stateless/02448_clone_replica_lost_part.sql index 44303a1c532..7ad25d75fbe 100644 --- a/tests/queries/0_stateless/02448_clone_replica_lost_part.sql +++ b/tests/queries/0_stateless/02448_clone_replica_lost_part.sql @@ -144,6 +144,7 @@ select sleep(2) format Null; -- increases probability of reproducing the issue -- rmt1 will mimic rmt2, but will not be able to fetch parts for a while system stop replicated sends rmt2; attach table rmt1; +system sync replica rmt1; -- rmt1 should not show the value (200) from dropped part select throwIf(n = 200) from rmt1 format Null; select 11, arraySort(groupArray(n)) from rmt2; From be49281044eba2be91c46666ce12a28da446585c Mon Sep 17 00:00:00 2001 From: Nikolay Degterinsky Date: Wed, 24 May 2023 00:48:09 +0000 Subject: [PATCH 0115/1072] Try to fix test --- .../test/integration/runner/compose/docker_compose_mongo.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/test/integration/runner/compose/docker_compose_mongo.yml b/docker/test/integration/runner/compose/docker_compose_mongo.yml index 8cdcbc421e8..9a6eae6ca8c 100644 --- a/docker/test/integration/runner/compose/docker_compose_mongo.yml +++ b/docker/test/integration/runner/compose/docker_compose_mongo.yml @@ -1,7 +1,7 @@ version: '2.3' services: mongo1: - image: mongo:6.0 + image: mongo:5.0 restart: always environment: MONGO_INITDB_ROOT_USERNAME: root @@ -11,7 +11,7 @@ services: command: --profile=2 --verbose mongo2: - image: mongo:6.0 + image: mongo:5.0 restart: always ports: - ${MONGO_NO_CRED_EXTERNAL_PORT:-27017}:${MONGO_NO_CRED_INTERNAL_PORT:-27017} From b11aa42db9337ea652f8a0b36c9e3f21e9f675af Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Wed, 24 May 2023 14:27:49 +0800 Subject: [PATCH 0116/1072] Fix tests --- src/Storages/MergeTree/KeyCondition.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/KeyCondition.cpp b/src/Storages/MergeTree/KeyCondition.cpp index dea2091f115..923e5237420 100644 --- a/src/Storages/MergeTree/KeyCondition.cpp +++ b/src/Storages/MergeTree/KeyCondition.cpp @@ -1041,7 +1041,8 @@ bool KeyCondition::transformConstantWithValidFunctions( const auto & func_name = func->function_base->getName(); auto func_base = func->function_base; - if (date_time_parsing_functions.contains(func_name)) + const auto & arg_types = func_base->getArgumentTypes(); + if (date_time_parsing_functions.contains(func_name) && !arg_types.empty() && isStringOrFixedString(arg_types[0])) { auto func_or_null = FunctionFactory::instance().get(func_name + "OrNull", context); ColumnsWithTypeAndName arguments; From 79c5aa23585efb20d410dccd8036af968525a71b Mon Sep 17 00:00:00 2001 From: alekseygolub Date: Wed, 24 May 2023 06:52:22 +0000 Subject: [PATCH 0117/1072] Remove test from broken_tests.txt --- tests/broken_tests.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/broken_tests.txt b/tests/broken_tests.txt index cef8f68b210..e61c1316e17 100644 --- a/tests/broken_tests.txt +++ b/tests/broken_tests.txt @@ -137,4 +137,3 @@ 01600_parts_types_metrics_long 01287_max_execution_speed 02703_row_policy_for_database -02732_rename_after_processing From 8bbfdcc56c1ad77729529b1bcbb65d4a5b7c2b6d Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Wed, 24 May 2023 15:47:38 +0800 Subject: [PATCH 0118/1072] Fix index analysis with binary operator null --- src/Storages/MergeTree/KeyCondition.cpp | 4 ++++ ...ndex_analysis_binary_operator_with_null.reference | 0 ...2746_index_analysis_binary_operator_with_null.sql | 12 ++++++++++++ 3 files changed, 16 insertions(+) create mode 100644 tests/queries/0_stateless/02746_index_analysis_binary_operator_with_null.reference create mode 100644 tests/queries/0_stateless/02746_index_analysis_binary_operator_with_null.sql diff --git a/src/Storages/MergeTree/KeyCondition.cpp b/src/Storages/MergeTree/KeyCondition.cpp index b8ef2152a99..239a534ca93 100644 --- a/src/Storages/MergeTree/KeyCondition.cpp +++ b/src/Storages/MergeTree/KeyCondition.cpp @@ -1334,6 +1334,10 @@ bool KeyCondition::isKeyPossiblyWrappedByMonotonicFunctions( arguments.push_back(const_arg); kind = FunctionWithOptionalConstArg::Kind::RIGHT_CONST; } + + /// If constant arg of binary operator is NULL, there will be no monotonicity. + if (const_arg.column->isNullAt(0)) + return false; } else arguments.push_back({ nullptr, key_column_type, "" }); diff --git a/tests/queries/0_stateless/02746_index_analysis_binary_operator_with_null.reference b/tests/queries/0_stateless/02746_index_analysis_binary_operator_with_null.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02746_index_analysis_binary_operator_with_null.sql b/tests/queries/0_stateless/02746_index_analysis_binary_operator_with_null.sql new file mode 100644 index 00000000000..f9613735bbf --- /dev/null +++ b/tests/queries/0_stateless/02746_index_analysis_binary_operator_with_null.sql @@ -0,0 +1,12 @@ +drop table if exists tab; + +create table tab (x DateTime) engine MergeTree order by x; + +SELECT toDateTime(65537, toDateTime(NULL), NULL) +FROM tab +WHERE ((x + CAST('1', 'Nullable(UInt8)')) <= 2) AND ((x + CAST('', 'Nullable(UInt8)')) <= 256) +ORDER BY + toDateTime(toDateTime(-2, NULL, NULL) + 100.0001, NULL, -2, NULL) DESC NULLS LAST, + x ASC NULLS LAST; + +drop table tab; From 8bc25c4ea3a1359affc36599bcc982b741ea5360 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Wed, 24 May 2023 14:40:27 +0200 Subject: [PATCH 0119/1072] Fix style --- tests/integration/test_reload_clusters_config/test.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_reload_clusters_config/test.py b/tests/integration/test_reload_clusters_config/test.py index 73ca4a01f34..cb003bbe04e 100644 --- a/tests/integration/test_reload_clusters_config/test.py +++ b/tests/integration/test_reload_clusters_config/test.py @@ -169,7 +169,9 @@ test_config3 = """ def send_repeated_query(table, count=5): for i in range(count): node.query_and_get_error( - "SELECT count() FROM {} SETTINGS receive_timeout=1, handshake_timeout_ms=1".format(table) + "SELECT count() FROM {} SETTINGS receive_timeout=1, handshake_timeout_ms=1".format( + table + ) ) From 242c3bc9a971b1f9b76df57b7df1ac5d176fe274 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Wed, 24 May 2023 16:01:28 +0200 Subject: [PATCH 0120/1072] fix --- tests/integration/test_ttl_replicated/test.py | 22 ++++++++----------- .../02448_clone_replica_lost_part.sql | 7 +++--- 2 files changed, 13 insertions(+), 16 deletions(-) diff --git a/tests/integration/test_ttl_replicated/test.py b/tests/integration/test_ttl_replicated/test.py index 4ea4472b812..d78c00a9f9c 100644 --- a/tests/integration/test_ttl_replicated/test.py +++ b/tests/integration/test_ttl_replicated/test.py @@ -6,6 +6,7 @@ from helpers.cluster import ClickHouseCluster from helpers.test_tools import TSV, exec_query_with_retry from helpers.wait_for_helpers import wait_for_delete_inactive_parts from helpers.wait_for_helpers import wait_for_delete_empty_parts +from helpers.test_tools import assert_eq_with_retry cluster = ClickHouseCluster(__file__) node1 = cluster.add_instance("node1", with_zookeeper=True) @@ -66,7 +67,8 @@ def test_ttl_columns(started_cluster): """ CREATE TABLE test_ttl(date DateTime, id UInt32, a Int32 TTL date + INTERVAL 1 DAY, b Int32 TTL date + INTERVAL 1 MONTH) ENGINE = ReplicatedMergeTree('/clickhouse/tables/test/test_ttl_columns', '{replica}') - ORDER BY id PARTITION BY toDayOfMonth(date) SETTINGS merge_with_ttl_timeout=0, min_bytes_for_wide_part=0; + ORDER BY id PARTITION BY toDayOfMonth(date) + SETTINGS merge_with_ttl_timeout=0, min_bytes_for_wide_part=0, , max_merge_selecting_sleep_ms=6000; """.format( replica=node.name ) @@ -99,7 +101,7 @@ def test_merge_with_ttl_timeout(started_cluster): CREATE TABLE {table}(date DateTime, id UInt32, a Int32 TTL date + INTERVAL 1 DAY, b Int32 TTL date + INTERVAL 1 MONTH) ENGINE = ReplicatedMergeTree('/clickhouse/tables/test/{table}', '{replica}') ORDER BY id PARTITION BY toDayOfMonth(date) - SETTINGS min_bytes_for_wide_part=0; + SETTINGS min_bytes_for_wide_part=0, max_merge_selecting_sleep_ms=6000; """.format( replica=node.name, table=table ) @@ -134,14 +136,8 @@ def test_merge_with_ttl_timeout(started_cluster): ) ) - time.sleep(15) # TTL merges shall not happen. - - assert ( - node1.query("SELECT countIf(a = 0) FROM {table}".format(table=table)) == "3\n" - ) - assert ( - node2.query("SELECT countIf(a = 0) FROM {table}".format(table=table)) == "3\n" - ) + assert_eq_with_retry(node1, "SELECT countIf(a = 0) FROM {table}".format(table=table), "3\n") + assert_eq_with_retry(node2, "SELECT countIf(a = 0) FROM {table}".format(table=table), "3\n") def test_ttl_many_columns(started_cluster): @@ -155,7 +151,7 @@ def test_ttl_many_columns(started_cluster): _offset Int32 TTL date, _partition Int32 TTL date) ENGINE = ReplicatedMergeTree('/clickhouse/tables/test/test_ttl_2', '{replica}') - ORDER BY id PARTITION BY toDayOfMonth(date) SETTINGS merge_with_ttl_timeout=0; + ORDER BY id PARTITION BY toDayOfMonth(date) SETTINGS merge_with_ttl_timeout=0, max_merge_selecting_sleep_ms=6000; """.format( replica=node.name ) @@ -213,7 +209,7 @@ def test_ttl_table(started_cluster, delete_suffix): CREATE TABLE test_ttl(date DateTime, id UInt32) ENGINE = ReplicatedMergeTree('/clickhouse/tables/test/test_ttl', '{replica}') ORDER BY id PARTITION BY toDayOfMonth(date) - TTL date + INTERVAL 1 DAY {delete_suffix} SETTINGS merge_with_ttl_timeout=0; + TTL date + INTERVAL 1 DAY {delete_suffix} SETTINGS merge_with_ttl_timeout=0, max_merge_selecting_sleep_ms=6000; """.format( replica=node.name, delete_suffix=delete_suffix ) @@ -304,7 +300,7 @@ def test_ttl_double_delete_rule_returns_error(started_cluster): CREATE TABLE test_ttl(date DateTime, id UInt32) ENGINE = ReplicatedMergeTree('/clickhouse/tables/test/test_ttl_double_delete', '{replica}') ORDER BY id PARTITION BY toDayOfMonth(date) - TTL date + INTERVAL 1 DAY, date + INTERVAL 2 DAY SETTINGS merge_with_ttl_timeout=0 + TTL date + INTERVAL 1 DAY, date + INTERVAL 2 DAY SETTINGS merge_with_ttl_timeout=0, max_merge_selecting_sleep_ms=6000 """.format( replica=node1.name ) diff --git a/tests/queries/0_stateless/02448_clone_replica_lost_part.sql b/tests/queries/0_stateless/02448_clone_replica_lost_part.sql index 7ad25d75fbe..1e99e1869cc 100644 --- a/tests/queries/0_stateless/02448_clone_replica_lost_part.sql +++ b/tests/queries/0_stateless/02448_clone_replica_lost_part.sql @@ -6,10 +6,12 @@ drop table if exists rmt1; drop table if exists rmt2; create table rmt1 (n int) engine=ReplicatedMergeTree('/test/02448/{database}/rmt', '1') order by tuple() settings min_replicated_logs_to_keep=1, max_replicated_logs_to_keep=2, cleanup_delay_period=0, cleanup_delay_period_random_add=1, - cleanup_thread_preferred_points_per_iteration=0, old_parts_lifetime=0, max_parts_to_merge_at_once=4; + cleanup_thread_preferred_points_per_iteration=0, old_parts_lifetime=0, max_parts_to_merge_at_once=4, + merge_selecting_sleep_ms=100, max_merge_selecting_sleep_ms=500; create table rmt2 (n int) engine=ReplicatedMergeTree('/test/02448/{database}/rmt', '2') order by tuple() settings min_replicated_logs_to_keep=1, max_replicated_logs_to_keep=2, cleanup_delay_period=0, cleanup_delay_period_random_add=1, - cleanup_thread_preferred_points_per_iteration=0, old_parts_lifetime=0, max_parts_to_merge_at_once=4; + cleanup_thread_preferred_points_per_iteration=0, old_parts_lifetime=0, max_parts_to_merge_at_once=4, + merge_selecting_sleep_ms=100, max_merge_selecting_sleep_ms=500; -- insert part only on one replica system stop replicated sends rmt1; @@ -144,7 +146,6 @@ select sleep(2) format Null; -- increases probability of reproducing the issue -- rmt1 will mimic rmt2, but will not be able to fetch parts for a while system stop replicated sends rmt2; attach table rmt1; -system sync replica rmt1; -- rmt1 should not show the value (200) from dropped part select throwIf(n = 200) from rmt1 format Null; select 11, arraySort(groupArray(n)) from rmt2; From a237b8b83958abbb6976fdb72f67790c54442195 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Wed, 24 May 2023 14:19:37 +0000 Subject: [PATCH 0121/1072] Automatic style fix --- tests/integration/test_ttl_replicated/test.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_ttl_replicated/test.py b/tests/integration/test_ttl_replicated/test.py index d78c00a9f9c..d681e81df3a 100644 --- a/tests/integration/test_ttl_replicated/test.py +++ b/tests/integration/test_ttl_replicated/test.py @@ -136,8 +136,12 @@ def test_merge_with_ttl_timeout(started_cluster): ) ) - assert_eq_with_retry(node1, "SELECT countIf(a = 0) FROM {table}".format(table=table), "3\n") - assert_eq_with_retry(node2, "SELECT countIf(a = 0) FROM {table}".format(table=table), "3\n") + assert_eq_with_retry( + node1, "SELECT countIf(a = 0) FROM {table}".format(table=table), "3\n" + ) + assert_eq_with_retry( + node2, "SELECT countIf(a = 0) FROM {table}".format(table=table), "3\n" + ) def test_ttl_many_columns(started_cluster): From bc527c75889b321a01c30f665eb0d4ef47e61d68 Mon Sep 17 00:00:00 2001 From: avogar Date: Wed, 24 May 2023 17:07:31 +0000 Subject: [PATCH 0122/1072] Don't send head request for all keys in Iceberg schema inference --- src/Storages/StorageS3.cpp | 19 ++++++++++++++----- src/Storages/StorageS3.h | 2 ++ 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index afaafcc75a2..f3cad4de31a 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -418,11 +418,13 @@ public: ASTPtr query_, const Block & virtual_header_, ContextPtr context_, + bool need_total_size, KeysWithInfo * read_keys_) : WithContext(context_) , bucket(bucket_) , query(query_) , virtual_header(virtual_header_) + { Strings all_keys = keys_; @@ -458,8 +460,13 @@ public: for (auto && key : all_keys) { - auto info = S3::getObjectInfo(client_, bucket, key, version_id_, request_settings_); - total_size += info.size; + std::optional info; + if (need_total_size) + { + info = S3::getObjectInfo(client_, bucket, key, version_id_, request_settings_); + total_size += info->size; + } + keys.emplace_back(std::move(key), std::move(info)); } @@ -501,10 +508,11 @@ StorageS3Source::KeysIterator::KeysIterator( ASTPtr query, const Block & virtual_header, ContextPtr context, + bool need_total_size, KeysWithInfo * read_keys) : pimpl(std::make_shared( client_, version_id_, keys_, bucket_, request_settings_, - query, virtual_header, context, read_keys)) + query, virtual_header, context, need_total_size, read_keys)) { } @@ -979,6 +987,7 @@ std::shared_ptr StorageS3::createFileIterator( ContextPtr local_context, ASTPtr query, const Block & virtual_block, + bool need_total_size, KeysWithInfo * read_keys) { if (distributed_processing) @@ -997,7 +1006,7 @@ std::shared_ptr StorageS3::createFileIterator( return std::make_shared( *configuration.client, configuration.url.version_id, configuration.keys, configuration.url.bucket, configuration.request_settings, query, - virtual_block, local_context, read_keys); + virtual_block, local_context, need_total_size, read_keys); } } @@ -1442,7 +1451,7 @@ ColumnsDescription StorageS3::getTableStructureFromDataImpl( { KeysWithInfo read_keys; - auto file_iterator = createFileIterator(configuration, false, ctx, nullptr, {}, &read_keys); + auto file_iterator = createFileIterator(configuration, false, ctx, nullptr, {}, false, &read_keys); std::optional columns_from_cache; size_t prev_read_keys_size = read_keys.size(); diff --git a/src/Storages/StorageS3.h b/src/Storages/StorageS3.h index 12573ab513f..1ca8f80e7a0 100644 --- a/src/Storages/StorageS3.h +++ b/src/Storages/StorageS3.h @@ -95,6 +95,7 @@ public: ASTPtr query, const Block & virtual_header, ContextPtr context, + bool need_total_size = true, KeysWithInfo * read_keys = nullptr); KeyWithInfo next() override; @@ -354,6 +355,7 @@ private: ContextPtr local_context, ASTPtr query, const Block & virtual_block, + bool need_total_size = true, KeysWithInfo * read_keys = nullptr); static ColumnsDescription getTableStructureFromDataImpl( From 4c94b3d6bce6bf34a52e83f98b6fec312e4ba79b Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Wed, 24 May 2023 20:13:37 +0300 Subject: [PATCH 0123/1072] Update test.py --- tests/integration/test_ttl_replicated/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_ttl_replicated/test.py b/tests/integration/test_ttl_replicated/test.py index d681e81df3a..7ba5a4359c7 100644 --- a/tests/integration/test_ttl_replicated/test.py +++ b/tests/integration/test_ttl_replicated/test.py @@ -68,7 +68,7 @@ def test_ttl_columns(started_cluster): CREATE TABLE test_ttl(date DateTime, id UInt32, a Int32 TTL date + INTERVAL 1 DAY, b Int32 TTL date + INTERVAL 1 MONTH) ENGINE = ReplicatedMergeTree('/clickhouse/tables/test/test_ttl_columns', '{replica}') ORDER BY id PARTITION BY toDayOfMonth(date) - SETTINGS merge_with_ttl_timeout=0, min_bytes_for_wide_part=0, , max_merge_selecting_sleep_ms=6000; + SETTINGS merge_with_ttl_timeout=0, min_bytes_for_wide_part=0, max_merge_selecting_sleep_ms=6000; """.format( replica=node.name ) From e66f6272d1dc76859251fde165b2d2d9664dce8f Mon Sep 17 00:00:00 2001 From: avogar Date: Wed, 10 May 2023 18:39:38 +0000 Subject: [PATCH 0124/1072] Refactor CapnProto format to improve input/output performance --- src/Core/Settings.h | 2 +- src/Core/SettingsEnums.cpp | 8 +- src/Core/SettingsEnums.h | 2 +- src/Formats/CapnProtoSchema.cpp | 298 ++++ .../{CapnProtoUtils.h => CapnProtoSchema.h} | 13 +- src/Formats/CapnProtoSerializer.cpp | 1218 +++++++++++++++++ src/Formats/CapnProtoSerializer.h | 25 + src/Formats/CapnProtoUtils.cpp | 734 ---------- src/Formats/FormatSettings.h | 6 +- .../Formats/Impl/CapnProtoRowInputFormat.cpp | 253 +--- .../Formats/Impl/CapnProtoRowInputFormat.h | 9 +- .../Formats/Impl/CapnProtoRowOutputFormat.cpp | 266 +--- .../Formats/Impl/CapnProtoRowOutputFormat.h | 17 +- .../Formats/Impl/ProtobufListInputFormat.cpp | 9 +- .../Formats/Impl/ProtobufRowInputFormat.cpp | 9 +- .../queries/0_stateless/02030_capnp_format.sh | 4 +- ...p_case_insensitive_names_matcing.reference | 1 + ...35_capnp_case_insensitive_names_matcing.sh | 10 + ...ing_and_writing_structure_fields.reference | 3 + ...36_reading_and_writing_structure_fields.sh | 24 + ...2735_case_insensitive_names_matching.capnp | 13 + .../02736_nested_structures.capnp | 21 + 22 files changed, 1686 insertions(+), 1259 deletions(-) create mode 100644 src/Formats/CapnProtoSchema.cpp rename src/Formats/{CapnProtoUtils.h => CapnProtoSchema.h} (59%) create mode 100644 src/Formats/CapnProtoSerializer.cpp create mode 100644 src/Formats/CapnProtoSerializer.h delete mode 100644 src/Formats/CapnProtoUtils.cpp create mode 100644 tests/queries/0_stateless/02735_capnp_case_insensitive_names_matcing.reference create mode 100755 tests/queries/0_stateless/02735_capnp_case_insensitive_names_matcing.sh create mode 100644 tests/queries/0_stateless/02736_reading_and_writing_structure_fields.reference create mode 100755 tests/queries/0_stateless/02736_reading_and_writing_structure_fields.sh create mode 100644 tests/queries/0_stateless/format_schemas/02735_case_insensitive_names_matching.capnp create mode 100644 tests/queries/0_stateless/format_schemas/02736_nested_structures.capnp diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 1df0a8af24f..2863cc9d7a7 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -962,7 +962,7 @@ class IColumn; M(Bool, output_format_orc_string_as_string, false, "Use ORC String type instead of Binary for String columns", 0) \ M(ORCCompression, output_format_orc_compression_method, "lz4", "Compression method for ORC output format. Supported codecs: lz4, snappy, zlib, zstd, none (uncompressed)", 0) \ \ - M(EnumComparingMode, format_capn_proto_enum_comparising_mode, FormatSettings::EnumComparingMode::BY_VALUES, "How to map ClickHouse Enum and CapnProto Enum", 0) \ + M(CapnProtoEnumComparingMode, format_capn_proto_enum_comparising_mode, FormatSettings::CapnProtoEnumComparingMode::BY_VALUES, "How to map ClickHouse Enum and CapnProto Enum", 0) \ \ M(String, input_format_mysql_dump_table_name, "", "Name of the table in MySQL dump from which to read data", 0) \ M(Bool, input_format_mysql_dump_map_column_names, true, "Match columns from table in MySQL dump and columns from ClickHouse table by names", 0) \ diff --git a/src/Core/SettingsEnums.cpp b/src/Core/SettingsEnums.cpp index e0f16ea00db..a291a23c140 100644 --- a/src/Core/SettingsEnums.cpp +++ b/src/Core/SettingsEnums.cpp @@ -144,10 +144,10 @@ IMPLEMENT_SETTING_ENUM(TransactionsWaitCSNMode, ErrorCodes::BAD_ARGUMENTS, {"wait", TransactionsWaitCSNMode::WAIT}, {"wait_unknown", TransactionsWaitCSNMode::WAIT_UNKNOWN}}) -IMPLEMENT_SETTING_ENUM(EnumComparingMode, ErrorCodes::BAD_ARGUMENTS, - {{"by_names", FormatSettings::EnumComparingMode::BY_NAMES}, - {"by_values", FormatSettings::EnumComparingMode::BY_VALUES}, - {"by_names_case_insensitive", FormatSettings::EnumComparingMode::BY_NAMES_CASE_INSENSITIVE}}) +IMPLEMENT_SETTING_ENUM(CapnProtoEnumComparingMode, ErrorCodes::BAD_ARGUMENTS, + {{"by_names", FormatSettings::CapnProtoEnumComparingMode::BY_NAMES}, + {"by_values", FormatSettings::CapnProtoEnumComparingMode::BY_VALUES}, + {"by_names_case_insensitive", FormatSettings::CapnProtoEnumComparingMode::BY_NAMES_CASE_INSENSITIVE}}) IMPLEMENT_SETTING_ENUM(EscapingRule, ErrorCodes::BAD_ARGUMENTS, {{"None", FormatSettings::EscapingRule::None}, diff --git a/src/Core/SettingsEnums.h b/src/Core/SettingsEnums.h index 3ae7bfaa673..1c5be910ef7 100644 --- a/src/Core/SettingsEnums.h +++ b/src/Core/SettingsEnums.h @@ -188,7 +188,7 @@ enum class TransactionsWaitCSNMode DECLARE_SETTING_ENUM(TransactionsWaitCSNMode) -DECLARE_SETTING_ENUM_WITH_RENAME(EnumComparingMode, FormatSettings::EnumComparingMode) +DECLARE_SETTING_ENUM_WITH_RENAME(CapnProtoEnumComparingMode, FormatSettings::CapnProtoEnumComparingMode) DECLARE_SETTING_ENUM_WITH_RENAME(EscapingRule, FormatSettings::EscapingRule) diff --git a/src/Formats/CapnProtoSchema.cpp b/src/Formats/CapnProtoSchema.cpp new file mode 100644 index 00000000000..22518d5061a --- /dev/null +++ b/src/Formats/CapnProtoSchema.cpp @@ -0,0 +1,298 @@ +#include + +#if USE_CAPNP + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int CANNOT_PARSE_CAPN_PROTO_SCHEMA; + extern const int BAD_TYPE_OF_FIELD; + extern const int FILE_DOESNT_EXIST; + extern const int UNKNOWN_EXCEPTION; + extern const int CAPN_PROTO_BAD_TYPE; + extern const int BAD_ARGUMENTS; +} + +capnp::StructSchema CapnProtoSchemaParser::getMessageSchema(const FormatSchemaInfo & schema_info) +{ + capnp::ParsedSchema schema; + try + { + int fd; + KJ_SYSCALL(fd = open(schema_info.schemaDirectory().data(), O_RDONLY)); // NOLINT(bugprone-suspicious-semicolon) + auto schema_dir = kj::newDiskDirectory(kj::OsFileHandle(fd)); + schema = impl.parseFromDirectory(*schema_dir, kj::Path::parse(schema_info.schemaPath()), {}); + } + catch (const kj::Exception & e) + { + /// That's not good to determine the type of error by its description, but + /// this is the only way to do it here, because kj doesn't specify the type of error. + auto description = std::string_view(e.getDescription().cStr()); + if (description.find("No such file or directory") != String::npos || description.find("no such directory") != String::npos) + throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "Cannot open CapnProto schema, file {} doesn't exists", schema_info.absoluteSchemaPath()); + + if (description.find("Parse error") != String::npos) + throw Exception(ErrorCodes::CANNOT_PARSE_CAPN_PROTO_SCHEMA, "Cannot parse CapnProto schema {}:{}", schema_info.schemaPath(), e.getLine()); + + throw Exception(ErrorCodes::UNKNOWN_EXCEPTION, + "Unknown exception while parsing CapnProto schema: {}, schema dir and file: {}, {}", + description, schema_info.schemaDirectory(), schema_info.schemaPath()); + } + + auto message_maybe = schema.findNested(schema_info.messageName()); + auto * message_schema = kj::_::readMaybe(message_maybe); + if (!message_schema) + throw Exception(ErrorCodes::CANNOT_PARSE_CAPN_PROTO_SCHEMA, + "CapnProto schema doesn't contain message with name {}", schema_info.messageName()); + return message_schema->asStruct(); +} + +bool checkIfStructContainsUnnamedUnion(const capnp::StructSchema & struct_schema) +{ + return struct_schema.getFields().size() != struct_schema.getNonUnionFields().size(); +} + +bool checkIfStructIsNamedUnion(const capnp::StructSchema & struct_schema) +{ + return struct_schema.getFields().size() == struct_schema.getUnionFields().size(); +} + +/// Get full name of type for better exception messages. +String getCapnProtoFullTypeName(const capnp::Type & type) +{ + static const std::map capnp_simple_type_names = + { + {capnp::schema::Type::Which::BOOL, "Bool"}, + {capnp::schema::Type::Which::VOID, "Void"}, + {capnp::schema::Type::Which::INT8, "Int8"}, + {capnp::schema::Type::Which::INT16, "Int16"}, + {capnp::schema::Type::Which::INT32, "Int32"}, + {capnp::schema::Type::Which::INT64, "Int64"}, + {capnp::schema::Type::Which::UINT8, "UInt8"}, + {capnp::schema::Type::Which::UINT16, "UInt16"}, + {capnp::schema::Type::Which::UINT32, "UInt32"}, + {capnp::schema::Type::Which::UINT64, "UInt64"}, + {capnp::schema::Type::Which::FLOAT32, "Float32"}, + {capnp::schema::Type::Which::FLOAT64, "Float64"}, + {capnp::schema::Type::Which::TEXT, "Text"}, + {capnp::schema::Type::Which::DATA, "Data"}, + {capnp::schema::Type::Which::INTERFACE, "Interface"}, + {capnp::schema::Type::Which::ANY_POINTER, "AnyPointer"}, + }; + + switch (type.which()) + { + case capnp::schema::Type::Which::STRUCT: + { + auto struct_schema = type.asStruct(); + + auto non_union_fields = struct_schema.getNonUnionFields(); + std::vector non_union_field_names; + for (auto nested_field : non_union_fields) + non_union_field_names.push_back(String(nested_field.getProto().getName()) + " " + getCapnProtoFullTypeName(nested_field.getType())); + + auto union_fields = struct_schema.getUnionFields(); + std::vector union_field_names; + for (auto nested_field : union_fields) + union_field_names.push_back(String(nested_field.getProto().getName()) + " " + getCapnProtoFullTypeName(nested_field.getType())); + + String union_name = "Union(" + boost::algorithm::join(union_field_names, ", ") + ")"; + /// Check if the struct is a named union. + if (non_union_field_names.empty()) + return union_name; + + String type_name = "Struct(" + boost::algorithm::join(non_union_field_names, ", "); + /// Check if the struct contains unnamed union. + if (!union_field_names.empty()) + type_name += ", " + union_name; + type_name += ")"; + return type_name; + } + case capnp::schema::Type::Which::LIST: + return "List(" + getCapnProtoFullTypeName(type.asList().getElementType()) + ")"; + case capnp::schema::Type::Which::ENUM: + { + auto enum_schema = type.asEnum(); + String enum_name = "Enum("; + auto enumerants = enum_schema.getEnumerants(); + for (unsigned i = 0; i != enumerants.size(); ++i) + { + enum_name += String(enumerants[i].getProto().getName()) + " = " + std::to_string(enumerants[i].getOrdinal()); + if (i + 1 != enumerants.size()) + enum_name += ", "; + } + enum_name += ")"; + return enum_name; + } + default: + auto it = capnp_simple_type_names.find(type.which()); + if (it == capnp_simple_type_names.end()) + throw Exception(ErrorCodes::BAD_TYPE_OF_FIELD, "Unknown CapnProto type"); + return it->second; + } +} + +namespace +{ + + template + static DataTypePtr getEnumDataTypeFromEnumerants(const capnp::EnumSchema::EnumerantList & enumerants) + { + std::vector> values; + for (auto enumerant : enumerants) + values.emplace_back(enumerant.getProto().getName(), ValueType(enumerant.getOrdinal())); + return std::make_shared>(std::move(values)); + } + + static DataTypePtr getEnumDataTypeFromEnumSchema(const capnp::EnumSchema & enum_schema) + { + auto enumerants = enum_schema.getEnumerants(); + if (enumerants.size() < 128) + return getEnumDataTypeFromEnumerants(enumerants); + if (enumerants.size() < 32768) + return getEnumDataTypeFromEnumerants(enumerants); + + throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "ClickHouse supports only 8 and 16-bit Enums"); + } + + static DataTypePtr getDataTypeFromCapnProtoType(const capnp::Type & capnp_type, bool skip_unsupported_fields) + { + switch (capnp_type.which()) + { + case capnp::schema::Type::INT8: + return std::make_shared(); + case capnp::schema::Type::INT16: + return std::make_shared(); + case capnp::schema::Type::INT32: + return std::make_shared(); + case capnp::schema::Type::INT64: + return std::make_shared(); + case capnp::schema::Type::BOOL: [[fallthrough]]; + case capnp::schema::Type::UINT8: + return std::make_shared(); + case capnp::schema::Type::UINT16: + return std::make_shared(); + case capnp::schema::Type::UINT32: + return std::make_shared(); + case capnp::schema::Type::UINT64: + return std::make_shared(); + case capnp::schema::Type::FLOAT32: + return std::make_shared(); + case capnp::schema::Type::FLOAT64: + return std::make_shared(); + case capnp::schema::Type::DATA: [[fallthrough]]; + case capnp::schema::Type::TEXT: + return std::make_shared(); + case capnp::schema::Type::ENUM: + return getEnumDataTypeFromEnumSchema(capnp_type.asEnum()); + case capnp::schema::Type::LIST: + { + auto list_schema = capnp_type.asList(); + auto nested_type = getDataTypeFromCapnProtoType(list_schema.getElementType(), skip_unsupported_fields); + if (!nested_type) + return nullptr; + return std::make_shared(nested_type); + } + case capnp::schema::Type::STRUCT: + { + auto struct_schema = capnp_type.asStruct(); + + + if (struct_schema.getFields().size() == 0) + { + if (skip_unsupported_fields) + return nullptr; + throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Empty messages are not supported"); + } + + /// Check if it can be Nullable. + if (checkIfStructIsNamedUnion(struct_schema)) + { + auto fields = struct_schema.getUnionFields(); + if (fields.size() != 2 || (!fields[0].getType().isVoid() && !fields[1].getType().isVoid())) + { + if (skip_unsupported_fields) + return nullptr; + throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Unions are not supported"); + } + auto value_type = fields[0].getType().isVoid() ? fields[1].getType() : fields[0].getType(); + if (value_type.isStruct() || value_type.isList()) + { + if (skip_unsupported_fields) + return nullptr; + throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Tuples and Lists cannot be inside Nullable"); + } + + auto nested_type = getDataTypeFromCapnProtoType(value_type, skip_unsupported_fields); + if (!nested_type) + return nullptr; + return std::make_shared(nested_type); + } + + if (checkIfStructContainsUnnamedUnion(struct_schema)) + throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Unnamed union is not supported"); + + /// Treat Struct as Tuple. + DataTypes nested_types; + Names nested_names; + for (auto field : struct_schema.getNonUnionFields()) + { + auto nested_type = getDataTypeFromCapnProtoType(field.getType(), skip_unsupported_fields); + if (!nested_type) + continue; + nested_names.push_back(field.getProto().getName()); + nested_types.push_back(nested_type); + } + if (nested_types.empty()) + return nullptr; + return std::make_shared(std::move(nested_types), std::move(nested_names)); + } + default: + { + if (skip_unsupported_fields) + return nullptr; + throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Unsupported CapnProtoType: {}", getCapnProtoFullTypeName(capnp_type)); + } + } +} + +} + +NamesAndTypesList capnProtoSchemaToCHSchema(const capnp::StructSchema & schema, bool skip_unsupported_fields) +{ + if (checkIfStructContainsUnnamedUnion(schema)) + throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Unnamed union is not supported"); + + NamesAndTypesList names_and_types; + for (auto field : schema.getNonUnionFields()) + { + auto name = field.getProto().getName(); + auto type = getDataTypeFromCapnProtoType(field.getType(), skip_unsupported_fields); + if (type) + names_and_types.emplace_back(name, type); + } + if (names_and_types.empty()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Cannot convert CapnProto schema to ClickHouse table schema, all fields have unsupported types"); + + return names_and_types; +} + +} + +#endif diff --git a/src/Formats/CapnProtoUtils.h b/src/Formats/CapnProtoSchema.h similarity index 59% rename from src/Formats/CapnProtoUtils.h rename to src/Formats/CapnProtoSchema.h index 2d8cdb418d7..225f6f56207 100644 --- a/src/Formats/CapnProtoUtils.h +++ b/src/Formats/CapnProtoSchema.h @@ -30,17 +30,14 @@ public: capnp::StructSchema getMessageSchema(const FormatSchemaInfo & schema_info); }; -std::pair splitCapnProtoFieldName(const String & name); +bool checkIfStructContainsUnnamedUnion(const capnp::StructSchema & struct_schema); +bool checkIfStructIsNamedUnion(const capnp::StructSchema & struct_schema); -bool compareEnumNames(const String & first, const String & second, FormatSettings::EnumComparingMode mode); - -std::pair getStructBuilderAndFieldByColumnName(capnp::DynamicStruct::Builder struct_builder, const String & name); - -capnp::DynamicValue::Reader getReaderByColumnName(const capnp::DynamicStruct::Reader & struct_reader, const String & name); - -void checkCapnProtoSchemaStructure(const capnp::StructSchema & schema, const Block & header, FormatSettings::EnumComparingMode mode); +/// Get full name of type for better exception messages. +String getCapnProtoFullTypeName(const capnp::Type & type); NamesAndTypesList capnProtoSchemaToCHSchema(const capnp::StructSchema & schema, bool skip_unsupported_fields); + } #endif diff --git a/src/Formats/CapnProtoSerializer.cpp b/src/Formats/CapnProtoSerializer.cpp new file mode 100644 index 00000000000..e0c8ae2a79a --- /dev/null +++ b/src/Formats/CapnProtoSerializer.cpp @@ -0,0 +1,1218 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int THERE_IS_NO_COLUMN; + extern const int BAD_TYPE_OF_FIELD; + extern const int CAPN_PROTO_BAD_CAST; + extern const int INCORRECT_DATA; + extern const int ILLEGAL_COLUMN; +} + +namespace +{ + std::pair splitFieldName(const String & name) + { + const auto * begin = name.data(); + const auto * end = name.data() + name.size(); + const auto * it = find_first_symbols<'_', '.'>(begin, end); + String first = String(begin, it); + String second = it == end ? "" : String(it + 1, end); + return {first, second}; + } + + std::optional findFieldByName(const capnp::StructSchema & struct_schema, const String & name) + { + const auto & fields = struct_schema.getFields(); + for (auto field : fields) + { + auto field_name = String(field.getProto().getName()); + if (boost::to_lower_copy(name) == boost::to_lower_copy(field_name)) + return field; + } + return std::nullopt; + } + + [[noreturn]] void throwCannotConvert(const DataTypePtr & type, const String & name, const capnp::Type & capnp_type) + { + throw Exception( + ErrorCodes::CAPN_PROTO_BAD_CAST, + "Cannot convert ClickHouse column \"{}\" with type {} to CapnProto type {}", + name, + type->getName(), + getCapnProtoFullTypeName(capnp_type)); + } + + struct FieldBuilder + { + virtual ~FieldBuilder() = default; + }; + + struct ListBuilder : public FieldBuilder + { + explicit ListBuilder(capnp::DynamicValue::Builder builder) : impl(builder.as()) + { + } + + capnp::DynamicList::Builder impl; + std::vector> nested_builders; + }; + + struct StructBuilder : public FieldBuilder + { + explicit StructBuilder(capnp::DynamicValue::Builder builder, size_t fields_size) : impl(builder.as()), field_builders(fields_size) + { + } + + explicit StructBuilder(capnp::DynamicStruct::Builder struct_builder, size_t fields_size) : impl(std::move(struct_builder)), field_builders(fields_size) + { + } + + capnp::DynamicStruct::Builder impl; + std::vector> field_builders; + }; + + std::unique_ptr initStructFieldBuilderIfNeeded(const ColumnPtr & column, size_t row_num, capnp::DynamicStruct::Builder & struct_builder, const capnp::StructSchema::Field & field, const capnp::Type & capnp_type, size_t nested_fields_size) + { + switch (capnp_type.which()) + { + case capnp::schema::Type::LIST: + { + const auto * array_column = assert_cast(column.get()); + size_t size = array_column->getOffsets()[row_num] - array_column->getOffsets()[row_num - 1]; + return std::make_unique(struct_builder.init(field, static_cast(size))); + } + case capnp::schema::Type::STRUCT: + { + return std::make_unique(struct_builder.init(field), nested_fields_size); + } + default: + return nullptr; + } + } + + class ICapnProtoSerializer + { + public: + virtual std::optional writeRow(const ColumnPtr & column, FieldBuilder * builder, size_t row_num) = 0; + virtual void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) = 0; + + virtual ~ICapnProtoSerializer() = default; + }; + + template + class CapnProtoIntegerSerializer : public ICapnProtoSerializer + { + public: + std::optional writeRow(const ColumnPtr & column, FieldBuilder *, size_t row_num) override + { + if constexpr (capnp_dynamic_type == capnp::DynamicValue::Type::INT) + return capnp::DynamicValue::Reader(column->getInt(row_num)); + if constexpr (capnp_dynamic_type == capnp::DynamicValue::Type::UINT) + return capnp::DynamicValue::Reader(column->getUInt(row_num)); + return capnp::DynamicValue::Reader(column->getBool(row_num)); + } + + void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + { + NumericType value; + if constexpr (capnp_dynamic_type == capnp::DynamicValue::Type::INT) + value = static_cast(reader.as()); + else if constexpr (capnp_dynamic_type == capnp::DynamicValue::Type::UINT) + value = static_cast(reader.as()); + else if constexpr (capnp_dynamic_type == capnp::DynamicValue::Type::BOOL) + value = static_cast(reader.as()); + + if constexpr (is_bool_data_type) + assert_cast(column).insertValue(static_cast(value)); + else + assert_cast &>(column).insertValue(value); + } + }; + + template + static std::unique_ptr createIntegerSerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type) + { + switch (capnp_type.which()) + { + case capnp::schema::Type::INT8: [[fallthrough]]; + case capnp::schema::Type::INT16: [[fallthrough]]; + case capnp::schema::Type::INT32: [[fallthrough]]; + case capnp::schema::Type::INT64: + return std::make_unique>(); + case capnp::schema::Type::UINT8: [[fallthrough]]; + case capnp::schema::Type::UINT16: [[fallthrough]]; + case capnp::schema::Type::UINT32: [[fallthrough]]; + case capnp::schema::Type::UINT64: + return std::make_unique>(); + case capnp::schema::Type::BOOL: + return std::make_unique>(); + default: + throwCannotConvert(data_type, column_name, capnp_type); + } + } + + template + class CapnProtoBigIntegerSerializer : public ICapnProtoSerializer + { + public: + CapnProtoBigIntegerSerializer(const DataTypePtr & data_type_, const String & column_name, const capnp::Type & capnp_type) : data_type(data_type_) + { + if (!capnp_type.isData()) + throwCannotConvert(data_type, column_name, capnp_type); + } + + std::optional writeRow(const ColumnPtr & column, FieldBuilder *, size_t row_num) override + { + auto data = column->getDataAt(row_num); + return capnp::DynamicValue::Reader(capnp::Data::Reader(reinterpret_cast(data.data), data.size)); + } + + void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + { + auto value = reader.as(); + if (value.size() != sizeof(NumericType)) + throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected size of {} value: {}", data_type->getName(), value.size()); + + column.insertData(reinterpret_cast(value.begin()), value.size()); + } + + private: + DataTypePtr data_type; + }; + + template + class CapnProtoFloatSerializer : public ICapnProtoSerializer + { + public: + CapnProtoFloatSerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type) + { + if (!capnp_type.isFloat32() && !capnp_type.isFloat64()) + throwCannotConvert(data_type, column_name, capnp_type); + } + + std::optional writeRow(const ColumnPtr & column, FieldBuilder *, size_t row_num) override + { + return capnp::DynamicValue::Reader(column->getFloat64(row_num)); + } + + void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + { + assert_cast &>(column).insertValue(reader.as()); + } + }; + + template + class CapnProtoEnumSerializer : public ICapnProtoSerializer + { + public: + CapnProtoEnumSerializer( + const DataTypePtr & data_type_, + const String & column_name, + const capnp::Type & capnp_type, + const FormatSettings::CapnProtoEnumComparingMode enum_comparing_mode_) : data_type(data_type_), enum_comparing_mode(enum_comparing_mode_) + { + if (!capnp_type.isEnum()) + throwCannotConvert(data_type, column_name, capnp_type); + + bool to_lower = enum_comparing_mode == FormatSettings::CapnProtoEnumComparingMode::BY_NAMES_CASE_INSENSITIVE; + const auto * enum_type = assert_cast *>(data_type.get()); + const auto & enum_values = dynamic_cast &>(*enum_type); + + enum_schema = capnp_type.asEnum(); + auto enumerants = enum_schema.getEnumerants(); + constexpr auto max_value = std::is_same_v ? INT8_MAX : INT16_MAX; + if (enum_comparing_mode == FormatSettings::CapnProtoEnumComparingMode::BY_VALUES) + { + /// In CapnProto Enum fields are numbered sequentially starting from zero. + if (enumerants.size() > max_value) + throw Exception( + ErrorCodes::CAPN_PROTO_BAD_CAST, + "Enum from CapnProto schema contains values that are out of range for Clickhouse enum type {}", + data_type->getName()); + + auto values = enum_values.getSetOfAllValues(); + std::unordered_set capn_enum_values; + for (auto enumerant : enumerants) + capn_enum_values.insert(EnumType(enumerant.getOrdinal())); + if (values != capn_enum_values) + throw Exception(ErrorCodes::CAPN_PROTO_BAD_CAST, "The set of values in Enum from CapnProto schema is different from the set of values in ClickHouse Enum"); + } + else + { + auto names = enum_values.getSetOfAllNames(to_lower); + std::unordered_set capn_enum_names; + + for (auto enumerant : enumerants) + { + String name = enumerant.getProto().getName(); + capn_enum_names.insert(to_lower ? boost::algorithm::to_lower_copy(name) : name); + } + + if (names != capn_enum_names) + throw Exception( + ErrorCodes::CAPN_PROTO_BAD_CAST, + "The set of names in Enum from CapnProto schema is different from the set of names in ClickHouse Enum"); + } + } + + std::optional writeRow(const ColumnPtr & column, FieldBuilder *, size_t row_num) override + { + const auto * enum_data_type = assert_cast *>(data_type.get()); + EnumType enum_value = assert_cast &>(*column).getElement(row_num); + if (enum_comparing_mode == FormatSettings::CapnProtoEnumComparingMode::BY_VALUES) + return capnp::DynamicValue::Reader(capnp::DynamicEnum(enum_schema, enum_value)); + + auto enum_name = enum_data_type->getNameForValue(enum_value); + for (const auto enumerant : enum_schema.getEnumerants()) + { + if (compareEnumNames(String(enum_name), enumerant.getProto().getName(), enum_comparing_mode)) + return capnp::DynamicValue::Reader(capnp::DynamicEnum(enumerant)); + } + + throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot convert CLickHouse Enum value to CapnProto Enum"); + } + + void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + { + auto enum_value = reader.as(); + auto enumerant = *kj::_::readMaybe(enum_value.getEnumerant()); + auto enum_type = assert_cast *>(data_type.get()); + DataTypePtr nested_type = std::make_shared>(); + switch (enum_comparing_mode) + { + case FormatSettings::CapnProtoEnumComparingMode::BY_VALUES: + { + assert_cast &>(column).insertValue(static_cast(enumerant.getOrdinal())); + return; + } + case FormatSettings::CapnProtoEnumComparingMode::BY_NAMES: + { + auto value = enum_type->getValue(String(enumerant.getProto().getName())); + assert_cast &>(column).insertValue(value); + return; + } + case FormatSettings::CapnProtoEnumComparingMode::BY_NAMES_CASE_INSENSITIVE: + { + /// Find the same enum name case insensitive. + String enum_name = enumerant.getProto().getName(); + for (auto & name : enum_type->getAllRegisteredNames()) + { + if (compareEnumNames(name, enum_name, enum_comparing_mode)) + { + assert_cast &>(column).insertValue(enum_type->getValue(name)); + break; + } + } + return; + } + } + } + + private: + bool compareEnumNames(const String & first, const String & second, const FormatSettings::CapnProtoEnumComparingMode mode) + { + if (mode == FormatSettings::CapnProtoEnumComparingMode::BY_NAMES_CASE_INSENSITIVE) + return boost::algorithm::to_lower_copy(first) == boost::algorithm::to_lower_copy(second); + return first == second; + } + + DataTypePtr data_type; + capnp::EnumSchema enum_schema; + const FormatSettings::CapnProtoEnumComparingMode enum_comparing_mode; + }; + + class CapnProtoDateSerializer : public ICapnProtoSerializer + { + public: + CapnProtoDateSerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type) + { + if (!capnp_type.isUInt16()) + throwCannotConvert(data_type, column_name, capnp_type); + } + + std::optional writeRow(const ColumnPtr & column, FieldBuilder *, size_t row_num) override + { + return capnp::DynamicValue::Reader(column->getUInt(row_num)); + } + + void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + { + assert_cast(column).insertValue(reader.as()); + } + }; + + class CapnProtoDate32Serializer : public ICapnProtoSerializer + { + public: + CapnProtoDate32Serializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type) + { + if (!capnp_type.isInt32()) + throwCannotConvert(data_type, column_name, capnp_type); + } + + std::optional writeRow(const ColumnPtr & column, FieldBuilder *, size_t row_num) override + { + return capnp::DynamicValue::Reader(column->getInt(row_num)); + } + + void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + { + assert_cast(column).insertValue(reader.as()); + } + }; + + class CapnProtoDateTimeSerializer : public ICapnProtoSerializer + { + public: + CapnProtoDateTimeSerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type) + { + if (!capnp_type.isUInt32()) + throwCannotConvert(data_type, column_name, capnp_type); + } + + std::optional writeRow(const ColumnPtr & column, FieldBuilder *, size_t row_num) override + { + return capnp::DynamicValue::Reader(column->getInt(row_num)); + } + + void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + { + assert_cast(column).insertValue(reader.as()); + } + }; + + class CapnProtoDateTime64Serializer : public ICapnProtoSerializer + { + public: + CapnProtoDateTime64Serializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type) + { + if (!capnp_type.isInt64()) + throwCannotConvert(data_type, column_name, capnp_type); + } + + std::optional writeRow(const ColumnPtr & column, FieldBuilder *, size_t row_num) override + { + return capnp::DynamicValue::Reader(column->getInt(row_num)); + } + + void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + { + assert_cast(column).insertValue(reader.as()); + } + }; + + template + class CapnProtoDecimalSerializer : public ICapnProtoSerializer + { + public: + CapnProtoDecimalSerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type) + { + auto which = WhichDataType(data_type); + if ((!capnp_type.isInt32() && which.isDecimal32()) || (!capnp_type.isInt64() && which.isDecimal64())) + throwCannotConvert(data_type, column_name, capnp_type); + } + + std::optional writeRow(const ColumnPtr & column, FieldBuilder *, size_t row_num) override + { + return capnp::DynamicValue::Reader(column->getInt(row_num)); + } + + void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + { + assert_cast &>(column).insertValue(reader.as()); + } + }; + + template + class CapnProtoBigDecimalSerializer : public ICapnProtoSerializer + { + public: + CapnProtoBigDecimalSerializer(const DataTypePtr & data_type_, const String & column_name, const capnp::Type & capnp_type) : data_type(data_type_) + { + if (!capnp_type.isData()) + throwCannotConvert(data_type, column_name, capnp_type); + } + + std::optional writeRow(const ColumnPtr & column, FieldBuilder *, size_t row_num) override + { + auto data = column->getDataAt(row_num); + return capnp::DynamicValue::Reader(capnp::Data::Reader(reinterpret_cast(data.data), data.size)); + } + + void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + { + auto value = reader.as(); + if (value.size() != sizeof(DecimalType)) + throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected size of {} value: {}", data_type->getName(), value.size()); + + column.insertData(reinterpret_cast(value.begin()), value.size()); + } + + private: + DataTypePtr data_type; + }; + + template + class CapnProtoStringSerializer : public ICapnProtoSerializer + { + public: + CapnProtoStringSerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type_) : capnp_type(capnp_type_) + { + if (!capnp_type.isData() && !capnp_type.isText()) + throwCannotConvert(data_type, column_name, capnp_type); + } + + std::optional writeRow(const ColumnPtr & column, FieldBuilder *, size_t row_num) override + { + auto data = column->getDataAt(row_num); + + if constexpr (is_binary) + return capnp::DynamicValue::Reader(capnp::Data::Reader(reinterpret_cast(data.data), data.size)); + + /// For type TEXT data must be null-terminated, but in String column we always have 0 byte at the end of each value. + return capnp::DynamicValue::Reader(capnp::Text::Reader(data.data, data.size)); + } + + void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + { + if constexpr (is_binary) + { + auto value = reader.as(); + column.insertData(reinterpret_cast(value.begin()), value.size()); + } + else + { + auto value = reader.as(); + column.insertData(reinterpret_cast(value.begin()), value.size()); + } + } + + private: + capnp::Type capnp_type; + }; + + template + class CapnProtoFixedStringSerializer : public ICapnProtoSerializer + { + public: + CapnProtoFixedStringSerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type_) : capnp_type(capnp_type_) + { + if (!capnp_type.isData() && !capnp_type.isText()) + throwCannotConvert(data_type, column_name, capnp_type); + } + + std::optional writeRow(const ColumnPtr & column, FieldBuilder *, size_t row_num) override + { + auto data = column->getDataAt(row_num); + if constexpr (is_binary) + return capnp::DynamicValue::Reader(capnp::Data::Reader(reinterpret_cast(data.data), data.size)); + + if (data.data[data.size - 1] == 0) + return capnp::DynamicValue::Reader(capnp::Text::Reader(reinterpret_cast(data.data), data.size)); + + /// In TEXT type data should be null-terminated, but ClickHouse FixedString data could not be. + /// To make data null-terminated we should copy it to temporary String object and use it in capnp::Text::Reader. + /// Note that capnp::Text::Reader works only with pointer to the data and it's size, so we should + /// guarantee that new String object life time is longer than capnp::Text::Reader life time. + tmp_string = data.toString(); + return capnp::DynamicValue::Reader(capnp::Text::Reader(reinterpret_cast(tmp_string.data()), tmp_string.size())); + } + + void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + { + auto & fixed_string_column = assert_cast(column); + if constexpr (is_binary) + { + auto value = reader.as(); + if (value.size() > fixed_string_column.getN()) + throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot read data with size {} to FixedString with size {}", value.size(), fixed_string_column.getN()); + + fixed_string_column.insertData(reinterpret_cast(value.begin()), value.size()); + } + else + { + auto value = reader.as(); + if (value.size() > fixed_string_column.getN()) + throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot read data with size {} to FixedString with size {}", value.size(), fixed_string_column.getN()); + + fixed_string_column.insertData(reinterpret_cast(value.begin()), value.size()); + } + } + + private: + String tmp_string; + capnp::Type capnp_type; + }; + + class CapnProtoIPv4Serializer : public ICapnProtoSerializer + { + public: + CapnProtoIPv4Serializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type) + { + if (!capnp_type.isUInt32()) + throwCannotConvert(data_type, column_name, capnp_type); + } + + std::optional writeRow(const ColumnPtr & column, FieldBuilder *, size_t row_num) override + { + return capnp::DynamicValue::Reader(assert_cast(*column).getElement(row_num).toUnderType()); + } + + void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + { + assert_cast(column).insertValue(IPv4(reader.as())); + } + }; + + class CapnProtoIPv6Serializer : public ICapnProtoSerializer + { + public: + CapnProtoIPv6Serializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type) + { + if (!capnp_type.isData()) + throwCannotConvert(data_type, column_name, capnp_type); + } + + std::optional writeRow(const ColumnPtr & column, FieldBuilder *, size_t row_num) override + { + auto data = column->getDataAt(row_num); + return capnp::DynamicValue::Reader(capnp::Data::Reader(reinterpret_cast(data.data), data.size)); + } + + void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + { + auto value = reader.as(); + if (value.size() != sizeof(IPv6)) + throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected size of IPv6 value: {}", value.size()); + + column.insertData(reinterpret_cast(value.begin()), value.size()); + } + }; + + class CapnProtoUUIDSerializer : public ICapnProtoSerializer + { + public: + CapnProtoUUIDSerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type) + { + if (!capnp_type.isData()) + throwCannotConvert(data_type, column_name, capnp_type); + } + + std::optional writeRow(const ColumnPtr & column, FieldBuilder *, size_t row_num) override + { + auto data = column->getDataAt(row_num); + return capnp::DynamicValue::Reader(capnp::Data::Reader(reinterpret_cast(data.data), data.size)); + } + + void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + { + auto value = reader.as(); + if (value.size() != sizeof(UUID)) + throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected size of UUID value: {}", value.size()); + + column.insertData(reinterpret_cast(value.begin()), value.size()); + } + }; + + std::unique_ptr createSerializer(const DataTypePtr & type, const String & name, const capnp::Type & capnp_type, const FormatSettings::CapnProto & settings); + + class CapnProtoLowCardinalitySerializer : public ICapnProtoSerializer + { + public: + CapnProtoLowCardinalitySerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type, const FormatSettings::CapnProto & settings) + { + nested_serializer = createSerializer(assert_cast(*data_type).getDictionaryType(), column_name, capnp_type, settings); + } + + std::optional writeRow(const ColumnPtr & column, FieldBuilder * field_builder, size_t row_num) override + { + const auto & low_cardinality_column = assert_cast(*column); + size_t index = low_cardinality_column.getIndexAt(row_num); + const auto & dict_column = low_cardinality_column.getDictionary().getNestedColumn(); + return nested_serializer->writeRow(dict_column, field_builder, index); + } + + void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + { + auto & low_cardinality_column = assert_cast(column); + auto tmp_column = low_cardinality_column.getDictionary().getNestedColumn()->cloneEmpty(); + nested_serializer->readRow(*tmp_column, reader); + low_cardinality_column.insertFromFullColumn(*tmp_column, 0); + } + + private: + std::unique_ptr nested_serializer; + }; + + class CapnProtoNullableSerializer : public ICapnProtoSerializer + { + public: + CapnProtoNullableSerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type, const FormatSettings::CapnProto & settings) + { + if (!capnp_type.isStruct()) + throw Exception( + ErrorCodes::CAPN_PROTO_BAD_CAST, + "Cannot convert column \"{}\": Nullable can be represented only as a named union of type Void and nested type, got CapnProto type {}", + column_name, + getCapnProtoFullTypeName(capnp_type)); + + /// Check that struct is a named union of type VOID and one arbitrary type. + auto struct_schema = capnp_type.asStruct(); + if (!checkIfStructIsNamedUnion(struct_schema)) + throw Exception( + ErrorCodes::CAPN_PROTO_BAD_CAST, + "Cannot convert column \"{}\": Nullable can be represented only as a named union of type Void and nested type." + "Given CapnProto struct is not a named union: {}", + column_name, + getCapnProtoFullTypeName(capnp_type)); + + auto union_fields = struct_schema.getUnionFields(); + if (union_fields.size() != 2) + throw Exception( + ErrorCodes::CAPN_PROTO_BAD_CAST, + "Cannot convert column \"{}\": Nullable can be represented only as a named union of type Void and nested type." + "Given CapnProto union have more than 2 fields: {}", + column_name, + getCapnProtoFullTypeName(capnp_type)); + + auto first = union_fields[0]; + auto second = union_fields[1]; + auto nested_type = assert_cast(data_type.get())->getNestedType(); + if (first.getType().isVoid()) + { + null_field = first; + nested_field = second; + nested_capnp_type = second.getType(); + if (nested_capnp_type.isStruct()) + nested_fields_size = nested_capnp_type.asStruct().getFields().size(); + nested_serializer = createSerializer(nested_type, column_name, nested_capnp_type, settings); + } + else if (second.getType().isVoid()) + { + null_field = second; + nested_field = first; + nested_capnp_type = first.getType(); + if (nested_capnp_type.isStruct()) + nested_fields_size = nested_capnp_type.asStruct().getFields().size(); + nested_serializer = createSerializer(nested_type, column_name, nested_capnp_type, settings); + } + else + throw Exception( + ErrorCodes::CAPN_PROTO_BAD_CAST, + "Cannot convert column \"{}\": Nullable can be represented only as a named union of type Void and nested type." + "Given CapnProto union doesn't have field with type Void: {}", + column_name, + getCapnProtoFullTypeName(capnp_type)); + } + + std::optional writeRow(const ColumnPtr & column, FieldBuilder * field_builder, size_t row_num) override + { + assert(field_builder); + auto & struct_builder = assert_cast(*field_builder); + const auto & nullable_column = assert_cast(*column); + if (nullable_column.isNullAt(row_num)) + { + struct_builder.impl.set(null_field, capnp::Void()); + } + else + { + struct_builder.impl.clear(nested_field); + const auto & nested_column = nullable_column.getNestedColumnPtr(); + auto nested_field_builder = initStructFieldBuilderIfNeeded(nested_column, row_num, struct_builder.impl, nested_field, nested_capnp_type, nested_fields_size); + auto value = nested_serializer->writeRow(nested_column, nested_field_builder.get(), row_num); + if (value) + struct_builder.impl.set(nested_field, *value); + } + + return std::nullopt; + } + + void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + { + auto struct_reader = reader.as(); + auto & nullable_column = assert_cast(column); + auto field = *kj::_::readMaybe(struct_reader.which()); + if (field.getType().isVoid()) + nullable_column.insertDefault(); + else + { + auto & nested_column = nullable_column.getNestedColumn(); + auto nested_reader = struct_reader.get(field); + nested_serializer->readRow(nested_column, nested_reader); + nullable_column.getNullMapData().push_back(0); + } + } + + private: + std::unique_ptr nested_serializer; + capnp::StructSchema::Field null_field; + capnp::StructSchema::Field nested_field; + size_t nested_fields_size = 0; + capnp::Type nested_capnp_type; + }; + + class CapnProtoArraySerializer : public ICapnProtoSerializer + { + public: + CapnProtoArraySerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type, const FormatSettings::CapnProto & settings) + { + if (!capnp_type.isList()) + throwCannotConvert(data_type, column_name, capnp_type); + + auto nested_type = assert_cast(data_type.get())->getNestedType(); + element_type = capnp_type.asList().getElementType(); + if (element_type.isStruct()) + element_struct_fields = element_type.asStruct().getFields().size(); + nested_serializer = createSerializer(nested_type, column_name, capnp_type.asList().getElementType(), settings); + } + + std::optional writeRow(const ColumnPtr & column, FieldBuilder * field_builder, size_t row_num) override + { + assert(field_builder); + auto & list_builder = assert_cast(*field_builder); + const auto * array_column = assert_cast(column.get()); + const auto & nested_column = array_column->getDataPtr(); + const auto & offsets = array_column->getOffsets(); + auto offset = offsets[row_num - 1]; + size_t size = offsets[row_num] - offset; + bool need_nested_builders = list_builder.nested_builders.empty(); + for (unsigned i = 0; i != static_cast(size); ++i) + { + if (need_nested_builders) + { + /// For nested lists we need to initialize nested list builder. + if (element_type.isList()) + { + const auto & nested_offset = checkAndGetColumn(*nested_column)->getOffsets(); + size_t nested_array_size = nested_offset[offset + i] - nested_offset[offset + i - 1]; + list_builder.nested_builders.emplace_back(std::make_unique(list_builder.impl.init(i, static_cast(nested_array_size)))); + } + else if (element_type.isStruct()) + { + list_builder.nested_builders.emplace_back(std::make_unique(list_builder.impl[i], element_struct_fields)); + } + else + { + list_builder.nested_builders.emplace_back(); + } + } + + auto value = nested_serializer->writeRow(nested_column, list_builder.nested_builders[i].get(), offset + i); + if (value) + list_builder.impl.set(i, *value); + } + + return std::nullopt; + } + + void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + { + auto list_reader = reader.as(); + auto & column_array = assert_cast(column); + auto & offsets = column_array.getOffsets(); + offsets.push_back(offsets.back() + list_reader.size()); + + auto & nested_column = column_array.getData(); + for (const auto & nested_reader : list_reader) + nested_serializer->readRow(nested_column, nested_reader); + } + + private: + std::unique_ptr nested_serializer; + capnp::Type element_type; + size_t element_struct_fields; + }; + + class CapnProtoMapSerializer : public ICapnProtoSerializer + { + public: + CapnProtoMapSerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type, const FormatSettings::CapnProto & settings) + { + /// We output/input Map type as follow CapnProto schema + /// + /// struct Map { + /// struct Entry { + /// key @0: Key; + /// value @1: Value; + /// } + /// entries @0 :List(Entry); + /// } + + if (!capnp_type.isStruct()) + throwCannotConvert(data_type, column_name, capnp_type); + + auto struct_schema = capnp_type.asStruct(); + + if (checkIfStructContainsUnnamedUnion(struct_schema)) + throw Exception( + ErrorCodes::CAPN_PROTO_BAD_CAST, + "Cannot convert ClickHouse column \"{}\" with type {} to CapnProto Struct with unnamed union {}", + column_name, + data_type->getName(), + getCapnProtoFullTypeName(capnp_type)); + + if (struct_schema.getFields().size() != 1) + throw Exception( + ErrorCodes::CAPN_PROTO_BAD_CAST, + "Cannot convert ClickHouse column \"{}\": Map type can be represented as a Struct with one list field, got struct: {}", + column_name, + getCapnProtoFullTypeName(capnp_type)); + + const auto & field_type = struct_schema.getFields()[0].getType(); + if (!field_type.isList()) + throw Exception( + ErrorCodes::CAPN_PROTO_BAD_CAST, + "Cannot convert ClickHouse column \"{}\": Map type can be represented as a Struct with one list field, got field: {}", + column_name, + getCapnProtoFullTypeName(field_type)); + + auto list_element_type = field_type.asList().getElementType(); + if (!list_element_type.isStruct()) + throw Exception( + ErrorCodes::CAPN_PROTO_BAD_CAST, + "Cannot convert ClickHouse column \"{}\": Field of struct that represents Map should be a list of structs, got list of {}", + column_name, + getCapnProtoFullTypeName(list_element_type)); + + auto key_value_struct = list_element_type.asStruct(); + if (checkIfStructContainsUnnamedUnion(key_value_struct)) + throw Exception( + ErrorCodes::CAPN_PROTO_BAD_CAST, + "Cannot convert ClickHouse column \"{}\": struct that represents Map entries is unnamed union: {}", + column_name, + getCapnProtoFullTypeName(list_element_type)); + + if (key_value_struct.getFields().size() != 2) + throw Exception( + ErrorCodes::CAPN_PROTO_BAD_CAST, + "Cannot convert ClickHouse column \"{}\": struct that represents Map entries should contain only 2 fields, got struct {}", + column_name, + getCapnProtoFullTypeName(list_element_type)); + + const auto & map_type = assert_cast(*data_type); + DataTypes types = {map_type.getKeyType(), map_type.getValueType()}; + Names names = {"key", "value"}; + auto entries_type = std::make_shared(std::make_shared(types, names)); + entries_field = struct_schema.getFields()[0]; + entries_capnp_type = entries_field.getType(); + nested_serializer = createSerializer(entries_type, column_name, field_type, settings); + } + + std::optional writeRow(const ColumnPtr & column, FieldBuilder * field_builder, size_t row_num) override + { + assert(field_builder); + auto & struct_builder = assert_cast(*field_builder); + const auto & entries_column = assert_cast(column.get())->getNestedColumnPtr(); + auto entries_builder = initStructFieldBuilderIfNeeded(entries_column, row_num, struct_builder.impl, entries_field, entries_capnp_type, 0); + nested_serializer->writeRow(entries_column, entries_builder.get(), row_num); + return std::nullopt; + } + + void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + { + auto struct_reader = reader.as(); + auto & entries_column = assert_cast(column).getNestedColumn(); + nested_serializer->readRow(entries_column, struct_reader.get(entries_field)); + } + + private: + std::unique_ptr nested_serializer; + capnp::StructSchema::Field entries_field; + capnp::Type entries_capnp_type; + }; + + class CapnProtoStructureSerializer : public ICapnProtoSerializer + { + public: + CapnProtoStructureSerializer(const DataTypes & data_types, const Names & names, const capnp::StructSchema & schema, const FormatSettings::CapnProto & settings) + { + if (checkIfStructIsNamedUnion(schema) || checkIfStructContainsUnnamedUnion(schema)) + throw Exception(ErrorCodes::CAPN_PROTO_BAD_CAST, "Root CapnProto Struct cannot be named union/struct with unnamed union"); + + initialize(data_types, names, schema, settings); + } + + CapnProtoStructureSerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type, const FormatSettings::CapnProto & settings) + { + if (!capnp_type.isStruct()) + throwCannotConvert(data_type, column_name, capnp_type); + + auto struct_schema = capnp_type.asStruct(); + + if (checkIfStructIsNamedUnion(struct_schema) || checkIfStructContainsUnnamedUnion(struct_schema)) + throw Exception( + ErrorCodes::CAPN_PROTO_BAD_CAST, + "Cannot convert ClickHouse column \"{}\" with type {} to CapnProto named union/struct with unnamed union {}", + column_name, + data_type->getName(), + getCapnProtoFullTypeName(capnp_type)); + + const auto * tuple_data_type = assert_cast(data_type.get()); + auto nested_types = tuple_data_type->getElements(); + Names nested_names; + bool have_explicit_names = tuple_data_type->haveExplicitNames(); + auto structure_fields = struct_schema.getFields(); + if (!have_explicit_names) + { + if (nested_types.size() != structure_fields.size()) + throw Exception( + ErrorCodes::CAPN_PROTO_BAD_CAST, + "Cannot convert ClickHouse column \"{}\" with type {} to CapnProto type {}: Tuple and Struct have different sizes {} != {}", + column_name, + data_type->getName(), + getCapnProtoFullTypeName(capnp_type), + nested_types.size(), + structure_fields.size()); + nested_names.reserve(structure_fields.size()); + for (auto field : structure_fields) + nested_names.push_back(field.getProto().getName()); + } + else + { + nested_names = tuple_data_type->getElementNames(); + } + + try + { + initialize(nested_types, nested_names, struct_schema, settings); + } + catch (Exception & e) + { + e.addMessage("(while converting column {})", column_name); + throw e; + } + } + + std::optional writeRow(const ColumnPtr & column, FieldBuilder * builder, size_t row_num) override + { + assert(builder); + auto & struct_builder = assert_cast(*builder); + if (auto tuple_column = typeid_cast(column.get())) + writeRow(tuple_column->getColumnsCopy(), struct_builder, row_num); + else + writeRow(Columns{column}, struct_builder, row_num); + return std::nullopt; + } + + void writeRow(const Columns & columns, StructBuilder & struct_builder, size_t row_num) + { + for (size_t i = 0; i != columns.size(); ++i) + { + const auto & field = fields[i]; + size_t field_index = field.getIndex(); + if (likely(!struct_builder.field_builders[field_index])) + struct_builder.field_builders[field_index] = initStructFieldBuilderIfNeeded( + columns[i], row_num, struct_builder.impl, field, fields_types[i], nested_field_sizes[i]); + + auto value = field_serializers[i]->writeRow(columns[i], struct_builder.field_builders[field_index].get(), row_num); + if (value) + struct_builder.impl.set(field, *value); + } + } + + void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + { + auto struct_reader = reader.as(); + if (auto * tuple_column = typeid_cast(&column)) + { + for (size_t i = 0; i != tuple_column->tupleSize(); ++i) + field_serializers[i]->readRow(tuple_column->getColumn(i), struct_reader.get(fields[i])); + } + else + field_serializers[0]->readRow(column, struct_reader.get(fields[0])); + } + + void readRow(MutableColumns & columns, const capnp::DynamicStruct::Reader & reader) + { + for (size_t i = 0; i != columns.size(); ++i) + field_serializers[i]->readRow(*columns[i], reader.get(fields[i])); + } + + private: + void initialize(const DataTypes & data_types, const Names & names, const capnp::StructSchema & schema, const FormatSettings::CapnProto & settings) + { + field_serializers.reserve(data_types.size()); + fields.reserve(data_types.size()); + fields_types.reserve(data_types.size()); + nested_field_sizes.reserve(data_types.size()); + for (size_t i = 0; i != data_types.size(); ++i) + { + auto [field_name, _] = splitFieldName(names[i]); + auto field = findFieldByName(schema, field_name); + if (!field) + throw Exception(ErrorCodes::THERE_IS_NO_COLUMN, "Capnproto schema doesn't contain field with name {}", field_name); + + fields.push_back(*field); + auto capnp_type = field->getType(); + fields_types.push_back(capnp_type); + nested_field_sizes.push_back(capnp_type.isStruct() ? capnp_type.asStruct().getFields().size() : 0); + field_serializers.push_back(createSerializer(data_types[i], names[i], capnp_type, settings)); + } + } + + std::vector> field_serializers; + std::vector fields; + std::vector nested_field_sizes; + std::vector fields_types; + }; + + std::unique_ptr createSerializer(const DataTypePtr & type, const String & name, const capnp::Type & capnp_type, const FormatSettings::CapnProto & settings) + { + auto [field_name, nested_name] = splitFieldName(name); + if (!nested_name.empty() && !capnp_type.isList()) + { + if (!capnp_type.isStruct()) + throw Exception(ErrorCodes::CAPN_PROTO_BAD_CAST, "Field {} is not a struct", field_name); + + return std::make_unique(DataTypes{type}, Names{nested_name}, capnp_type.asStruct(), settings); + } + + switch (type->getTypeId()) + { + case TypeIndex::Int8: + return createIntegerSerializer(type, name, capnp_type); + case TypeIndex::UInt8: + if (isBool(type)) + return createIntegerSerializer(type, name, capnp_type); + return createIntegerSerializer(type, name, capnp_type); + case TypeIndex::Int16: + return createIntegerSerializer(type, name, capnp_type); + case TypeIndex::UInt16: + return createIntegerSerializer(type, name, capnp_type); + case TypeIndex::Int32: + return createIntegerSerializer(type, name, capnp_type); + case TypeIndex::UInt32: + return createIntegerSerializer(type, name, capnp_type); + case TypeIndex::Int64: + return createIntegerSerializer(type, name, capnp_type); + case TypeIndex::UInt64: + return createIntegerSerializer(type, name, capnp_type); + case TypeIndex::Int128: + return std::make_unique>(type, name, capnp_type); + case TypeIndex::UInt128: + return std::make_unique>(type, name, capnp_type); + case TypeIndex::Int256: + return std::make_unique>(type, name, capnp_type); + case TypeIndex::UInt256: + return std::make_unique>(type, name, capnp_type); + case TypeIndex::Float32: + return std::make_unique>(type, name, capnp_type); + case TypeIndex::Float64: + return std::make_unique>(type, name, capnp_type); + case TypeIndex::Date: + return std::make_unique(type, name, capnp_type); + case TypeIndex::Date32: + return std::make_unique(type, name, capnp_type); + case TypeIndex::DateTime: + return std::make_unique(type, name, capnp_type); + case TypeIndex::DateTime64: + return std::make_unique(type, name, capnp_type); + case TypeIndex::Decimal32: + return std::make_unique>(type, name, capnp_type); + case TypeIndex::Decimal64: + return std::make_unique>(type, name, capnp_type); + case TypeIndex::Decimal128: + return std::make_unique>(type, name, capnp_type); + case TypeIndex::Decimal256: + return std::make_unique>(type, name, capnp_type); + case TypeIndex::IPv4: + return std::make_unique(type, name, capnp_type); + case TypeIndex::IPv6: + return std::make_unique(type, name, capnp_type); + case TypeIndex::UUID: + return std::make_unique(type, name, capnp_type); + case TypeIndex::Enum8: + return std::make_unique>(type, name, capnp_type, settings.enum_comparing_mode); + case TypeIndex::Enum16: + return std::make_unique>(type, name, capnp_type, settings.enum_comparing_mode); + case TypeIndex::String: + if (capnp_type.isData()) + return std::make_unique>(type, name, capnp_type); + return std::make_unique>(type, name, capnp_type); + case TypeIndex::FixedString: + if (capnp_type.isData()) + return std::make_unique>(type, name, capnp_type); + return std::make_unique>(type, name, capnp_type); + case TypeIndex::LowCardinality: + return std::make_unique(type, name, capnp_type, settings); + case TypeIndex::Nullable: + return std::make_unique(type, name, capnp_type, settings); + case TypeIndex::Array: + return std::make_unique(type, name, capnp_type, settings); + case TypeIndex::Map: + return std::make_unique(type, name, capnp_type, settings); + case TypeIndex::Tuple: + return std::make_unique(type, name, capnp_type, settings); + default: + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Type {} is not supported in CapnProto format", type->getName()); + } + } +} + +class CapnProtoSerializer::Impl +{ +public: + Impl(const DataTypes & data_types, const Names & names, const capnp::StructSchema & schema, const FormatSettings::CapnProto & settings) + : struct_serializer(std::make_unique(data_types, names, schema, settings)) + , fields_size(schema.getFields().size()) + { + } + + void writeRow(const Columns & columns, capnp::DynamicStruct::Builder builder, size_t row_num) + { + StructBuilder struct_builder(std::move(builder), fields_size); + struct_serializer->writeRow(columns, struct_builder, row_num); + } + + void readRow(MutableColumns & columns, capnp::DynamicStruct::Reader & reader) + { + struct_serializer->readRow(columns, reader); + } + +private: + std::unique_ptr struct_serializer; + size_t fields_size; +}; + +CapnProtoSerializer::CapnProtoSerializer(const DataTypes & data_types, const Names & names, const capnp::StructSchema & schema, const FormatSettings::CapnProto & settings) + : serializer_impl(std::make_unique(data_types, names, schema, settings)) +{ +} + +void CapnProtoSerializer::writeRow(const Columns & columns, capnp::DynamicStruct::Builder builder, size_t row_num) +{ + serializer_impl->writeRow(columns, std::move(builder), row_num); +} + +void CapnProtoSerializer::readRow(MutableColumns & columns, capnp::DynamicStruct::Reader & reader) +{ + serializer_impl->readRow(columns, reader); +} + +CapnProtoSerializer::~CapnProtoSerializer() = default; + +} diff --git a/src/Formats/CapnProtoSerializer.h b/src/Formats/CapnProtoSerializer.h new file mode 100644 index 00000000000..efae797875b --- /dev/null +++ b/src/Formats/CapnProtoSerializer.h @@ -0,0 +1,25 @@ +#pragma once + +#include +#include + +namespace DB +{ + +class CapnProtoSerializer +{ +public: + CapnProtoSerializer(const DataTypes & data_types, const Names & names, const capnp::StructSchema & schema, const FormatSettings::CapnProto & settings); + + void writeRow(const Columns & columns, capnp::DynamicStruct::Builder builder, size_t row_num); + + void readRow(MutableColumns & columns, capnp::DynamicStruct::Reader & reader); + + ~CapnProtoSerializer(); + +private: + class Impl; + std::unique_ptr serializer_impl; +}; + +} diff --git a/src/Formats/CapnProtoUtils.cpp b/src/Formats/CapnProtoUtils.cpp deleted file mode 100644 index d6c032408bb..00000000000 --- a/src/Formats/CapnProtoUtils.cpp +++ /dev/null @@ -1,734 +0,0 @@ -#include - -#if USE_CAPNP - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int CANNOT_PARSE_CAPN_PROTO_SCHEMA; - extern const int THERE_IS_NO_COLUMN; - extern const int BAD_TYPE_OF_FIELD; - extern const int CAPN_PROTO_BAD_CAST; - extern const int FILE_DOESNT_EXIST; - extern const int UNKNOWN_EXCEPTION; - extern const int INCORRECT_DATA; - extern const int CAPN_PROTO_BAD_TYPE; - extern const int BAD_ARGUMENTS; -} - -std::pair splitCapnProtoFieldName(const String & name) -{ - const auto * begin = name.data(); - const auto * end = name.data() + name.size(); - const auto * it = find_first_symbols<'_', '.'>(begin, end); - String first = String(begin, it); - String second = it == end ? "" : String(it + 1, end); - return {first, second}; -} - -capnp::StructSchema CapnProtoSchemaParser::getMessageSchema(const FormatSchemaInfo & schema_info) -{ - capnp::ParsedSchema schema; - try - { - int fd; - KJ_SYSCALL(fd = open(schema_info.schemaDirectory().data(), O_RDONLY)); // NOLINT(bugprone-suspicious-semicolon) - auto schema_dir = kj::newDiskDirectory(kj::OsFileHandle(fd)); - schema = impl.parseFromDirectory(*schema_dir, kj::Path::parse(schema_info.schemaPath()), {}); - } - catch (const kj::Exception & e) - { - /// That's not good to determine the type of error by its description, but - /// this is the only way to do it here, because kj doesn't specify the type of error. - auto description = std::string_view(e.getDescription().cStr()); - if (description.find("No such file or directory") != String::npos || description.find("no such directory") != String::npos) - throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "Cannot open CapnProto schema, file {} doesn't exists", schema_info.absoluteSchemaPath()); - - if (description.find("Parse error") != String::npos) - throw Exception(ErrorCodes::CANNOT_PARSE_CAPN_PROTO_SCHEMA, "Cannot parse CapnProto schema {}:{}", schema_info.schemaPath(), e.getLine()); - - throw Exception(ErrorCodes::UNKNOWN_EXCEPTION, - "Unknown exception while parsing CapnProto schema: {}, schema dir and file: {}, {}", - description, schema_info.schemaDirectory(), schema_info.schemaPath()); - } - - auto message_maybe = schema.findNested(schema_info.messageName()); - auto * message_schema = kj::_::readMaybe(message_maybe); - if (!message_schema) - throw Exception(ErrorCodes::CANNOT_PARSE_CAPN_PROTO_SCHEMA, - "CapnProto schema doesn't contain message with name {}", schema_info.messageName()); - return message_schema->asStruct(); -} - -bool compareEnumNames(const String & first, const String & second, FormatSettings::EnumComparingMode mode) -{ - if (mode == FormatSettings::EnumComparingMode::BY_NAMES_CASE_INSENSITIVE) - return boost::algorithm::to_lower_copy(first) == boost::algorithm::to_lower_copy(second); - return first == second; -} - -static const std::map capnp_simple_type_names = -{ - {capnp::schema::Type::Which::BOOL, "Bool"}, - {capnp::schema::Type::Which::VOID, "Void"}, - {capnp::schema::Type::Which::INT8, "Int8"}, - {capnp::schema::Type::Which::INT16, "Int16"}, - {capnp::schema::Type::Which::INT32, "Int32"}, - {capnp::schema::Type::Which::INT64, "Int64"}, - {capnp::schema::Type::Which::UINT8, "UInt8"}, - {capnp::schema::Type::Which::UINT16, "UInt16"}, - {capnp::schema::Type::Which::UINT32, "UInt32"}, - {capnp::schema::Type::Which::UINT64, "UInt64"}, - {capnp::schema::Type::Which::FLOAT32, "Float32"}, - {capnp::schema::Type::Which::FLOAT64, "Float64"}, - {capnp::schema::Type::Which::TEXT, "Text"}, - {capnp::schema::Type::Which::DATA, "Data"}, - {capnp::schema::Type::Which::INTERFACE, "Interface"}, - {capnp::schema::Type::Which::ANY_POINTER, "AnyPointer"}, -}; - -static bool checkIfStructContainsUnnamedUnion(const capnp::StructSchema & struct_schema) -{ - return struct_schema.getFields().size() != struct_schema.getNonUnionFields().size(); -} - -static bool checkIfStructIsNamedUnion(const capnp::StructSchema & struct_schema) -{ - return struct_schema.getFields().size() == struct_schema.getUnionFields().size(); -} - -/// Get full name of type for better exception messages. -static String getCapnProtoFullTypeName(const capnp::Type & type) -{ - switch (type.which()) - { - case capnp::schema::Type::Which::STRUCT: - { - auto struct_schema = type.asStruct(); - - auto non_union_fields = struct_schema.getNonUnionFields(); - std::vector non_union_field_names; - for (auto nested_field : non_union_fields) - non_union_field_names.push_back(String(nested_field.getProto().getName()) + " " + getCapnProtoFullTypeName(nested_field.getType())); - - auto union_fields = struct_schema.getUnionFields(); - std::vector union_field_names; - for (auto nested_field : union_fields) - union_field_names.push_back(String(nested_field.getProto().getName()) + " " + getCapnProtoFullTypeName(nested_field.getType())); - - String union_name = "Union(" + boost::algorithm::join(union_field_names, ", ") + ")"; - /// Check if the struct is a named union. - if (non_union_field_names.empty()) - return union_name; - - String type_name = "Struct(" + boost::algorithm::join(non_union_field_names, ", "); - /// Check if the struct contains unnamed union. - if (!union_field_names.empty()) - type_name += ", " + union_name; - type_name += ")"; - return type_name; - } - case capnp::schema::Type::Which::LIST: - return "List(" + getCapnProtoFullTypeName(type.asList().getElementType()) + ")"; - case capnp::schema::Type::Which::ENUM: - { - auto enum_schema = type.asEnum(); - String enum_name = "Enum("; - auto enumerants = enum_schema.getEnumerants(); - for (unsigned i = 0; i != enumerants.size(); ++i) - { - enum_name += String(enumerants[i].getProto().getName()) + " = " + std::to_string(enumerants[i].getOrdinal()); - if (i + 1 != enumerants.size()) - enum_name += ", "; - } - enum_name += ")"; - return enum_name; - } - default: - auto it = capnp_simple_type_names.find(type.which()); - if (it == capnp_simple_type_names.end()) - throw Exception(ErrorCodes::BAD_TYPE_OF_FIELD, "Unknown CapnProto type"); - return it->second; - } -} - -template -static bool checkEnums(const capnp::Type & capnp_type, const DataTypePtr column_type, FormatSettings::EnumComparingMode mode, UInt64 max_value, String & error_message) -{ - if (!capnp_type.isEnum()) - return false; - - auto enum_schema = capnp_type.asEnum(); - bool to_lower = mode == FormatSettings::EnumComparingMode::BY_NAMES_CASE_INSENSITIVE; - const auto * enum_type = assert_cast *>(column_type.get()); - const auto & enum_values = dynamic_cast &>(*enum_type); - - auto enumerants = enum_schema.getEnumerants(); - if (mode == FormatSettings::EnumComparingMode::BY_VALUES) - { - /// In CapnProto Enum fields are numbered sequentially starting from zero. - if (enumerants.size() > max_value) - { - error_message += "Enum from CapnProto schema contains values that is out of range for Clickhouse Enum"; - return false; - } - - auto values = enum_values.getSetOfAllValues(); - std::unordered_set capn_enum_values; - for (auto enumerant : enumerants) - capn_enum_values.insert(Type(enumerant.getOrdinal())); - auto result = values == capn_enum_values; - if (!result) - error_message += "The set of values in Enum from CapnProto schema is different from the set of values in ClickHouse Enum"; - return result; - } - - auto names = enum_values.getSetOfAllNames(to_lower); - std::unordered_set capn_enum_names; - - for (auto enumerant : enumerants) - { - String name = enumerant.getProto().getName(); - capn_enum_names.insert(to_lower ? boost::algorithm::to_lower_copy(name) : name); - } - - auto result = names == capn_enum_names; - if (!result) - error_message += "The set of names in Enum from CapnProto schema is different from the set of names in ClickHouse Enum"; - return result; -} - -static bool checkCapnProtoType(const capnp::Type & capnp_type, const DataTypePtr & data_type, FormatSettings::EnumComparingMode mode, String & error_message, const String & column_name); - -static bool checkNullableType(const capnp::Type & capnp_type, const DataTypePtr & data_type, FormatSettings::EnumComparingMode mode, String & error_message, const String & column_name) -{ - if (!capnp_type.isStruct()) - return false; - - /// Check that struct is a named union of type VOID and one arbitrary type. - auto struct_schema = capnp_type.asStruct(); - if (!checkIfStructIsNamedUnion(struct_schema)) - return false; - - auto union_fields = struct_schema.getUnionFields(); - if (union_fields.size() != 2) - return false; - - auto first = union_fields[0]; - auto second = union_fields[1]; - - auto nested_type = assert_cast(data_type.get())->getNestedType(); - if (first.getType().isVoid()) - return checkCapnProtoType(second.getType(), nested_type, mode, error_message, column_name); - if (second.getType().isVoid()) - return checkCapnProtoType(first.getType(), nested_type, mode, error_message, column_name); - return false; -} - -static bool checkTupleType(const capnp::Type & capnp_type, const DataTypePtr & data_type, FormatSettings::EnumComparingMode mode, String & error_message) -{ - if (!capnp_type.isStruct()) - return false; - auto struct_schema = capnp_type.asStruct(); - - if (checkIfStructIsNamedUnion(struct_schema)) - return false; - - if (checkIfStructContainsUnnamedUnion(struct_schema)) - { - error_message += "CapnProto struct contains unnamed union"; - return false; - } - - const auto * tuple_data_type = assert_cast(data_type.get()); - auto nested_types = tuple_data_type->getElements(); - if (nested_types.size() != struct_schema.getFields().size()) - { - error_message += "Tuple and Struct types have different sizes"; - return false; - } - - bool have_explicit_names = tuple_data_type->haveExplicitNames(); - const auto & nested_names = tuple_data_type->getElementNames(); - for (uint32_t i = 0; i != nested_names.size(); ++i) - { - if (have_explicit_names) - { - KJ_IF_MAYBE (field, struct_schema.findFieldByName(nested_names[i])) - { - if (!checkCapnProtoType(field->getType(), nested_types[tuple_data_type->getPositionByName(nested_names[i])], mode, error_message, nested_names[i])) - return false; - } - else - { - error_message += "CapnProto struct doesn't contain a field with name " + nested_names[i]; - return false; - } - } - else if (!checkCapnProtoType(struct_schema.getFields()[i].getType(), nested_types[tuple_data_type->getPositionByName(nested_names[i])], mode, error_message, nested_names[i])) - return false; - } - - return true; -} - -static bool checkArrayType(const capnp::Type & capnp_type, const DataTypePtr & data_type, FormatSettings::EnumComparingMode mode, String & error_message, const String & column_name) -{ - if (!capnp_type.isList()) - return false; - auto list_schema = capnp_type.asList(); - auto nested_type = assert_cast(data_type.get())->getNestedType(); - - auto [field_name, nested_name] = splitCapnProtoFieldName(column_name); - if (!nested_name.empty() && list_schema.getElementType().isStruct()) - { - auto struct_schema = list_schema.getElementType().asStruct(); - KJ_IF_MAYBE(field, struct_schema.findFieldByName(nested_name)) - return checkCapnProtoType(field->getType(), nested_type, mode, error_message, nested_name); - - error_message += "Element type of List {} doesn't contain field with name " + nested_name; - return false; - } - - return checkCapnProtoType(list_schema.getElementType(), nested_type, mode, error_message, column_name); -} - -static bool checkMapType(const capnp::Type & capnp_type, const DataTypePtr & data_type, FormatSettings::EnumComparingMode mode, String & error_message) -{ - /// We output/input Map type as follow CapnProto schema - /// - /// struct Map { - /// struct Entry { - /// key @0: Key; - /// value @1: Value; - /// } - /// entries @0 :List(Entry); - /// } - - if (!capnp_type.isStruct()) - return false; - auto struct_schema = capnp_type.asStruct(); - - if (checkIfStructContainsUnnamedUnion(struct_schema)) - { - error_message += "CapnProto struct contains unnamed union"; - return false; - } - - if (struct_schema.getFields().size() != 1) - { - error_message += "CapnProto struct that represents Map type can contain only one field"; - return false; - } - - const auto & field_type = struct_schema.getFields()[0].getType(); - if (!field_type.isList()) - { - error_message += "Field of CapnProto struct that represents Map is not a list"; - return false; - } - - auto list_element_type = field_type.asList().getElementType(); - if (!list_element_type.isStruct()) - { - error_message += "Field of CapnProto struct that represents Map is not a list of structs"; - return false; - } - - auto key_value_struct = list_element_type.asStruct(); - if (checkIfStructContainsUnnamedUnion(key_value_struct)) - { - error_message += "CapnProto struct contains unnamed union"; - return false; - } - - if (key_value_struct.getFields().size() != 2) - { - error_message += "Key-value structure for Map struct should have exactly 2 fields"; - return false; - } - - const auto & map_type = assert_cast(*data_type); - DataTypes types = {map_type.getKeyType(), map_type.getValueType()}; - Names names = {"key", "value"}; - - for (size_t i = 0; i != types.size(); ++i) - { - KJ_IF_MAYBE(field, key_value_struct.findFieldByName(names[i])) - { - if (!checkCapnProtoType(field->getType(), types[i], mode, error_message, names[i])) - return false; - } - else - { - error_message += R"(Key-value structure for Map struct should have exactly 2 fields with names "key" and "value")"; - return false; - } - } - - return true; -} - -static bool isCapnInteger(const capnp::Type & capnp_type) -{ - return capnp_type.isInt8() || capnp_type.isUInt8() || capnp_type.isInt16() || capnp_type.isUInt16() || capnp_type.isInt32() - || capnp_type.isUInt32() || capnp_type.isInt64() || capnp_type.isUInt64(); -} - -static bool checkCapnProtoType(const capnp::Type & capnp_type, const DataTypePtr & data_type, FormatSettings::EnumComparingMode mode, String & error_message, const String & column_name) -{ - switch (data_type->getTypeId()) - { - case TypeIndex::UInt8: - return capnp_type.isBool() || isCapnInteger(capnp_type); - case TypeIndex::Int8: [[fallthrough]]; - case TypeIndex::Int16: [[fallthrough]]; - case TypeIndex::UInt16: [[fallthrough]]; - case TypeIndex::Int32: [[fallthrough]]; - case TypeIndex::UInt32: [[fallthrough]]; - case TypeIndex::Int64: [[fallthrough]]; - case TypeIndex::UInt64: - /// Allow integer conversions durin input/output. - return isCapnInteger(capnp_type); - case TypeIndex::Date: - return capnp_type.isUInt16(); - case TypeIndex::DateTime: [[fallthrough]]; - case TypeIndex::IPv4: - return capnp_type.isUInt32(); - case TypeIndex::Date32: [[fallthrough]]; - case TypeIndex::Decimal32: - return capnp_type.isInt32() || capnp_type.isUInt32(); - case TypeIndex::DateTime64: [[fallthrough]]; - case TypeIndex::Decimal64: - return capnp_type.isInt64() || capnp_type.isUInt64(); - case TypeIndex::Float32:[[fallthrough]]; - case TypeIndex::Float64: - /// Allow converting between Float32 and isFloat64 - return capnp_type.isFloat32() || capnp_type.isFloat64(); - case TypeIndex::Enum8: - return checkEnums(capnp_type, data_type, mode, INT8_MAX, error_message); - case TypeIndex::Enum16: - return checkEnums(capnp_type, data_type, mode, INT16_MAX, error_message); - case TypeIndex::Int128: [[fallthrough]]; - case TypeIndex::UInt128: [[fallthrough]]; - case TypeIndex::Int256: [[fallthrough]]; - case TypeIndex::UInt256: [[fallthrough]]; - case TypeIndex::Decimal128: [[fallthrough]]; - case TypeIndex::Decimal256: - return capnp_type.isData(); - case TypeIndex::Tuple: - return checkTupleType(capnp_type, data_type, mode, error_message); - case TypeIndex::Nullable: - { - auto result = checkNullableType(capnp_type, data_type, mode, error_message, column_name); - if (!result) - error_message += "Nullable can be represented only as a named union of type Void and nested type"; - return result; - } - case TypeIndex::Array: - return checkArrayType(capnp_type, data_type, mode, error_message, column_name); - case TypeIndex::LowCardinality: - return checkCapnProtoType(capnp_type, assert_cast(data_type.get())->getDictionaryType(), mode, error_message, column_name); - case TypeIndex::FixedString: [[fallthrough]]; - case TypeIndex::IPv6: [[fallthrough]]; - case TypeIndex::String: - return capnp_type.isText() || capnp_type.isData(); - case TypeIndex::Map: - return checkMapType(capnp_type, data_type, mode, error_message); - default: - return false; - } -} - -capnp::DynamicValue::Reader getReaderByColumnName(const capnp::DynamicStruct::Reader & struct_reader, const String & name) -{ - auto [field_name, nested_name] = splitCapnProtoFieldName(name); - KJ_IF_MAYBE(field, struct_reader.getSchema().findFieldByName(field_name)) - { - capnp::DynamicValue::Reader field_reader; - try - { - field_reader = struct_reader.get(*field); - } - catch (const kj::Exception & e) - { - throw Exception(ErrorCodes::INCORRECT_DATA, - "Cannot extract field value from struct by provided schema, error: " - "{} Perhaps the data was generated by another schema", String(e.getDescription().cStr())); - } - - if (nested_name.empty()) - return field_reader; - - /// Support reading Nested as List of Structs. - if (field_reader.getType() == capnp::DynamicValue::LIST) - { - auto list_schema = field->getType().asList(); - if (!list_schema.getElementType().isStruct()) - throw Exception(ErrorCodes::CAPN_PROTO_BAD_CAST, "Element type of List {} is not a struct", field_name); - - auto struct_schema = list_schema.getElementType().asStruct(); - KJ_IF_MAYBE(nested_field, struct_schema.findFieldByName(nested_name)) - return field_reader; - - throw Exception(ErrorCodes::THERE_IS_NO_COLUMN, "Element type of List {} doesn't contain field with name \"{}\"", field_name, nested_name); - } - - if (field_reader.getType() != capnp::DynamicValue::STRUCT) - throw Exception(ErrorCodes::CAPN_PROTO_BAD_CAST, "Field {} is not a struct", field_name); - - return getReaderByColumnName(field_reader.as(), nested_name); - } - - throw Exception(ErrorCodes::THERE_IS_NO_COLUMN, "Capnproto struct doesn't contain field with name {}", field_name); -} - -std::pair getStructBuilderAndFieldByColumnName(capnp::DynamicStruct::Builder struct_builder, const String & name) -{ - auto [field_name, nested_name] = splitCapnProtoFieldName(name); - KJ_IF_MAYBE(field, struct_builder.getSchema().findFieldByName(field_name)) - { - if (nested_name.empty()) - return {struct_builder, *field}; - - auto field_builder = struct_builder.get(*field); - - /// Support reading Nested as List of Structs. - if (field_builder.getType() == capnp::DynamicValue::LIST) - { - auto list_schema = field->getType().asList(); - if (!list_schema.getElementType().isStruct()) - throw Exception(ErrorCodes::CAPN_PROTO_BAD_CAST, "Element type of List {} is not a struct", field_name); - - auto struct_schema = list_schema.getElementType().asStruct(); - KJ_IF_MAYBE(nested_field, struct_schema.findFieldByName(nested_name)) - return {struct_builder, *field}; - - throw Exception(ErrorCodes::THERE_IS_NO_COLUMN, "Element type of List {} doesn't contain field with name \"{}\"", field_name, nested_name); - } - - if (field_builder.getType() != capnp::DynamicValue::STRUCT) - throw Exception(ErrorCodes::CAPN_PROTO_BAD_CAST, "Field {} is not a struct", field_name); - - return getStructBuilderAndFieldByColumnName(field_builder.as(), nested_name); - } - - throw Exception(ErrorCodes::THERE_IS_NO_COLUMN, "Capnproto struct doesn't contain field with name {}", field_name); -} - -static std::pair getFieldByName(const capnp::StructSchema & schema, const String & name) -{ - auto [field_name, nested_name] = splitCapnProtoFieldName(name); - KJ_IF_MAYBE(field, schema.findFieldByName(field_name)) - { - if (nested_name.empty()) - return {*field, name}; - - /// Support reading Nested as List of Structs. - if (field->getType().isList()) - { - auto list_schema = field->getType().asList(); - if (!list_schema.getElementType().isStruct()) - throw Exception(ErrorCodes::CAPN_PROTO_BAD_CAST, "Element type of List {} is not a struct", field_name); - - auto struct_schema = list_schema.getElementType().asStruct(); - KJ_IF_MAYBE(nested_field, struct_schema.findFieldByName(nested_name)) - return {*field, name}; - - throw Exception(ErrorCodes::THERE_IS_NO_COLUMN, "Element type of List {} doesn't contain field with name \"{}\"", field_name, nested_name); - } - - if (!field->getType().isStruct()) - throw Exception(ErrorCodes::CAPN_PROTO_BAD_CAST, "Field {} is not a struct", field_name); - - return getFieldByName(field->getType().asStruct(), nested_name); - } - - throw Exception(ErrorCodes::THERE_IS_NO_COLUMN, "Capnproto schema doesn't contain field with name {}", field_name); -} - -void checkCapnProtoSchemaStructure(const capnp::StructSchema & schema, const Block & header, FormatSettings::EnumComparingMode mode) -{ - /// Firstly check that struct doesn't contain unnamed union, because we don't support it. - if (checkIfStructContainsUnnamedUnion(schema)) - throw Exception(ErrorCodes::CAPN_PROTO_BAD_CAST, "Schema contains unnamed union that is not supported"); - auto names_and_types = header.getNamesAndTypesList(); - String additional_error_message; - for (auto & [name, type] : names_and_types) - { - auto [field, field_name] = getFieldByName(schema, name); - if (!checkCapnProtoType(field.getType(), type, mode, additional_error_message, field_name)) - { - auto e = Exception( - ErrorCodes::CAPN_PROTO_BAD_CAST, - "Cannot convert ClickHouse type {} to CapnProto type {}", - type->getName(), - getCapnProtoFullTypeName(field.getType())); - if (!additional_error_message.empty()) - e.addMessage(additional_error_message); - throw std::move(e); - } - } -} - -template -static DataTypePtr getEnumDataTypeFromEnumerants(const capnp::EnumSchema::EnumerantList & enumerants) -{ - std::vector> values; - for (auto enumerant : enumerants) - values.emplace_back(enumerant.getProto().getName(), ValueType(enumerant.getOrdinal())); - return std::make_shared>(std::move(values)); -} - -static DataTypePtr getEnumDataTypeFromEnumSchema(const capnp::EnumSchema & enum_schema) -{ - auto enumerants = enum_schema.getEnumerants(); - if (enumerants.size() < 128) - return getEnumDataTypeFromEnumerants(enumerants); - if (enumerants.size() < 32768) - return getEnumDataTypeFromEnumerants(enumerants); - - throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "ClickHouse supports only 8 and 16-bit Enums"); -} - -static DataTypePtr getDataTypeFromCapnProtoType(const capnp::Type & capnp_type, bool skip_unsupported_fields) -{ - switch (capnp_type.which()) - { - case capnp::schema::Type::INT8: - return std::make_shared(); - case capnp::schema::Type::INT16: - return std::make_shared(); - case capnp::schema::Type::INT32: - return std::make_shared(); - case capnp::schema::Type::INT64: - return std::make_shared(); - case capnp::schema::Type::BOOL: [[fallthrough]]; - case capnp::schema::Type::UINT8: - return std::make_shared(); - case capnp::schema::Type::UINT16: - return std::make_shared(); - case capnp::schema::Type::UINT32: - return std::make_shared(); - case capnp::schema::Type::UINT64: - return std::make_shared(); - case capnp::schema::Type::FLOAT32: - return std::make_shared(); - case capnp::schema::Type::FLOAT64: - return std::make_shared(); - case capnp::schema::Type::DATA: [[fallthrough]]; - case capnp::schema::Type::TEXT: - return std::make_shared(); - case capnp::schema::Type::ENUM: - return getEnumDataTypeFromEnumSchema(capnp_type.asEnum()); - case capnp::schema::Type::LIST: - { - auto list_schema = capnp_type.asList(); - auto nested_type = getDataTypeFromCapnProtoType(list_schema.getElementType(), skip_unsupported_fields); - if (!nested_type) - return nullptr; - return std::make_shared(nested_type); - } - case capnp::schema::Type::STRUCT: - { - auto struct_schema = capnp_type.asStruct(); - - - if (struct_schema.getFields().size() == 0) - { - if (skip_unsupported_fields) - return nullptr; - throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Empty messages are not supported"); - } - - /// Check if it can be Nullable. - if (checkIfStructIsNamedUnion(struct_schema)) - { - auto fields = struct_schema.getUnionFields(); - if (fields.size() != 2 || (!fields[0].getType().isVoid() && !fields[1].getType().isVoid())) - { - if (skip_unsupported_fields) - return nullptr; - throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Unions are not supported"); - } - auto value_type = fields[0].getType().isVoid() ? fields[1].getType() : fields[0].getType(); - if (value_type.isStruct() || value_type.isList()) - { - if (skip_unsupported_fields) - return nullptr; - throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Tuples and Lists cannot be inside Nullable"); - } - - auto nested_type = getDataTypeFromCapnProtoType(value_type, skip_unsupported_fields); - if (!nested_type) - return nullptr; - return std::make_shared(nested_type); - } - - if (checkIfStructContainsUnnamedUnion(struct_schema)) - throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Unnamed union is not supported"); - - /// Treat Struct as Tuple. - DataTypes nested_types; - Names nested_names; - for (auto field : struct_schema.getNonUnionFields()) - { - auto nested_type = getDataTypeFromCapnProtoType(field.getType(), skip_unsupported_fields); - if (!nested_type) - continue; - nested_names.push_back(field.getProto().getName()); - nested_types.push_back(nested_type); - } - if (nested_types.empty()) - return nullptr; - return std::make_shared(std::move(nested_types), std::move(nested_names)); - } - default: - { - if (skip_unsupported_fields) - return nullptr; - throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Unsupported CapnProtoType: {}", getCapnProtoFullTypeName(capnp_type)); - } - } -} - -NamesAndTypesList capnProtoSchemaToCHSchema(const capnp::StructSchema & schema, bool skip_unsupported_fields) -{ - if (checkIfStructContainsUnnamedUnion(schema)) - throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Unnamed union is not supported"); - - NamesAndTypesList names_and_types; - for (auto field : schema.getNonUnionFields()) - { - auto name = field.getProto().getName(); - auto type = getDataTypeFromCapnProtoType(field.getType(), skip_unsupported_fields); - if (type) - names_and_types.emplace_back(name, type); - } - if (names_and_types.empty()) - throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Cannot convert CapnProto schema to ClickHouse table schema, all fields have unsupported types"); - - return names_and_types; -} - -} - -#endif diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index c88af650671..475d08e0fe3 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -325,16 +325,16 @@ struct FormatSettings /// For capnProto format we should determine how to /// compare ClickHouse Enum and Enum from schema. - enum class EnumComparingMode + enum class CapnProtoEnumComparingMode { BY_NAMES, // Names in enums should be the same, values can be different. BY_NAMES_CASE_INSENSITIVE, // Case-insensitive name comparison. BY_VALUES, // Values should be the same, names can be different. }; - struct + struct CapnProto { - EnumComparingMode enum_comparing_mode = EnumComparingMode::BY_VALUES; + CapnProtoEnumComparingMode enum_comparing_mode = CapnProtoEnumComparingMode::BY_VALUES; bool skip_fields_with_unsupported_types_in_schema_inference = false; } capn_proto; diff --git a/src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp b/src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp index 2f84e9bde3c..e686ae86997 100644 --- a/src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp @@ -9,23 +9,6 @@ #include #include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - namespace DB { @@ -35,16 +18,14 @@ namespace ErrorCodes extern const int INCORRECT_DATA; } -CapnProtoRowInputFormat::CapnProtoRowInputFormat(ReadBuffer & in_, Block header, Params params_, const FormatSchemaInfo & info, const FormatSettings & format_settings_) - : IRowInputFormat(std::move(header), in_, std::move(params_)) +CapnProtoRowInputFormat::CapnProtoRowInputFormat(ReadBuffer & in_, Block header_, Params params_, const FormatSchemaInfo & info, const FormatSettings & format_settings) + : IRowInputFormat(std::move(header_), in_, std::move(params_)) , parser(std::make_shared()) - , format_settings(format_settings_) - , column_types(getPort().getHeader().getDataTypes()) - , column_names(getPort().getHeader().getNames()) { // Parse the schema and fetch the root object - root = parser->getMessageSchema(info); - checkCapnProtoSchemaStructure(root, getPort().getHeader(), format_settings.capn_proto.enum_comparing_mode); + schema = parser->getMessageSchema(info); + const auto & header = getPort().getHeader(); + serializer = std::make_unique(header.getDataTypes(), header.getNames(), schema, format_settings.capn_proto); } kj::Array CapnProtoRowInputFormat::readMessage() @@ -82,213 +63,6 @@ kj::Array CapnProtoRowInputFormat::readMessage() return msg; } -static void insertInteger(IColumn & column, const DataTypePtr & column_type, UInt64 value) -{ - switch (column_type->getTypeId()) - { - case TypeIndex::Int8: - assert_cast(column).insertValue(value); - break; - case TypeIndex::UInt8: - assert_cast(column).insertValue(value); - break; - case TypeIndex::Int16: - assert_cast(column).insertValue(value); - break; - case TypeIndex::Date: [[fallthrough]]; - case TypeIndex::UInt16: - assert_cast(column).insertValue(value); - break; - case TypeIndex::Int32: - assert_cast(column).insertValue(static_cast(value)); - break; - case TypeIndex::DateTime: [[fallthrough]]; - case TypeIndex::UInt32: - assert_cast(column).insertValue(static_cast(value)); - break; - case TypeIndex::IPv4: - assert_cast(column).insertValue(IPv4(static_cast(value))); - break; - case TypeIndex::Int64: - assert_cast(column).insertValue(value); - break; - case TypeIndex::UInt64: - assert_cast(column).insertValue(value); - break; - case TypeIndex::DateTime64: - assert_cast &>(column).insertValue(value); - break; - case TypeIndex::Decimal32: - assert_cast &>(column).insertValue(static_cast(value)); - break; - case TypeIndex::Decimal64: - assert_cast &>(column).insertValue(value); - break; - default: - throw Exception(ErrorCodes::LOGICAL_ERROR, "Column type {} cannot be parsed from integer", column_type->getName()); - } -} - -static void insertFloat(IColumn & column, const DataTypePtr & column_type, Float64 value) -{ - switch (column_type->getTypeId()) - { - case TypeIndex::Float32: - assert_cast(column).insertValue(static_cast(value)); - break; - case TypeIndex::Float64: - assert_cast(column).insertValue(value); - break; - default: - throw Exception(ErrorCodes::LOGICAL_ERROR, "Column type is not a float."); - } -} - -template -static void insertData(IColumn & column, const DataTypePtr & column_type, Value value) -{ - if (column_type->haveMaximumSizeOfValue() && value.size() != column_type->getSizeOfValueInMemory()) - throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected size of {} value: {}", column_type->getName(), value.size()); - - column.insertData(reinterpret_cast(value.begin()), value.size()); -} - -template -static void insertEnum(IColumn & column, const DataTypePtr & column_type, const capnp::DynamicEnum & enum_value, FormatSettings::EnumComparingMode enum_comparing_mode) -{ - auto enumerant = *kj::_::readMaybe(enum_value.getEnumerant()); - auto enum_type = assert_cast *>(column_type.get()); - DataTypePtr nested_type = std::make_shared>(); - switch (enum_comparing_mode) - { - case FormatSettings::EnumComparingMode::BY_VALUES: - insertInteger(column, nested_type, Int64(enumerant.getOrdinal())); - return; - case FormatSettings::EnumComparingMode::BY_NAMES: - insertInteger(column, nested_type, Int64(enum_type->getValue(String(enumerant.getProto().getName())))); - return; - case FormatSettings::EnumComparingMode::BY_NAMES_CASE_INSENSITIVE: - { - /// Find the same enum name case insensitive. - String enum_name = enumerant.getProto().getName(); - for (auto & name : enum_type->getAllRegisteredNames()) - { - if (compareEnumNames(name, enum_name, enum_comparing_mode)) - { - insertInteger(column, nested_type, Int64(enum_type->getValue(name))); - break; - } - } - } - } -} - -static void insertValue(IColumn & column, const DataTypePtr & column_type, const String & column_name, const capnp::DynamicValue::Reader & value, FormatSettings::EnumComparingMode enum_comparing_mode) -{ - if (column_type->lowCardinality()) - { - auto & lc_column = assert_cast(column); - auto tmp_column = lc_column.getDictionary().getNestedColumn()->cloneEmpty(); - auto dict_type = assert_cast(column_type.get())->getDictionaryType(); - insertValue(*tmp_column, dict_type, column_name, value, enum_comparing_mode); - lc_column.insertFromFullColumn(*tmp_column, 0); - return; - } - - switch (value.getType()) - { - case capnp::DynamicValue::Type::INT: - insertInteger(column, column_type, value.as()); - break; - case capnp::DynamicValue::Type::UINT: - insertInteger(column, column_type, value.as()); - break; - case capnp::DynamicValue::Type::FLOAT: - insertFloat(column, column_type, value.as()); - break; - case capnp::DynamicValue::Type::BOOL: - insertInteger(column, column_type, UInt64(value.as())); - break; - case capnp::DynamicValue::Type::DATA: - insertData(column, column_type, value.as()); - break; - case capnp::DynamicValue::Type::TEXT: - insertData(column, column_type, value.as()); - break; - case capnp::DynamicValue::Type::ENUM: - if (column_type->getTypeId() == TypeIndex::Enum8) - insertEnum(column, column_type, value.as(), enum_comparing_mode); - else - insertEnum(column, column_type, value.as(), enum_comparing_mode); - break; - case capnp::DynamicValue::LIST: - { - auto list_value = value.as(); - auto & column_array = assert_cast(column); - auto & offsets = column_array.getOffsets(); - offsets.push_back(offsets.back() + list_value.size()); - - auto & nested_column = column_array.getData(); - auto nested_type = assert_cast(column_type.get())->getNestedType(); - for (const auto & nested_value : list_value) - insertValue(nested_column, nested_type, column_name, nested_value, enum_comparing_mode); - break; - } - case capnp::DynamicValue::Type::STRUCT: - { - auto struct_value = value.as(); - if (column_type->isNullable()) - { - auto & nullable_column = assert_cast(column); - auto field = *kj::_::readMaybe(struct_value.which()); - if (field.getType().isVoid()) - nullable_column.insertDefault(); - else - { - auto & nested_column = nullable_column.getNestedColumn(); - auto nested_type = assert_cast(column_type.get())->getNestedType(); - auto nested_value = struct_value.get(field); - insertValue(nested_column, nested_type, column_name, nested_value, enum_comparing_mode); - nullable_column.getNullMapData().push_back(0); - } - } - else if (isTuple(column_type)) - { - auto & tuple_column = assert_cast(column); - const auto * tuple_type = assert_cast(column_type.get()); - bool have_explicit_names = tuple_type->haveExplicitNames(); - auto struct_schema = struct_value.getSchema(); - for (uint32_t i = 0; i != tuple_column.tupleSize(); ++i) - insertValue( - tuple_column.getColumn(i), - tuple_type->getElements()[i], - tuple_type->getElementNames()[i], - struct_value.get(have_explicit_names ? struct_schema.getFieldByName(tuple_type->getElementNames()[i]) : struct_schema.getFields()[i]), - enum_comparing_mode); - } - else if (isMap(column_type)) - { - const auto & map_type = assert_cast(*column_type); - DataTypes key_value_types = {map_type.getKeyType(), map_type.getValueType()}; - Names key_value_names = {"key", "value"}; - auto entries_type = std::make_shared(std::make_shared(key_value_types, key_value_names)); - auto & entries_column = assert_cast(column).getNestedColumn(); - auto entries_field = struct_value.getSchema().getFields()[0]; - insertValue(entries_column, entries_type, column_name, struct_value.get(entries_field), enum_comparing_mode); - } - else - { - /// It can be nested column from Nested type. - auto [field_name, nested_name] = splitCapnProtoFieldName(column_name); - insertValue(column, column_type, nested_name, struct_value.get(nested_name), enum_comparing_mode); - } - break; - } - default: - throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected CapnProto value type."); - } -} - bool CapnProtoRowInputFormat::readRow(MutableColumns & columns, RowReadExtension &) { if (in->eof()) @@ -298,12 +72,8 @@ bool CapnProtoRowInputFormat::readRow(MutableColumns & columns, RowReadExtension { auto array = readMessage(); capnp::FlatArrayMessageReader msg(array); - auto root_reader = msg.getRoot(root); - for (size_t i = 0; i != columns.size(); ++i) - { - auto value = getReaderByColumnName(root_reader, column_names[i]); - insertValue(*columns[i], column_types[i], column_names[i], value, format_settings.capn_proto.enum_comparing_mode); - } + auto root_reader = msg.getRoot(schema); + serializer->readRow(columns, root_reader); } catch (const kj::Exception & e) { @@ -343,7 +113,14 @@ void registerInputFormatCapnProto(FormatFactory & factory) factory.markFormatSupportsSubsetOfColumns("CapnProto"); factory.registerFileExtension("capnp", "CapnProto"); factory.registerAdditionalInfoForSchemaCacheGetter( - "CapnProto", [](const FormatSettings & settings) { return fmt::format("format_schema={}", settings.schema.format_schema); }); + "CapnProto", + [](const FormatSettings & settings) + { + return fmt::format( + "format_schema={}, skip_fields_with_unsupported_types_in_schema_inference={}", + settings.schema.format_schema, + settings.capn_proto.skip_fields_with_unsupported_types_in_schema_inference); + }); } void registerCapnProtoSchemaReader(FormatFactory & factory) diff --git a/src/Processors/Formats/Impl/CapnProtoRowInputFormat.h b/src/Processors/Formats/Impl/CapnProtoRowInputFormat.h index cf23f22b643..06e94da123f 100644 --- a/src/Processors/Formats/Impl/CapnProtoRowInputFormat.h +++ b/src/Processors/Formats/Impl/CapnProtoRowInputFormat.h @@ -4,7 +4,8 @@ #if USE_CAPNP #include -#include +#include +#include #include #include @@ -33,10 +34,8 @@ private: kj::Array readMessage(); std::shared_ptr parser; - capnp::StructSchema root; - const FormatSettings format_settings; - DataTypes column_types; - Names column_names; + capnp::StructSchema schema; + std::unique_ptr serializer; }; class CapnProtoSchemaReader : public IExternalSchemaReader diff --git a/src/Processors/Formats/Impl/CapnProtoRowOutputFormat.cpp b/src/Processors/Formats/Impl/CapnProtoRowOutputFormat.cpp index 0225680b396..7dd18be27f4 100644 --- a/src/Processors/Formats/Impl/CapnProtoRowOutputFormat.cpp +++ b/src/Processors/Formats/Impl/CapnProtoRowOutputFormat.cpp @@ -1,28 +1,13 @@ #include #if USE_CAPNP -#include +#include #include +#include #include #include #include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - namespace DB { @@ -45,252 +30,25 @@ CapnProtoRowOutputFormat::CapnProtoRowOutputFormat( WriteBuffer & out_, const Block & header_, const FormatSchemaInfo & info, - const FormatSettings & format_settings_) - : IRowOutputFormat(header_, out_), column_names(header_.getNames()), column_types(header_.getDataTypes()), output_stream(std::make_unique(out_)), format_settings(format_settings_) + const FormatSettings & format_settings) + : IRowOutputFormat(header_, out_) + , column_names(header_.getNames()) + , column_types(header_.getDataTypes()) + , output_stream(std::make_unique(out_)) { schema = schema_parser.getMessageSchema(info); - checkCapnProtoSchemaStructure(schema, getPort(PortKind::Main).getHeader(), format_settings.capn_proto.enum_comparing_mode); -} - -template -static capnp::DynamicEnum getDynamicEnum( - const ColumnPtr & column, - const DataTypePtr & data_type, - size_t row_num, - const capnp::EnumSchema & enum_schema, - FormatSettings::EnumComparingMode mode) -{ - const auto * enum_data_type = assert_cast *>(data_type.get()); - EnumValue enum_value = column->getInt(row_num); - if (mode == FormatSettings::EnumComparingMode::BY_VALUES) - return capnp::DynamicEnum(enum_schema, enum_value); - - auto enum_name = enum_data_type->getNameForValue(enum_value); - for (const auto enumerant : enum_schema.getEnumerants()) - { - if (compareEnumNames(String(enum_name), enumerant.getProto().getName(), mode)) - return capnp::DynamicEnum(enumerant); - } - - throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot convert CLickHouse Enum value to CapnProto Enum"); -} - -static capnp::DynamicValue::Builder initStructFieldBuilder(const ColumnPtr & column, size_t row_num, capnp::DynamicStruct::Builder & struct_builder, capnp::StructSchema::Field field) -{ - if (const auto * array_column = checkAndGetColumn(*column)) - { - size_t size = array_column->getOffsets()[row_num] - array_column->getOffsets()[row_num - 1]; - return struct_builder.init(field, static_cast(size)); - } - - if (field.getType().isStruct()) - return struct_builder.init(field); - - return struct_builder.get(field); -} - -static std::optional convertToDynamicValue( - const ColumnPtr & column, - const DataTypePtr & data_type, - size_t row_num, - const String & column_name, - capnp::DynamicValue::Builder builder, - FormatSettings::EnumComparingMode enum_comparing_mode, - std::vector> & temporary_text_data_storage) -{ - /// Here we don't do any types validation, because we did it in CapnProtoRowOutputFormat constructor. - - if (data_type->lowCardinality()) - { - const auto * lc_column = assert_cast(column.get()); - const auto & dict_type = assert_cast(data_type.get())->getDictionaryType(); - size_t index = lc_column->getIndexAt(row_num); - return convertToDynamicValue(lc_column->getDictionary().getNestedColumn(), dict_type, index, column_name, builder, enum_comparing_mode, temporary_text_data_storage); - } - - switch (builder.getType()) - { - case capnp::DynamicValue::Type::INT: - return capnp::DynamicValue::Reader(column->getInt(row_num)); - case capnp::DynamicValue::Type::UINT: - { - /// IPv4 column doesn't support getUInt method. - if (isIPv4(data_type)) - return capnp::DynamicValue::Reader(assert_cast(column.get())->getElement(row_num)); - return capnp::DynamicValue::Reader(column->getUInt(row_num)); - } - case capnp::DynamicValue::Type::BOOL: - return capnp::DynamicValue::Reader(column->getBool(row_num)); - case capnp::DynamicValue::Type::FLOAT: - return capnp::DynamicValue::Reader(column->getFloat64(row_num)); - case capnp::DynamicValue::Type::ENUM: - { - auto enum_schema = builder.as().getSchema(); - if (data_type->getTypeId() == TypeIndex::Enum8) - return capnp::DynamicValue::Reader( - getDynamicEnum(column, data_type, row_num, enum_schema, enum_comparing_mode)); - return capnp::DynamicValue::Reader( - getDynamicEnum(column, data_type, row_num, enum_schema, enum_comparing_mode)); - } - case capnp::DynamicValue::Type::DATA: - { - auto data = column->getDataAt(row_num); - return capnp::DynamicValue::Reader(capnp::Data::Reader(reinterpret_cast(data.data), data.size)); - } - case capnp::DynamicValue::Type::TEXT: - { - /// In TEXT type data should be null-terminated, but ClickHouse String data could not be. - /// To make data null-terminated we should copy it to temporary String object, but - /// capnp::Text::Reader works only with pointer to the data and it's size, so we should - /// guarantee that new String object life time is longer than capnp::Text::Reader life time. - /// To do this we store new String object in a temporary storage, passed in this function - /// by reference. We use unique_ptr instead of just String to avoid pointers - /// invalidation on vector reallocation. - temporary_text_data_storage.push_back(std::make_unique(column->getDataAt(row_num))); - auto & data = temporary_text_data_storage.back(); - return capnp::DynamicValue::Reader(capnp::Text::Reader(data->data(), data->size())); - } - case capnp::DynamicValue::Type::STRUCT: - { - auto struct_builder = builder.as(); - auto nested_struct_schema = struct_builder.getSchema(); - /// Struct can represent Tuple, Nullable (named union with two fields) or single column when it contains one nested column. - if (data_type->isNullable()) - { - const auto * nullable_type = assert_cast(data_type.get()); - const auto * nullable_column = assert_cast(column.get()); - auto fields = nested_struct_schema.getUnionFields(); - if (nullable_column->isNullAt(row_num)) - { - auto null_field = fields[0].getType().isVoid() ? fields[0] : fields[1]; - struct_builder.set(null_field, capnp::Void()); - } - else - { - auto value_field = fields[0].getType().isVoid() ? fields[1] : fields[0]; - struct_builder.clear(value_field); - const auto & nested_column = nullable_column->getNestedColumnPtr(); - auto value_builder = initStructFieldBuilder(nested_column, row_num, struct_builder, value_field); - auto value = convertToDynamicValue(nested_column, nullable_type->getNestedType(), row_num, column_name, value_builder, enum_comparing_mode, temporary_text_data_storage); - if (value) - struct_builder.set(value_field, *value); - } - } - else if (isTuple(data_type)) - { - const auto * tuple_data_type = assert_cast(data_type.get()); - const auto & nested_types = tuple_data_type->getElements(); - const auto & nested_names = tuple_data_type->getElementNames(); - const auto & nested_columns = assert_cast(column.get())->getColumns(); - bool have_explicit_names = tuple_data_type->haveExplicitNames(); - for (uint32_t i = 0; i != nested_names.size(); ++i) - { - capnp::StructSchema::Field nested_field = have_explicit_names ? nested_struct_schema.getFieldByName(nested_names[i]) : nested_struct_schema.getFields()[i]; - auto field_builder = initStructFieldBuilder(nested_columns[i], row_num, struct_builder, nested_field); - auto value = convertToDynamicValue(nested_columns[i], nested_types[i], row_num, nested_names[i], field_builder, enum_comparing_mode, temporary_text_data_storage); - if (value) - struct_builder.set(nested_field, *value); - } - } - else if (isMap(data_type)) - { - /// We output Map type as follow CapnProto schema - /// - /// struct Map { - /// struct Entry { - /// key @0: Key; - /// value @1: Value; - /// } - /// entries @0 :List(Entry); - /// } - /// - /// And we don't need to check that struct have this form here because we checked it before. - const auto & map_type = assert_cast(*data_type); - DataTypes key_value_types = {map_type.getKeyType(), map_type.getValueType()}; - Names key_value_names = {"key", "value"}; - auto entries_type = std::make_shared(std::make_shared(key_value_types, key_value_names)); - - /// Nested column in Map is actually Array(Tuple), so we can output it according to "entries" field schema. - const auto & entries_column = assert_cast(column.get())->getNestedColumnPtr(); - - auto entries_field = nested_struct_schema.getFields()[0]; - auto field_builder = initStructFieldBuilder(entries_column, row_num, struct_builder, entries_field); - auto entries_value = convertToDynamicValue(entries_column, entries_type, row_num, column_name, field_builder, enum_comparing_mode, temporary_text_data_storage); - if (entries_value) - struct_builder.set(entries_field, *entries_value); - } - else - { - /// It can be nested column from Nested type. - auto [field_name, nested_name] = splitCapnProtoFieldName(column_name); - auto nested_field = nested_struct_schema.getFieldByName(nested_name); - auto field_builder = initStructFieldBuilder(column, row_num, struct_builder, nested_field); - auto value = convertToDynamicValue(column, data_type, row_num, nested_name, field_builder, enum_comparing_mode, temporary_text_data_storage); - if (value) - struct_builder.set(nested_field, *value); - } - return std::nullopt; - } - case capnp::DynamicValue::Type::LIST: - { - auto list_builder = builder.as(); - const auto * array_column = assert_cast(column.get()); - const auto & nested_column = array_column->getDataPtr(); - const auto & nested_type = assert_cast(data_type.get())->getNestedType(); - const auto & offsets = array_column->getOffsets(); - auto offset = offsets[row_num - 1]; - size_t size = offsets[row_num] - offset; - - const auto * nested_array_column = checkAndGetColumn(*nested_column); - for (unsigned i = 0; i != static_cast(size); ++i) - { - capnp::DynamicValue::Builder value_builder; - /// For nested arrays we need to initialize nested list builder. - if (nested_array_column) - { - const auto & nested_offset = nested_array_column->getOffsets(); - size_t nested_array_size = nested_offset[offset + i] - nested_offset[offset + i - 1]; - value_builder = list_builder.init(i, static_cast(nested_array_size)); - } - else - value_builder = list_builder[i]; - - auto value = convertToDynamicValue(nested_column, nested_type, offset + i, column_name, value_builder, enum_comparing_mode, temporary_text_data_storage); - if (value) - list_builder.set(i, *value); - } - return std::nullopt; - } - default: - throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected CapnProto type."); - } + const auto & header = getPort(PortKind::Main).getHeader(); + serializer = std::make_unique(header.getDataTypes(), header.getNames(), schema, format_settings.capn_proto); + capnp::MallocMessageBuilder message; } void CapnProtoRowOutputFormat::write(const Columns & columns, size_t row_num) { capnp::MallocMessageBuilder message; - /// Temporary storage for data that will be outputted in fields with CapnProto type TEXT. - /// See comment in convertToDynamicValue() for more details. - std::vector> temporary_text_data_storage; capnp::DynamicStruct::Builder root = message.initRoot(schema); - - /// Some columns can share same field builder. For example when we have - /// column with Nested type that was flattened into several columns. - std::unordered_map field_builders; - for (size_t i = 0; i != columns.size(); ++i) - { - auto [struct_builder, field] = getStructBuilderAndFieldByColumnName(root, column_names[i]); - if (!field_builders.contains(field.getIndex())) - { - auto field_builder = initStructFieldBuilder(columns[i], row_num, struct_builder, field); - field_builders[field.getIndex()] = field_builder; - } - auto value = convertToDynamicValue(columns[i], column_types[i], row_num, column_names[i], field_builders[field.getIndex()], format_settings.capn_proto.enum_comparing_mode, temporary_text_data_storage); - if (value) - struct_builder.set(field, *value); - } - + serializer->writeRow(columns, std::move(root), row_num); capnp::writeMessage(*output_stream, message); + } void registerOutputFormatCapnProto(FormatFactory & factory) diff --git a/src/Processors/Formats/Impl/CapnProtoRowOutputFormat.h b/src/Processors/Formats/Impl/CapnProtoRowOutputFormat.h index 5cc7099d4c7..dd9dcc6b340 100644 --- a/src/Processors/Formats/Impl/CapnProtoRowOutputFormat.h +++ b/src/Processors/Formats/Impl/CapnProtoRowOutputFormat.h @@ -3,15 +3,17 @@ #include "config.h" #if USE_CAPNP -#include -#include -#include -#include -#include -#include +# include +# include +# include +# include +# include +# include +# include namespace DB { + class CapnProtoOutputStream : public kj::OutputStream { public: @@ -43,8 +45,9 @@ private: DataTypes column_types; capnp::StructSchema schema; std::unique_ptr output_stream; - const FormatSettings format_settings; CapnProtoSchemaParser schema_parser; + std::unique_ptr serializer; + }; } diff --git a/src/Processors/Formats/Impl/ProtobufListInputFormat.cpp b/src/Processors/Formats/Impl/ProtobufListInputFormat.cpp index 9777f2361a2..6098923a195 100644 --- a/src/Processors/Formats/Impl/ProtobufListInputFormat.cpp +++ b/src/Processors/Formats/Impl/ProtobufListInputFormat.cpp @@ -88,7 +88,14 @@ void registerInputFormatProtobufList(FormatFactory & factory) }); factory.markFormatSupportsSubsetOfColumns("ProtobufList"); factory.registerAdditionalInfoForSchemaCacheGetter( - "ProtobufList", [](const FormatSettings & settings) { return fmt::format("format_schema={}", settings.schema.format_schema); }); + "ProtobufList", + [](const FormatSettings & settings) + { + return fmt::format( + "format_schema={}, skip_fields_with_unsupported_types_in_schema_inference={}", + settings.schema.format_schema, + settings.protobuf.skip_fields_with_unsupported_types_in_schema_inference); + }); } void registerProtobufListSchemaReader(FormatFactory & factory) diff --git a/src/Processors/Formats/Impl/ProtobufRowInputFormat.cpp b/src/Processors/Formats/Impl/ProtobufRowInputFormat.cpp index ee60501dba5..126f3673571 100644 --- a/src/Processors/Formats/Impl/ProtobufRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/ProtobufRowInputFormat.cpp @@ -128,7 +128,14 @@ void registerProtobufSchemaReader(FormatFactory & factory) for (const auto & name : {"Protobuf", "ProtobufSingle"}) factory.registerAdditionalInfoForSchemaCacheGetter( - name, [](const FormatSettings & settings) { return fmt::format("format_schema={}", settings.schema.format_schema); }); + name, + [](const FormatSettings & settings) + { + return fmt::format( + "format_schema={}, skip_fields_with_unsupported_types_in_schema_inference={}", + settings.schema.format_schema, + settings.protobuf.skip_fields_with_unsupported_types_in_schema_inference); + }); } } diff --git a/tests/queries/0_stateless/02030_capnp_format.sh b/tests/queries/0_stateless/02030_capnp_format.sh index c15d6fe442e..625104fb590 100755 --- a/tests/queries/0_stateless/02030_capnp_format.sh +++ b/tests/queries/0_stateless/02030_capnp_format.sh @@ -96,8 +96,8 @@ $CLICKHOUSE_CLIENT --query="SELECT * FROM file('data.capnp', 'CapnProto', 'a_b U $CLICKHOUSE_CLIENT --query="SELECT number AS a_b, number + 1 AS a_c_d, number + 2 AS a_c_e_f FROM numbers(5) FORMAT CapnProto SETTINGS format_schema='$CLIENT_SCHEMADIR/02030_capnp_nested_tuples:Message'" > $CAPN_PROTO_FILE $CLICKHOUSE_CLIENT --query="SELECT * FROM file('data.capnp', 'CapnProto', 'a Tuple(b UInt64, c Tuple(d UInt64, e Tuple(f UInt64)))') SETTINGS format_schema='$SERVER_SCHEMADIR/02030_capnp_nested_tuples:Message'" -$CLICKHOUSE_CLIENT --query="SELECT * FROM file('data.capnp', 'CapnProto', 'a Tuple(bb UInt64, c Tuple(d UInt64, e Tuple(f UInt64)))') SETTINGS format_schema='$SERVER_SCHEMADIR/02030_capnp_nested_tuples:Message'" 2>&1 | grep -F -q "CAPN_PROTO_BAD_CAST" && echo 'OK' || echo 'FAIL'; -$CLICKHOUSE_CLIENT --query="SELECT * FROM file('data.capnp', 'CapnProto', 'a Tuple(b UInt64, c Tuple(d UInt64, e Tuple(ff UInt64)))') SETTINGS format_schema='$SERVER_SCHEMADIR/02030_capnp_nested_tuples:Message'" 2>&1 | grep -F -q "CAPN_PROTO_BAD_CAST" && echo 'OK' || echo 'FAIL'; +$CLICKHOUSE_CLIENT --query="SELECT * FROM file('data.capnp', 'CapnProto', 'a Tuple(bb UInt64, c Tuple(d UInt64, e Tuple(f UInt64)))') SETTINGS format_schema='$SERVER_SCHEMADIR/02030_capnp_nested_tuples:Message'" 2>&1 | grep -F -q "THERE_IS_NO_COLUMN" && echo 'OK' || echo 'FAIL'; +$CLICKHOUSE_CLIENT --query="SELECT * FROM file('data.capnp', 'CapnProto', 'a Tuple(b UInt64, c Tuple(d UInt64, e Tuple(ff UInt64)))') SETTINGS format_schema='$SERVER_SCHEMADIR/02030_capnp_nested_tuples:Message'" 2>&1 | grep -F -q "THERE_IS_NO_COLUMN" && echo 'OK' || echo 'FAIL'; $CLICKHOUSE_CLIENT --query="SELECT * FROM file('data.capnp', 'CapnProto', 'string String') SETTINGS format_schema='$SERVER_SCHEMADIR/02030_capnp_simple_types:Message'" 2>&1 | grep -F -q "INCORRECT_DATA" && echo 'OK' || echo 'FAIL'; diff --git a/tests/queries/0_stateless/02735_capnp_case_insensitive_names_matcing.reference b/tests/queries/0_stateless/02735_capnp_case_insensitive_names_matcing.reference new file mode 100644 index 00000000000..f34c857e2f6 --- /dev/null +++ b/tests/queries/0_stateless/02735_capnp_case_insensitive_names_matcing.reference @@ -0,0 +1 @@ +42 (42,42) diff --git a/tests/queries/0_stateless/02735_capnp_case_insensitive_names_matcing.sh b/tests/queries/0_stateless/02735_capnp_case_insensitive_names_matcing.sh new file mode 100755 index 00000000000..c3835948437 --- /dev/null +++ b/tests/queries/0_stateless/02735_capnp_case_insensitive_names_matcing.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash +# Tags: no-fasttest, no-parallel, no-replicated-database + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +SCHEMADIR=$CURDIR/format_schemas +$CLICKHOUSE_LOCAL -q "select 42 as Field1, (42, 42)::Tuple(Field1 UInt32, Field2 UInt32) as Nested format CapnProto settings format_schema='$SCHEMADIR/02735_case_insensitive_names_matching:Message'" | $CLICKHOUSE_LOCAL --input-format CapnProto --structure "Field1 UInt32, Nested Tuple(Field1 UInt32, Field2 UInt32)" -q "select * from table" --format_schema="$SCHEMADIR/02735_case_insensitive_names_matching:Message" + diff --git a/tests/queries/0_stateless/02736_reading_and_writing_structure_fields.reference b/tests/queries/0_stateless/02736_reading_and_writing_structure_fields.reference new file mode 100644 index 00000000000..b6e6d485929 --- /dev/null +++ b/tests/queries/0_stateless/02736_reading_and_writing_structure_fields.reference @@ -0,0 +1,3 @@ +(42,(42,42),[(42,42),(24,24)]) [(42,(42,42),[(42,42),(24,24)]),(24,(24,24),[(24,24),(42,42)])] +42 42 42 +[42,24] [42,24] [42,24] [[42,24],[24,42]] [[42,24],[24,42]] diff --git a/tests/queries/0_stateless/02736_reading_and_writing_structure_fields.sh b/tests/queries/0_stateless/02736_reading_and_writing_structure_fields.sh new file mode 100755 index 00000000000..c669be2ed33 --- /dev/null +++ b/tests/queries/0_stateless/02736_reading_and_writing_structure_fields.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash +# Tags: no-fasttest, no-parallel, no-replicated-database + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +SCHEMADIR=$CURDIR/format_schemas +DATA_FILE=02736_$CLICKHOUSE_TEST_UNIQUE_NAME.bin + +$CLICKHOUSE_LOCAL -q "select tuple(42, tuple(42, 42), [tuple(42, 42), tuple(24, 24)]) as nested, [tuple(42, tuple(42, 42), [tuple(42, 42), tuple(24, 24)]), tuple(24, tuple(24, 24), [tuple(24, 24), tuple(42, 42)])] as nestedList format CapnProto settings format_schema='$SCHEMADIR/02736_nested_structures:Message'" > $DATA_FILE + +$CLICKHOUSE_LOCAL -q "select * from file($DATA_FILE, CapnProto) settings format_schema='$SCHEMADIR/02736_nested_structures:Message'" + +$CLICKHOUSE_LOCAL -q "select 42 as nested_field1, 42 as nested_nested_field1, 42 as nested_nested_field2 format CapnProto settings format_schema='$SCHEMADIR/02736_nested_structures:Message'" > $DATA_FILE + +$CLICKHOUSE_LOCAL -q "select * from file($DATA_FILE, CapnProto, 'nested_field1 UInt32, nested_nested_field1 UInt32, nested_nested_field2 UInt32') settings format_schema='$SCHEMADIR/02736_nested_structures:Message'" + +$CLICKHOUSE_LOCAL -q "select [42, 24] as nestedList_field1, [42, 24] as nestedList_nested_field1, [42, 24] as nestedList_nested_field2, [[42, 24], [24, 42]] as nestedList_nestedList_field1, [[42, 24], [24, 42]] as nestedList_nestedList_field2 format CapnProto settings format_schema='$SCHEMADIR/02736_nested_structures:Message'" > $DATA_FILE + +$CLICKHOUSE_LOCAL -q "select * from file($DATA_FILE, CapnProto, 'nestedList_field1 Array(UInt32), nestedList_nested_field1 Array(UInt32), nestedList_nested_field2 Array(UInt32), nestedList_nestedList_field1 Array(Array(UInt32)), nestedList_nestedList_field2 Array(Array(UInt32))') settings format_schema='$SCHEMADIR/02736_nested_structures:Message'" + +rm $DATA_FILE + diff --git a/tests/queries/0_stateless/format_schemas/02735_case_insensitive_names_matching.capnp b/tests/queries/0_stateless/format_schemas/02735_case_insensitive_names_matching.capnp new file mode 100644 index 00000000000..6b12aab081a --- /dev/null +++ b/tests/queries/0_stateless/format_schemas/02735_case_insensitive_names_matching.capnp @@ -0,0 +1,13 @@ +@0x9ef128e10a8010b8; + +struct Nested +{ + field1 @0 : UInt32; + field2 @1 : UInt32; +} + +struct Message +{ + field1 @0 : UInt32; + nested @1 : Nested; +} diff --git a/tests/queries/0_stateless/format_schemas/02736_nested_structures.capnp b/tests/queries/0_stateless/format_schemas/02736_nested_structures.capnp new file mode 100644 index 00000000000..a03eb27f383 --- /dev/null +++ b/tests/queries/0_stateless/format_schemas/02736_nested_structures.capnp @@ -0,0 +1,21 @@ +@0x9ef128e10a8010b8; + +struct Nested2 +{ + field1 @0 : UInt32; + field2 @1 : UInt32; +} + +struct Nested +{ + field1 @0 : UInt32; + nested @1 : Nested2; + nestedList @2 : List(Nested2); +} + +struct Message +{ + nested @0 : Nested; + nestedList @1 : List(Nested); +} + From c2eada7ba7bd385281c140dbf225be7eee4f1ff2 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Wed, 10 May 2023 21:07:56 +0200 Subject: [PATCH 0125/1072] Fix style --- src/Formats/CapnProtoSerializer.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Formats/CapnProtoSerializer.cpp b/src/Formats/CapnProtoSerializer.cpp index e0c8ae2a79a..c31623286d0 100644 --- a/src/Formats/CapnProtoSerializer.cpp +++ b/src/Formats/CapnProtoSerializer.cpp @@ -26,7 +26,7 @@ namespace DB namespace ErrorCodes { extern const int THERE_IS_NO_COLUMN; - extern const int BAD_TYPE_OF_FIELD; + extern const int LOGICAL_ERROR; extern const int CAPN_PROTO_BAD_CAST; extern const int INCORRECT_DATA; extern const int ILLEGAL_COLUMN; @@ -293,7 +293,7 @@ namespace return capnp::DynamicValue::Reader(capnp::DynamicEnum(enumerant)); } - throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot convert CLickHouse Enum value to CapnProto Enum"); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot convert ClickHouse Enum value to CapnProto Enum"); } void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override From cc7cfa050f5723fa4bfeca994a04784732950968 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Wed, 10 May 2023 21:08:12 +0200 Subject: [PATCH 0126/1072] Fix style --- src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp b/src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp index e686ae86997..c056ee2b4a4 100644 --- a/src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp @@ -14,7 +14,6 @@ namespace DB namespace ErrorCodes { - extern const int LOGICAL_ERROR; extern const int INCORRECT_DATA; } From 1347dc4ede100dbfc7240fa7ead23b13c924d202 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Wed, 10 May 2023 21:08:31 +0200 Subject: [PATCH 0127/1072] Fix style --- src/Processors/Formats/Impl/CapnProtoRowOutputFormat.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/Processors/Formats/Impl/CapnProtoRowOutputFormat.cpp b/src/Processors/Formats/Impl/CapnProtoRowOutputFormat.cpp index 7dd18be27f4..66a7160dd89 100644 --- a/src/Processors/Formats/Impl/CapnProtoRowOutputFormat.cpp +++ b/src/Processors/Formats/Impl/CapnProtoRowOutputFormat.cpp @@ -11,12 +11,6 @@ namespace DB { -namespace ErrorCodes -{ - extern const int LOGICAL_ERROR; -} - - CapnProtoOutputStream::CapnProtoOutputStream(WriteBuffer & out_) : out(out_) { } From a89a8b8d50f8ff5c05ebbcdb83f19dcac6739dbf Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 11 May 2023 12:08:50 +0000 Subject: [PATCH 0128/1072] Fix build --- src/Formats/CapnProtoSerializer.cpp | 6 ++++++ src/Formats/CapnProtoSerializer.h | 4 ++++ 2 files changed, 10 insertions(+) diff --git a/src/Formats/CapnProtoSerializer.cpp b/src/Formats/CapnProtoSerializer.cpp index c31623286d0..00ccfc7717d 100644 --- a/src/Formats/CapnProtoSerializer.cpp +++ b/src/Formats/CapnProtoSerializer.cpp @@ -1,3 +1,7 @@ +#include "config.h" + +#if USE_CAPNP + #include #include #include @@ -1216,3 +1220,5 @@ void CapnProtoSerializer::readRow(MutableColumns & columns, capnp::DynamicStruct CapnProtoSerializer::~CapnProtoSerializer() = default; } + +#endif diff --git a/src/Formats/CapnProtoSerializer.h b/src/Formats/CapnProtoSerializer.h index efae797875b..692f5e5301f 100644 --- a/src/Formats/CapnProtoSerializer.h +++ b/src/Formats/CapnProtoSerializer.h @@ -1,5 +1,7 @@ #pragma once +#if USE_CAPNP + #include #include @@ -23,3 +25,5 @@ private: }; } + +#endif From 5f1ca61d090b70ecee8f70d8e3656195e13f0ee9 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Fri, 12 May 2023 16:12:01 +0200 Subject: [PATCH 0129/1072] Fix special builds --- src/Formats/CapnProtoSerializer.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Formats/CapnProtoSerializer.cpp b/src/Formats/CapnProtoSerializer.cpp index 00ccfc7717d..091e70da656 100644 --- a/src/Formats/CapnProtoSerializer.cpp +++ b/src/Formats/CapnProtoSerializer.cpp @@ -1007,7 +1007,7 @@ namespace catch (Exception & e) { e.addMessage("(while converting column {})", column_name); - throw e; + throw std::move(e); } } @@ -1015,7 +1015,7 @@ namespace { assert(builder); auto & struct_builder = assert_cast(*builder); - if (auto tuple_column = typeid_cast(column.get())) + if (auto * tuple_column = typeid_cast(column.get())) writeRow(tuple_column->getColumnsCopy(), struct_builder, row_num); else writeRow(Columns{column}, struct_builder, row_num); From 94ef08977ae88a48f95343a7b27abed6471efbe6 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Fri, 12 May 2023 18:53:51 +0200 Subject: [PATCH 0130/1072] Fix special build --- src/Formats/CapnProtoSchema.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Formats/CapnProtoSchema.cpp b/src/Formats/CapnProtoSchema.cpp index 22518d5061a..f9ab88d39ed 100644 --- a/src/Formats/CapnProtoSchema.cpp +++ b/src/Formats/CapnProtoSchema.cpp @@ -151,7 +151,7 @@ namespace { template - static DataTypePtr getEnumDataTypeFromEnumerants(const capnp::EnumSchema::EnumerantList & enumerants) + DataTypePtr getEnumDataTypeFromEnumerants(const capnp::EnumSchema::EnumerantList & enumerants) { std::vector> values; for (auto enumerant : enumerants) @@ -159,7 +159,7 @@ namespace return std::make_shared>(std::move(values)); } - static DataTypePtr getEnumDataTypeFromEnumSchema(const capnp::EnumSchema & enum_schema) + DataTypePtr getEnumDataTypeFromEnumSchema(const capnp::EnumSchema & enum_schema) { auto enumerants = enum_schema.getEnumerants(); if (enumerants.size() < 128) @@ -170,7 +170,7 @@ namespace throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "ClickHouse supports only 8 and 16-bit Enums"); } - static DataTypePtr getDataTypeFromCapnProtoType(const capnp::Type & capnp_type, bool skip_unsupported_fields) + DataTypePtr getDataTypeFromCapnProtoType(const capnp::Type & capnp_type, bool skip_unsupported_fields) { switch (capnp_type.which()) { From f76fc5e06682fb7931fc067bcbc38960e91dea7b Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Fri, 12 May 2023 18:54:38 +0200 Subject: [PATCH 0131/1072] Fix special build --- src/Formats/CapnProtoSerializer.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Formats/CapnProtoSerializer.cpp b/src/Formats/CapnProtoSerializer.cpp index 091e70da656..ff3880976c7 100644 --- a/src/Formats/CapnProtoSerializer.cpp +++ b/src/Formats/CapnProtoSerializer.cpp @@ -158,7 +158,7 @@ namespace }; template - static std::unique_ptr createIntegerSerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type) + std::unique_ptr createIntegerSerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type) { switch (capnp_type.which()) { @@ -1015,7 +1015,7 @@ namespace { assert(builder); auto & struct_builder = assert_cast(*builder); - if (auto * tuple_column = typeid_cast(column.get())) + if (const auto * tuple_column = typeid_cast(column.get())) writeRow(tuple_column->getColumnsCopy(), struct_builder, row_num); else writeRow(Columns{column}, struct_builder, row_num); From 75791d7a63b9a6a579e1f036cdffc321bcc9fa2d Mon Sep 17 00:00:00 2001 From: Alexey Gerasimchuck Date: Thu, 25 May 2023 07:51:32 +0000 Subject: [PATCH 0132/1072] Added input_format_csv_trim_whitespaces parameter --- docs/en/interfaces/formats.md | 1 + .../operations/settings/settings-formats.md | 32 +++++ docs/ru/interfaces/formats.md | 17 +++ docs/ru/operations/settings/settings.md | 62 +++++++++ src/Core/Settings.h | 1 + src/Formats/FormatFactory.cpp | 1 + src/Formats/FormatSettings.h | 1 + src/IO/ReadHelpers.cpp | 19 +-- .../Formats/Impl/CSVRowInputFormat.cpp | 17 ++- .../02764_csv_trim_whitespaces.reference | 122 ++++++++++++++++++ .../0_stateless/02764_csv_trim_whitespaces.sh | 55 ++++++++ 11 files changed, 317 insertions(+), 11 deletions(-) create mode 100644 tests/queries/0_stateless/02764_csv_trim_whitespaces.reference create mode 100755 tests/queries/0_stateless/02764_csv_trim_whitespaces.sh diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index 2ab9e8caec4..f19fd94dcd8 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -467,6 +467,7 @@ The CSV format supports the output of totals and extremes the same way as `TabSe - [output_format_csv_crlf_end_of_line](/docs/en/operations/settings/settings-formats.md/#output_format_csv_crlf_end_of_line) - if it is set to true, end of line in CSV output format will be `\r\n` instead of `\n`. Default value - `false`. - [input_format_csv_skip_first_lines](/docs/en/operations/settings/settings-formats.md/#input_format_csv_skip_first_lines) - skip the specified number of lines at the beginning of data. Default value - `0`. - [input_format_csv_detect_header](/docs/en/operations/settings/settings-formats.md/#input_format_csv_detect_header) - automatically detect header with names and types in CSV format. Default value - `true`. +- [input_format_csv_trim_whitespaces](/docs/en/operations/settings/settings-formats.md/#input_format_csv_trim_whitespaces) - trim spaces and tabs in non-quoted CSV strings. Default value - `true`. ## CSVWithNames {#csvwithnames} diff --git a/docs/en/operations/settings/settings-formats.md b/docs/en/operations/settings/settings-formats.md index 3b87b829c92..cb7d98a4876 100644 --- a/docs/en/operations/settings/settings-formats.md +++ b/docs/en/operations/settings/settings-formats.md @@ -882,6 +882,38 @@ My NULL My NULL ``` +### input_format_csv_trim_whitespaces {#input_format_csv_trim_whitespaces} + +Trims spaces and tabs in non-quoted CSV strings. + +Default value: `true`. + +**Examples** + +Query + +```bash +echo ' string ' |./clickhouse local -q "select * from table FORMAT CSV" --input-format="CSV" --input_format_csv_trim_whitespaces=true +``` + +Result + +```text +"string" +``` + +Query + +```bash +echo ' string ' | ./clickhouse local -q "select * from table FORMAT CSV" --input-format="CSV" --input_format_csv_trim_whitespaces=false +``` + +Result + +```text +" string " +``` + ## Values format settings {#values-format-settings} ### input_format_values_interpret_expressions {#input_format_values_interpret_expressions} diff --git a/docs/ru/interfaces/formats.md b/docs/ru/interfaces/formats.md index bef5c223281..48a6132170a 100644 --- a/docs/ru/interfaces/formats.md +++ b/docs/ru/interfaces/formats.md @@ -387,6 +387,23 @@ $ clickhouse-client --format_csv_delimiter="|" --query="INSERT INTO test.csv FOR Формат CSV поддерживает вывод totals и extremes аналогично `TabSeparated`. + +### CSV опции форматирования {#csv-format-settings} + +- [format_csv_delimiter](../operations/settings/settings.md#format_csv_delimiter) - символ, который будет считаться разделителем в CSV данных. Значение по умолчанию - `,`. +- [format_csv_allow_single_quotes](../operations/settings/settings.md#format_csv_allow_single_quotes) - разрешить строки в одинарных кавычках. Значение по умолчанию - `true`. +- [format_csv_allow_double_quotes](../operations/settings/settings.md#format_csv_allow_double_quotes) - разрешить строки в двойных кавычках. Значение по умолчанию - `true`. +- [format_csv_null_representation](../operations/settings/settings.md#format_tsv_null_representation) - пользовательское представление NULL в формате CSV. Значение по умолчанию - `\N`. +- [input_format_csv_empty_as_default](../operations/settings/settings.md#input_format_csv_empty_as_default) - рассматривать пустые поля в CSV в качестве значений по умолчанию. Значение по умолчанию - `true`. Для сложных выражений по умолчанию необходимо также включить [input_format_defaults_for_omitted_fields](../operations/settings/settings.md#input_format_defaults_for_omitted_fields). +- [input_format_csv_enum_as_number](../operations/settings/settings.md#input_format_csv_enum_as_number) - рассматривать вставленные значения enum в форматах CSV как индексы enum. Значение по умолчанию - `false`. +- [input_format_csv_use_best_effort_in_schema_inference](../operations/settings/settings.md#input_format_csv_use_best_effort_in_schema_inference) - использовать некоторые твики и эвристики для вывода схемы в формате CSV. Если параметр отключен, все поля будут определяться как строки. Значение по умолчанию - `true`. +- [input_format_csv_arrays_as_nested_csv](../operations/settings/settings.md#input_format_csv_arrays_as_nested_csv) - при чтении массива из CSV ожидать, что его элементы были сериализованы во вложенный CSV и затем помещены в строку. Значение по умолчанию - `false`. +- [output_format_csv_crlf_end_of_line](../operations/settings/settings.md#output_format_csv_crlf_end_of_line) - если установлено значение true, конец строки в формате вывода CSV будет `\r\n` вместо `\n`. Значение по умолчанию - `false`. +- [input_format_csv_skip_first_lines](../operations/settings/settings.md#input_format_csv_skip_first_lines) - пропустить указанное количество строк в начале данных. Значение по умолчанию - `0`. +- [input_format_csv_detect_header](../operations/settings/settings.md#input_format_csv_detect_header) - обнаружить заголовок с именами и типами в формате CSV. Значение по умолчанию - `true`. +- [input_format_csv_trim_whitespaces](../operations/settings/settings.md#input_format_csv_trim_whitespaces) - удалить пробелы и символы табуляции из строк без кавычек. +Значение по умолчанию - `true`. + ## CSVWithNames {#csvwithnames} Выводит также заголовок, аналогично [TabSeparatedWithNames](#tabseparatedwithnames). diff --git a/docs/ru/operations/settings/settings.md b/docs/ru/operations/settings/settings.md index fa3ea582c55..e9b7091c8b8 100644 --- a/docs/ru/operations/settings/settings.md +++ b/docs/ru/operations/settings/settings.md @@ -1589,6 +1589,24 @@ SELECT area/period FROM account_orders FORMAT JSON; Символ, интерпретируемый как разделитель в данных формата CSV. По умолчанию — `,`. +## format_csv_allow_double_quotes {#format_csv_allow_double_quotes} + +Если установлено значение true, разрешить строки в двойных кавычках. + +Включено по умолчанию. + +## input_format_csv_empty_as_default {#input_format_csv_empty_as_default} + +Если включено, заменяет пустые поля ввода в CSV значениями по умолчанию. Для сложных выражений по умолчанию `input_format_defaults_for_omitted_fields` также должен быть включен. + +Включено по умолчанию. + +## input_format_csv_arrays_as_nested_csv {#input_format_csv_arrays_as_nested_csv} + +При чтении массива из CSV ожидайте, что его элементы были сериализованы во вложенный CSV, а затем помещены в строку. Пример: "[""Hello"", ""world"", ""42"""" TV""]". Скобки вокруг массива могут быть опущены. + +По умолчанию отключены. + ## input_format_csv_unquoted_null_literal_as_null {#settings-input_format_csv_unquoted_null_literal_as_null} Для формата CSV включает или выключает парсинг неэкранированной строки `NULL` как литерала (синоним для `\N`) @@ -1665,6 +1683,50 @@ SELECT * FROM table_with_enum_column_for_csv_insert; Использовать в качестве разделителя строк для CSV формата CRLF (DOS/Windows стиль) вместо LF (Unix стиль). +## input_format_csv_detect_header {#input_format_csv_detect_header} + +Обнаружить заголовок с именами и типами в формате CSV. + +Значение по умолчанию - `true`. + +## input_format_csv_skip_first_lines {#input_format_csv_skip_first_lines} + +Количество строк, пропускаемых в начале данных в формате ввода CSV. + +Значение по умолчанию: `0`. + +## input_format_csv_trim_whitespaces {#input_format_csv_trim_whitespaces} + +Удалить пробелы и символы табуляции из строк без кавычек. + +Значение по умолчанию: `true`. + +**Примеры** + +Запрос + +```bash +echo ' string ' |./clickhouse local -q "select * from table FORMAT CSV" --input-format="CSV" --input_format_csv_trim_whitespaces=true +``` + +Результат + +```text +"string" +``` + +Запрос + +```bash +echo ' string ' | ./clickhouse local -q "select * from table FORMAT CSV" --input-format="CSV" --input_format_csv_trim_whitespaces=false +``` + +Результат + +```text +" string " +``` + ## output_format_tsv_crlf_end_of_line {#settings-output-format-tsv-crlf-end-of-line} Использовать в качестве разделителя строк для TSV формата CRLF (DOC/Windows стиль) вместо LF (Unix стиль). diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 1df0a8af24f..750b6e16c4b 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -844,6 +844,7 @@ class IColumn; M(Bool, input_format_csv_use_best_effort_in_schema_inference, true, "Use some tweaks and heuristics to infer schema in CSV format", 0) \ M(Bool, input_format_tsv_use_best_effort_in_schema_inference, true, "Use some tweaks and heuristics to infer schema in TSV format", 0) \ M(Bool, input_format_csv_detect_header, true, "Automatically detect header with names and types in CSV format", 0) \ + M(Bool, input_format_csv_trim_whitespaces, true, "Trims spaces and tabs (\\t) characters at the beginning and end in CSV strings", 0) \ M(Bool, input_format_tsv_detect_header, true, "Automatically detect header with names and types in TSV format", 0) \ M(Bool, input_format_custom_detect_header, true, "Automatically detect header with names and types in CustomSeparated format", 0) \ M(Bool, input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference, false, "Skip columns with unsupported types while schema inference for format Parquet", 0) \ diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index dd6252b96f1..ada8751545c 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -69,6 +69,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.csv.use_best_effort_in_schema_inference = settings.input_format_csv_use_best_effort_in_schema_inference; format_settings.csv.skip_first_lines = settings.input_format_csv_skip_first_lines; format_settings.csv.try_detect_header = settings.input_format_csv_detect_header; + format_settings.csv.trim_whitespaces = settings.input_format_csv_trim_whitespaces; format_settings.hive_text.fields_delimiter = settings.input_format_hive_text_fields_delimiter; format_settings.hive_text.collection_items_delimiter = settings.input_format_hive_text_collection_items_delimiter; format_settings.hive_text.map_keys_delimiter = settings.input_format_hive_text_map_keys_delimiter; diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index c88af650671..3ae579cd552 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -136,6 +136,7 @@ struct FormatSettings UInt64 skip_first_lines = 0; String custom_delimiter; bool try_detect_header = true; + bool trim_whitespaces = true; } csv; struct HiveText diff --git a/src/IO/ReadHelpers.cpp b/src/IO/ReadHelpers.cpp index 99d25ee6613..8dc05e75855 100644 --- a/src/IO/ReadHelpers.cpp +++ b/src/IO/ReadHelpers.cpp @@ -849,15 +849,18 @@ void readCSVStringInto(Vector & s, ReadBuffer & buf, const FormatSettings::CSV & if constexpr (WithResize) { - /** CSV format can contain insignificant spaces and tabs. - * Usually the task of skipping them is for the calling code. - * But in this case, it will be difficult to do this, so remove the trailing whitespace by ourself. - */ - size_t size = s.size(); - while (size > 0 && (s[size - 1] == ' ' || s[size - 1] == '\t')) - --size; + if (settings.trim_whitespaces) [[likely]] + { + /** CSV format can contain insignificant spaces and tabs. + * Usually the task of skipping them is for the calling code. + * But in this case, it will be difficult to do this, so remove the trailing whitespace by ourself. + */ + size_t size = s.size(); + while (size > 0 && (s[size - 1] == ' ' || s[size - 1] == '\t')) + --size; - s.resize(size); + s.resize(size); + } } return; } diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp index de955d81651..9922bd41442 100644 --- a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp @@ -10,6 +10,7 @@ #include #include #include +#include namespace DB @@ -152,7 +153,9 @@ void CSVFormatReader::skipFieldDelimiter() template String CSVFormatReader::readCSVFieldIntoString() { - skipWhitespacesAndTabs(*buf); + if (format_settings.csv.trim_whitespaces) [[likely]] + skipWhitespacesAndTabs(*buf); + String field; if constexpr (read_string) readCSVString(field, *buf, format_settings.csv); @@ -200,7 +203,6 @@ void CSVFormatReader::skipHeaderRow() template std::vector CSVFormatReader::readRowImpl() { - std::vector fields; do { @@ -280,7 +282,16 @@ bool CSVFormatReader::readField( bool is_last_file_column, const String & /*column_name*/) { - skipWhitespacesAndTabs(*buf); + if (format_settings.csv.trim_whitespaces) [[likely]] + skipWhitespacesAndTabs(*buf); + else if (type->isNullable()) + { + auto nested_type = typeid_cast(type.get())->getNestedType(); + if (!isStringOrFixedString(nested_type)) + skipWhitespacesAndTabs(*buf); + } + else if (!isStringOrFixedString(type)) + skipWhitespacesAndTabs(*buf); const bool at_delimiter = !buf->eof() && *buf->position() == format_settings.csv.delimiter; const bool at_last_column_line_end = is_last_file_column && (buf->eof() || *buf->position() == '\n' || *buf->position() == '\r'); diff --git a/tests/queries/0_stateless/02764_csv_trim_whitespaces.reference b/tests/queries/0_stateless/02764_csv_trim_whitespaces.reference new file mode 100644 index 00000000000..a2ea31ddae7 --- /dev/null +++ b/tests/queries/0_stateless/02764_csv_trim_whitespaces.reference @@ -0,0 +1,122 @@ +" trim_false_tab_left" +"trim_false_tab_right " +"trim_false_ tab_middle" +" trim_false_ tab_everywhere " +" trim_false_fixed_string_ tab_everywhere " +" quoted_trim_false_ tab_everywhere " +" trim_false_csv_field1 ","123 ","5.0 "," 12.0123"," quoted_string1" +" trim_false_csv_field2 "," 321 "," 0.5","21.321 "," quoted_ string2 " +" trim_false_csv_field1_with_ structure ",123,5,12.0123," quoted_string " +" trim_false_csv_field2_with_structure ",321,0.5,21.321," quoted_ _string2 " +" trim_false_space_left" +"trim_false_space_right " +"trim_false_ space_middle" +" trim_false_ space_everywhere " +" trim_false_fixed_string_ space_everywhere " +" quoted_trim_false_ space_everywhere " +" trim_false_csv_field1 ","123 ","5.0 "," 12.0123"," quoted_string1" +" trim_false_csv_field2 "," 321 "," 0.5","21.321 "," quoted_ string2 " +" trim_false_csv_field1_with_ structure ",123,5,12.0123," quoted_string " +" trim_false_csv_field2_with_structure ",321,0.5,21.321," quoted_ _string2 " +" trim_false_tab_space_left" +"trim_false_tab_space_right " +"trim_false_ tab_space_middle" +" trim_false_ tab_space_everywhere " +" trim_false_fixed_string_ tab_space_everywhere " +" quoted_trim_false_ tab_space_everywhere " +" trim_false_csv_field1 ","123 ","5.0 "," 12.0123"," quoted_string1" +" trim_false_csv_field2 "," 321 "," 0.5","21.321 "," quoted_ string2 " +" trim_false_csv_field1_with_ structure ",123,5,12.0123," quoted_string " +" trim_false_csv_field2_with_structure ",321,0.5,21.321," quoted_ _string2 " +8 +8 +16 +16 +32.32 +32.32 +64.64 +64.64 +"2023-05-22" +"2023-05-22" +"2023-05-22" +"2023-05-22" +"2023-05-22 00:00:00" +"2023-05-22 00:00:00" +"2023-05-22 00:00:00.000" +"2023-05-22 00:00:00.000" +"trim_true_tab_left" +"trim_true_tab_right" +"trim_true_ tab_middle" +"trim_true_ tab_everywhere" +"trim_true_fixed_string_ tab_everywhere" +" quoted_trim_true_ tab_everywhere " +"trim_true_csv_field1",123,5,12.0123," quoted_string1" +"trim_true_csv_field2",321,0.5,21.321," quoted_ string2 " +"trim_true_csv_field1_with_ structure",123,5,12.0123," quoted_string " +"trim_true_csv_field2_with_structure",321,0.5,21.321," quoted_ _string2 " +"trim_true_space_left" +"trim_true_space_right" +"trim_true_ space_middle" +"trim_true_ space_everywhere" +"trim_true_fixed_string_ space_everywhere" +" quoted_trim_true_ space_everywhere " +"trim_true_csv_field1",123,5,12.0123," quoted_string1" +"trim_true_csv_field2",321,0.5,21.321," quoted_ string2 " +"trim_true_csv_field1_with_ structure",123,5,12.0123," quoted_string " +"trim_true_csv_field2_with_structure",321,0.5,21.321," quoted_ _string2 " +"trim_true_tab_space_left" +"trim_true_tab_space_right" +"trim_true_ tab_space_middle" +"trim_true_ tab_space_everywhere" +"trim_true_fixed_string_ tab_space_everywhere" +" quoted_trim_true_ tab_space_everywhere " +"trim_true_csv_field1",123,5,12.0123," quoted_string1" +"trim_true_csv_field2",321,0.5,21.321," quoted_ string2 " +"trim_true_csv_field1_with_ structure",123,5,12.0123," quoted_string " +"trim_true_csv_field2_with_structure",321,0.5,21.321," quoted_ _string2 " +8 +8 +16 +16 +32.32 +32.32 +64.64 +64.64 +"2023-05-22" +"2023-05-22" +"2023-05-22" +"2023-05-22" +"2023-05-22 00:00:00" +"2023-05-22 00:00:00" +"2023-05-22 00:00:00.000" +"2023-05-22 00:00:00.000" +" custom_csv_tab_left" +"custom_csv_tab_right " +"custom_csv_ tab_middle" +" custom_csv_ tab_everywhere " +" custom_csv_fixed_string_ tab_everywhere " +" quoted_custom_csv_ tab_everywhere " +" custom_csv_field_with_ structure ",123,5,12.0123," custom_csv_quoted_string " +" custom_csv_field2_with_structure ",321,0.5,21.321," custom_csv_quoted_ _string2 " +" custom_csv_field_with_ structure ",123,5,12.0123," custom_csv_quoted_string " +" custom_csv_field2_with_structure ",321,0.5,21.321," custom_csv_quoted_ _string2 " +" custom_csv_space_left" +"custom_csv_space_right " +"custom_csv_ space_middle" +" custom_csv_ space_everywhere " +" custom_csv_fixed_string_ space_everywhere " +" quoted_custom_csv_ space_everywhere " +" custom_csv_field_with_ structure ",123,5,12.0123," custom_csv_quoted_string " +" custom_csv_field2_with_structure ",321,0.5,21.321," custom_csv_quoted_ _string2 " +" custom_csv_field_with_ structure ",123,5,12.0123," custom_csv_quoted_string " +" custom_csv_field2_with_structure ",321,0.5,21.321," custom_csv_quoted_ _string2 " +" custom_csv_tab_space_left" +"custom_csv_tab_space_right " +"custom_csv_ tab_space_middle" +" custom_csv_ tab_space_everywhere " +" custom_csv_fixed_string_ tab_space_everywhere " +" quoted_custom_csv_ tab_space_everywhere " +" custom_csv_field_with_ structure ",123,5,12.0123," custom_csv_quoted_string " +" custom_csv_field2_with_structure ",321,0.5,21.321," custom_csv_quoted_ _string2 " +" custom_csv_field_with_ structure ",123,5,12.0123," custom_csv_quoted_string " +" custom_csv_field2_with_structure ",321,0.5,21.321," custom_csv_quoted_ _string2 " diff --git a/tests/queries/0_stateless/02764_csv_trim_whitespaces.sh b/tests/queries/0_stateless/02764_csv_trim_whitespaces.sh new file mode 100755 index 00000000000..9b3b7231f6e --- /dev/null +++ b/tests/queries/0_stateless/02764_csv_trim_whitespaces.sh @@ -0,0 +1,55 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +BOOLS=('false' 'true') + +WHITESPACES=( '\t' ' ' '\t ') +WHITESPACES_NAMES=('tab' 'space' 'tab_space') + +DATA_TYPES=( 'Int8' 'Int16' 'Float32' 'Float64' 'Date' 'Date32' 'DateTime' 'DateTime64') +DATA_VALUES=( '8' '16' '32.32' '64.64' '2023-05-22' '2023-05-22' '2023-05-22 00:00:00' '2023-05-22 00:00:00.000') + +for trim in "${BOOLS[@]}" +do + for wsIndex in "${!WHITESPACES[@]}"; + do + whitespace=${WHITESPACES[$wsIndex]} + whitespace_name=${WHITESPACES_NAMES[$wsIndex]} + echo -e "${whitespace}trim_${trim}_${whitespace_name}_left" | $CLICKHOUSE_LOCAL --input_format_csv_trim_whitespaces=${trim} --input-format="CSV" -q "select * from table FORMAT CSV" + echo -e "trim_${trim}_${whitespace_name}_right${whitespace}" | $CLICKHOUSE_LOCAL --input_format_csv_trim_whitespaces=${trim} --input-format="CSV" -q "select * from table FORMAT CSV" + echo -e "trim_${trim}_${whitespace}${whitespace_name}_middle" | $CLICKHOUSE_LOCAL --input_format_csv_trim_whitespaces=${trim} --input-format="CSV" -q "select * from table FORMAT CSV" + echo -e "${whitespace}trim_${trim}_${whitespace}${whitespace_name}_everywhere${whitespace}" | $CLICKHOUSE_LOCAL --input_format_csv_trim_whitespaces=${trim} --input-format="CSV" -q "select * from table FORMAT CSV" + echo -e "${whitespace}trim_${trim}_fixed_string_${whitespace}${whitespace_name}_everywhere${whitespace}" | $CLICKHOUSE_LOCAL -S "c1 FixedString(64)" --input_format_csv_trim_whitespaces=${trim} --input-format="CSV" -q "select toString(c1) from table FORMAT CSV" + echo -e "\"${whitespace}quoted_trim_${trim}_${whitespace}${whitespace_name}_everywhere${whitespace}\"" | $CLICKHOUSE_LOCAL --input_format_csv_trim_whitespaces=${trim} --input-format="CSV" -q "select * from table FORMAT CSV" + echo -e "${whitespace}trim_${trim}_csv_field1${whitespace},123${whitespace},5.0${whitespace},${whitespace}12.0123,\"${whitespace}quoted_string1\"\n${whitespace}trim_${trim}_csv_field2${whitespace},${whitespace}321${whitespace},${whitespace}0.5,21.321${whitespace},\"${whitespace}quoted_${whitespace}string2${whitespace}\"${whitespace}" | $CLICKHOUSE_LOCAL --input_format_csv_trim_whitespaces=${trim} --input-format="CSV" -q "select * from table FORMAT CSV" + echo -e "${whitespace}trim_${trim}_csv_field1_with_${whitespace}structure${whitespace},${whitespace}123,${whitespace}5.0${whitespace},12.0123${whitespace},\"${whitespace}quoted_string${whitespace}\"\n${whitespace}trim_${trim}_csv_field2_with_structure${whitespace},${whitespace}321${whitespace},0.5,21.321,\"${whitespace}quoted_${whitespace}_string2${whitespace}\"${whitespace}" | $CLICKHOUSE_LOCAL -S "c1 String, c2 Int32, c3 Float, c4 Double, c5 String" --input_format_csv_trim_whitespaces=${trim} --input-format="CSV" -q "select * from table FORMAT CSV" + done + + for type_index in "${!DATA_TYPES[@]}"; + do + type=${DATA_TYPES[$type_index]} + value=${DATA_VALUES[$type_index]} + echo -e "\t ${value} \t" | $CLICKHOUSE_LOCAL -S "c1 ${type}" --input-format="CSV" --input_format_csv_trim_whitespaces=${trim} -q "select * from table FORMAT CSV" + echo -e "\t ${value} \t" | $CLICKHOUSE_LOCAL -S "c1 Nullable(${type})" --input-format="CSV" --input_format_csv_trim_whitespaces=${trim} -q "select * from table FORMAT CSV" + done +done + +## Custom CSV tested with input_format_csv_trim_whitespaces = false. +## Custom CSV with input_format_csv_trim_whitespaces=true doesn't trim whitespaces from the left side at the moment +for wsIndex in "${!WHITESPACES[@]}"; +do + whitespace=${WHITESPACES[$wsIndex]} + whitespace_name=${WHITESPACES_NAMES[$wsIndex]} + echo -e "${whitespace}custom_csv_${whitespace_name}_left" | $CLICKHOUSE_LOCAL --input-format="CustomSeparated" --input_format_csv_trim_whitespaces=false --format_custom_escaping_rule=CSV --format_custom_field_delimiter=',' --format_csv_delimiter=',' -q "select * from table FORMAT CSV" + echo -e "custom_csv_${whitespace_name}_right${whitespace}" | $CLICKHOUSE_LOCAL --input-format="CustomSeparated" --input_format_csv_trim_whitespaces=false --format_custom_escaping_rule=CSV --format_custom_field_delimiter=',' --format_csv_delimiter=',' -q "select * from table FORMAT CSV" + echo -e "custom_csv_${whitespace}${whitespace_name}_middle" | $CLICKHOUSE_LOCAL --input-format="CustomSeparated" --input_format_csv_trim_whitespaces=false --format_custom_escaping_rule=CSV --format_custom_field_delimiter=',' --format_csv_delimiter=',' -q "select * from table FORMAT CSV" + echo -e "${whitespace}custom_csv_${whitespace}${whitespace_name}_everywhere${whitespace}" | $CLICKHOUSE_LOCAL --input-format="CustomSeparated" --input_format_csv_trim_whitespaces=false --format_custom_escaping_rule=CSV --format_custom_field_delimiter=',' --format_csv_delimiter=',' -q "select * from table FORMAT CSV" + echo -e "${whitespace}custom_csv_fixed_string_${whitespace}${whitespace_name}_everywhere${whitespace}" | $CLICKHOUSE_LOCAL -S "c1 FixedString(64)" --input-format="CustomSeparated" --input_format_csv_trim_whitespaces=false --format_custom_escaping_rule=CSV --format_custom_field_delimiter=',' --format_csv_delimiter=',' -q "select toString(c1) from table FORMAT CSV" + echo -e "\"${whitespace}quoted_custom_csv_${whitespace}${whitespace_name}_everywhere${whitespace}\"" | $CLICKHOUSE_LOCAL --input-format="CustomSeparated" --input_format_csv_trim_whitespaces=false --format_custom_escaping_rule=CSV --format_custom_field_delimiter=',' --format_csv_delimiter=',' -q "select * from table FORMAT CSV" + + echo -e "${whitespace}custom_csv_field_with_${whitespace}structure${whitespace},123,5.0,12.0123,\"${whitespace}custom_csv_quoted_string${whitespace}\"\n${whitespace}custom_csv_field2_with_structure${whitespace},321,0.5,21.321,\"${whitespace}custom_csv_quoted_${whitespace}_string2${whitespace}\"" | $CLICKHOUSE_LOCAL --input_format_csv_trim_whitespaces=false --input-format="CustomSeparated" --format_custom_escaping_rule=CSV --format_custom_field_delimiter=',' --format_csv_delimiter=',' -q "select * from table FORMAT CSV" + echo -e "${whitespace}custom_csv_field_with_${whitespace}structure${whitespace},123,5.0,12.0123,\"${whitespace}custom_csv_quoted_string${whitespace}\"\n${whitespace}custom_csv_field2_with_structure${whitespace},321,0.5,21.321,\"${whitespace}custom_csv_quoted_${whitespace}_string2${whitespace}\"" | $CLICKHOUSE_LOCAL -S "c1 String, c2 Int32, c3 Float, c4 Double, c5 String" --input_format_csv_trim_whitespaces=false --input-format="CustomSeparated" --format_custom_escaping_rule=CSV --format_custom_field_delimiter=',' --format_csv_delimiter=',' -q "select * from table FORMAT CSV" +done From 4eb944fef11dacc95873f9e5de9949c381d4dc79 Mon Sep 17 00:00:00 2001 From: Alexey Gerasimchuck Date: Thu, 25 May 2023 08:05:11 +0000 Subject: [PATCH 0133/1072] minor changes in documentation --- docs/en/operations/settings/settings-formats.md | 2 +- docs/ru/operations/settings/settings.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/en/operations/settings/settings-formats.md b/docs/en/operations/settings/settings-formats.md index cb7d98a4876..1db1b5066c3 100644 --- a/docs/en/operations/settings/settings-formats.md +++ b/docs/en/operations/settings/settings-formats.md @@ -893,7 +893,7 @@ Default value: `true`. Query ```bash -echo ' string ' |./clickhouse local -q "select * from table FORMAT CSV" --input-format="CSV" --input_format_csv_trim_whitespaces=true +echo ' string ' | ./clickhouse local -q "select * from table FORMAT CSV" --input-format="CSV" --input_format_csv_trim_whitespaces=true ``` Result diff --git a/docs/ru/operations/settings/settings.md b/docs/ru/operations/settings/settings.md index e9b7091c8b8..4c2117b2b87 100644 --- a/docs/ru/operations/settings/settings.md +++ b/docs/ru/operations/settings/settings.md @@ -1706,7 +1706,7 @@ SELECT * FROM table_with_enum_column_for_csv_insert; Запрос ```bash -echo ' string ' |./clickhouse local -q "select * from table FORMAT CSV" --input-format="CSV" --input_format_csv_trim_whitespaces=true +echo ' string ' | ./clickhouse local -q "select * from table FORMAT CSV" --input-format="CSV" --input_format_csv_trim_whitespaces=true ``` Результат From fd49821e98c9324e0a6db56dbbec55b52551225b Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Thu, 25 May 2023 12:46:53 +0200 Subject: [PATCH 0134/1072] Fix tests --- tests/integration/test_storage_s3/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_storage_s3/test.py b/tests/integration/test_storage_s3/test.py index d9ac70f51ad..f983bd618e3 100644 --- a/tests/integration/test_storage_s3/test.py +++ b/tests/integration/test_storage_s3/test.py @@ -1636,7 +1636,7 @@ def test_ast_auth_headers(started_cluster): filename = "test.csv" result = instance.query_and_get_error( - f"select count() from s3('http://resolver:8080/{bucket}/{filename}', 'CSV')" + f"select count() from s3('http://resolver:8080/{bucket}/{filename}', 'CSV', 'dummy String')" ) assert "HTTP response code: 403" in result From 58610f11ab90a017a1275e1a9a0b843d17e948d8 Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 25 May 2023 10:54:02 +0000 Subject: [PATCH 0135/1072] Fix tests --- tests/queries/0_stateless/01256_negative_generate_random.sql | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/queries/0_stateless/01256_negative_generate_random.sql b/tests/queries/0_stateless/01256_negative_generate_random.sql index 14f1d947108..7e05a394b8d 100644 --- a/tests/queries/0_stateless/01256_negative_generate_random.sql +++ b/tests/queries/0_stateless/01256_negative_generate_random.sql @@ -1,5 +1,4 @@ SELECT * FROM generateRandom('i8', 1, 10, 10); -- { serverError 62 } SELECT * FROM generateRandom; -- { serverError 60 } -SELECT * FROM generateRandom(); -- { serverError CANNOT_EXTRACT_TABLE_STRUCTURE } SELECT * FROM generateRandom('i8 UInt8', 1, 10, 10, 10, 10); -- { serverError 42 } SELECT * FROM generateRandom('', 1, 10, 10); -- { serverError 62 } From f48845fa0d56dc81a44fd6314342849325d78b0b Mon Sep 17 00:00:00 2001 From: Nikolay Degterinsky Date: Thu, 25 May 2023 12:23:35 +0000 Subject: [PATCH 0136/1072] Fix test once again --- .../integration/runner/compose/docker_compose_mongo_secure.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/test/integration/runner/compose/docker_compose_mongo_secure.yml b/docker/test/integration/runner/compose/docker_compose_mongo_secure.yml index f5b0ffed130..193e5d26568 100644 --- a/docker/test/integration/runner/compose/docker_compose_mongo_secure.yml +++ b/docker/test/integration/runner/compose/docker_compose_mongo_secure.yml @@ -1,7 +1,7 @@ version: '2.3' services: mongo1: - image: mongo:3.5 + image: mongo:3.6 restart: always environment: MONGO_INITDB_ROOT_USERNAME: root From 0580859e6fa70102d3cde058040c4722d51170fc Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Thu, 25 May 2023 14:05:44 +0000 Subject: [PATCH 0137/1072] Better --- src/Parsers/TokenIterator.cpp | 4 ++-- src/Parsers/TokenIterator.h | 2 +- src/Parsers/parseQuery.cpp | 4 ++-- src/Parsers/parseQuery.h | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/Parsers/TokenIterator.cpp b/src/Parsers/TokenIterator.cpp index 6b798f6f576..fa792e7c8b5 100644 --- a/src/Parsers/TokenIterator.cpp +++ b/src/Parsers/TokenIterator.cpp @@ -4,7 +4,7 @@ namespace DB { -Tokens::Tokens(const char * begin, const char * end, size_t max_query_size, bool skipp_insignificant) +Tokens::Tokens(const char * begin, const char * end, size_t max_query_size, bool skip_insignificant) { Lexer lexer(begin, end, max_query_size); @@ -13,7 +13,7 @@ Tokens::Tokens(const char * begin, const char * end, size_t max_query_size, bool { Token token = lexer.nextToken(); stop = token.isEnd() || token.type == TokenType::ErrorMaxQuerySizeExceeded; - if (token.isSignificant() || (!skipp_insignificant && !data.empty() && data.back().isSignificant())) + if (token.isSignificant() || (!skip_insignificant && !data.empty() && data.back().isSignificant())) data.emplace_back(std::move(token)); } while (!stop); } diff --git a/src/Parsers/TokenIterator.h b/src/Parsers/TokenIterator.h index 31cb644d879..192f2f55e6a 100644 --- a/src/Parsers/TokenIterator.h +++ b/src/Parsers/TokenIterator.h @@ -24,7 +24,7 @@ private: std::size_t last_accessed_index = 0; public: - Tokens(const char * begin, const char * end, size_t max_query_size = 0, bool skipp_insignificant = true); + Tokens(const char * begin, const char * end, size_t max_query_size = 0, bool skip_insignificant = true); ALWAYS_INLINE inline const Token & operator[](size_t index) { diff --git a/src/Parsers/parseQuery.cpp b/src/Parsers/parseQuery.cpp index 9f688f204a2..dd9a6023b0b 100644 --- a/src/Parsers/parseQuery.cpp +++ b/src/Parsers/parseQuery.cpp @@ -234,10 +234,10 @@ ASTPtr tryParseQuery( bool allow_multi_statements, size_t max_query_size, size_t max_parser_depth, - bool skipp_insignificant) + bool skip_insignificant) { const char * query_begin = _out_query_end; - Tokens tokens(query_begin, all_queries_end, max_query_size, skipp_insignificant); + Tokens tokens(query_begin, all_queries_end, max_query_size, skip_insignificant); /// NOTE: consider use UInt32 for max_parser_depth setting. IParser::Pos token_iterator(tokens, static_cast(max_parser_depth)); diff --git a/src/Parsers/parseQuery.h b/src/Parsers/parseQuery.h index 30f43261103..a087f145d2c 100644 --- a/src/Parsers/parseQuery.h +++ b/src/Parsers/parseQuery.h @@ -19,7 +19,7 @@ ASTPtr tryParseQuery( size_t max_query_size, /// If (end - pos) > max_query_size and query is longer than max_query_size then throws "Max query size exceeded". /// Disabled if zero. Is used in order to check query size if buffer can contains data for INSERT query. size_t max_parser_depth, - bool skipp_insignificant = true); /// If true, lexer will skip all insignificant tokens (e.g. whitespaces) + bool skip_insignificant = true); /// If true, lexer will skip all insignificant tokens (e.g. whitespaces) /// Parse query or throw an exception with error message. From ea395e9554e29f5eaa73d9fcd632f87aa4371d42 Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 25 May 2023 15:24:02 +0000 Subject: [PATCH 0138/1072] Make better --- src/Formats/CapnProtoSerializer.cpp | 1152 ++++++++++++++++++--------- src/Formats/CapnProtoSerializer.h | 1 + 2 files changed, 757 insertions(+), 396 deletions(-) diff --git a/src/Formats/CapnProtoSerializer.cpp b/src/Formats/CapnProtoSerializer.cpp index ff3880976c7..91e207a1846 100644 --- a/src/Formats/CapnProtoSerializer.cpp +++ b/src/Formats/CapnProtoSerializer.cpp @@ -10,7 +10,6 @@ #include #include #include -#include #include #include #include @@ -77,7 +76,7 @@ namespace struct ListBuilder : public FieldBuilder { - explicit ListBuilder(capnp::DynamicValue::Builder builder) : impl(builder.as()) + explicit ListBuilder(capnp::DynamicValue::Builder builder, UInt32 elements_size) : impl(builder.as()), nested_builders(elements_size) { } @@ -87,10 +86,6 @@ namespace struct StructBuilder : public FieldBuilder { - explicit StructBuilder(capnp::DynamicValue::Builder builder, size_t fields_size) : impl(builder.as()), field_builders(fields_size) - { - } - explicit StructBuilder(capnp::DynamicStruct::Builder struct_builder, size_t fields_size) : impl(std::move(struct_builder)), field_builders(fields_size) { } @@ -99,136 +94,144 @@ namespace std::vector> field_builders; }; - std::unique_ptr initStructFieldBuilderIfNeeded(const ColumnPtr & column, size_t row_num, capnp::DynamicStruct::Builder & struct_builder, const capnp::StructSchema::Field & field, const capnp::Type & capnp_type, size_t nested_fields_size) - { - switch (capnp_type.which()) - { - case capnp::schema::Type::LIST: - { - const auto * array_column = assert_cast(column.get()); - size_t size = array_column->getOffsets()[row_num] - array_column->getOffsets()[row_num - 1]; - return std::make_unique(struct_builder.init(field, static_cast(size))); - } - case capnp::schema::Type::STRUCT: - { - return std::make_unique(struct_builder.init(field), nested_fields_size); - } - default: - return nullptr; - } - } - class ICapnProtoSerializer { public: - virtual std::optional writeRow(const ColumnPtr & column, FieldBuilder * builder, size_t row_num) = 0; - virtual void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) = 0; + virtual void writeRow( + const ColumnPtr & column, + std::unique_ptr & builder, + capnp::DynamicStruct::Builder & parent_struct_builder, + UInt32 slot_offset, + size_t row_num) = 0; + + virtual void writeRow( + const ColumnPtr & column, + std::unique_ptr & builder, + capnp::DynamicList::Builder & parent_list_builder, + UInt32 array_index, + size_t row_num) = 0; + + virtual void readRow(IColumn & column, const capnp::DynamicStruct::Reader & parent_struct_reader, UInt32 slot_offset) = 0; + + virtual void readRow(IColumn & column, const capnp::DynamicList::Reader & parent_list_reader, UInt32 array_index) = 0; virtual ~ICapnProtoSerializer() = default; }; - template + template class CapnProtoIntegerSerializer : public ICapnProtoSerializer { public: - std::optional writeRow(const ColumnPtr & column, FieldBuilder *, size_t row_num) override + void writeRow(const ColumnPtr & column, std::unique_ptr &, capnp::DynamicStruct::Builder & parent_struct_builder, UInt32 slot_offset, size_t row_num) override { - if constexpr (capnp_dynamic_type == capnp::DynamicValue::Type::INT) - return capnp::DynamicValue::Reader(column->getInt(row_num)); - if constexpr (capnp_dynamic_type == capnp::DynamicValue::Type::UINT) - return capnp::DynamicValue::Reader(column->getUInt(row_num)); - return capnp::DynamicValue::Reader(column->getBool(row_num)); + auto & builder_impl = parent_struct_builder.getBuilderImpl(); + CapnProtoNumericType value = static_cast(assert_cast &>(*column).getElement(row_num)); + builder_impl.setDataField(slot_offset, value); } - void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + void writeRow(const ColumnPtr & column, std::unique_ptr &, capnp::DynamicList::Builder & parent_list_builder, UInt32 array_index, size_t row_num) override { - NumericType value; - if constexpr (capnp_dynamic_type == capnp::DynamicValue::Type::INT) - value = static_cast(reader.as()); - else if constexpr (capnp_dynamic_type == capnp::DynamicValue::Type::UINT) - value = static_cast(reader.as()); - else if constexpr (capnp_dynamic_type == capnp::DynamicValue::Type::BOOL) - value = static_cast(reader.as()); + auto & builder_impl = parent_list_builder.getBuilderImpl(); + CapnProtoNumericType value = static_cast(assert_cast &>(*column).getElement(row_num)); + builder_impl.setDataElement(array_index, value); + } - if constexpr (is_bool_data_type) + void readRow(IColumn & column, const capnp::DynamicStruct::Reader & parent_struct_reader, UInt32 slot_offset) override + { + const auto & reader_impl = parent_struct_reader.getReaderImpl(); + CapnProtoNumericType value = reader_impl.getDataField(slot_offset); + if constexpr (convert_to_bool_on_read) assert_cast(column).insertValue(static_cast(value)); else - assert_cast &>(column).insertValue(value); + assert_cast &>(column).insertValue(static_cast(value)); + } + + void readRow(IColumn & column, const capnp::DynamicList::Reader & parent_list_reader, UInt32 array_index) override + { + const auto & reader_impl = parent_list_reader.getReaderImpl(); + CapnProtoNumericType value = reader_impl.getDataElement(array_index); + if constexpr (convert_to_bool_on_read) + assert_cast(column).insertValue(static_cast(value)); + else + assert_cast &>(column).insertValue(static_cast(value)); } }; - template + template std::unique_ptr createIntegerSerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type) { switch (capnp_type.which()) { - case capnp::schema::Type::INT8: [[fallthrough]]; - case capnp::schema::Type::INT16: [[fallthrough]]; - case capnp::schema::Type::INT32: [[fallthrough]]; + case capnp::schema::Type::INT8: + return std::make_unique>(); + case capnp::schema::Type::INT16: + return std::make_unique>(); + case capnp::schema::Type::INT32: + return std::make_unique>(); case capnp::schema::Type::INT64: - return std::make_unique>(); - case capnp::schema::Type::UINT8: [[fallthrough]]; - case capnp::schema::Type::UINT16: [[fallthrough]]; - case capnp::schema::Type::UINT32: [[fallthrough]]; + return std::make_unique>(); + case capnp::schema::Type::UINT8: + return std::make_unique>(); + case capnp::schema::Type::UINT16: + return std::make_unique>(); + case capnp::schema::Type::UINT32: + return std::make_unique>(); case capnp::schema::Type::UINT64: - return std::make_unique>(); + return std::make_unique>(); case capnp::schema::Type::BOOL: - return std::make_unique>(); + return std::make_unique>(); default: throwCannotConvert(data_type, column_name, capnp_type); } } - template - class CapnProtoBigIntegerSerializer : public ICapnProtoSerializer - { - public: - CapnProtoBigIntegerSerializer(const DataTypePtr & data_type_, const String & column_name, const capnp::Type & capnp_type) : data_type(data_type_) - { - if (!capnp_type.isData()) - throwCannotConvert(data_type, column_name, capnp_type); - } - - std::optional writeRow(const ColumnPtr & column, FieldBuilder *, size_t row_num) override - { - auto data = column->getDataAt(row_num); - return capnp::DynamicValue::Reader(capnp::Data::Reader(reinterpret_cast(data.data), data.size)); - } - - void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override - { - auto value = reader.as(); - if (value.size() != sizeof(NumericType)) - throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected size of {} value: {}", data_type->getName(), value.size()); - - column.insertData(reinterpret_cast(value.begin()), value.size()); - } - - private: - DataTypePtr data_type; - }; - - template + template class CapnProtoFloatSerializer : public ICapnProtoSerializer { public: - CapnProtoFloatSerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type) + void writeRow(const ColumnPtr & column, std::unique_ptr &, capnp::DynamicStruct::Builder & parent_struct_builder, UInt32 slot_offset, size_t row_num) override { - if (!capnp_type.isFloat32() && !capnp_type.isFloat64()) - throwCannotConvert(data_type, column_name, capnp_type); + auto & builder_impl = parent_struct_builder.getBuilderImpl(); + CapnProtoFloatType value = static_cast(assert_cast &>(*column).getElement(row_num)); + builder_impl.setDataField(slot_offset, value); } - std::optional writeRow(const ColumnPtr & column, FieldBuilder *, size_t row_num) override + void writeRow(const ColumnPtr & column, std::unique_ptr &, capnp::DynamicList::Builder & parent_list_builder, UInt32 array_index, size_t row_num) override { - return capnp::DynamicValue::Reader(column->getFloat64(row_num)); + auto & builder_impl = parent_list_builder.getBuilderImpl(); + CapnProtoFloatType value = static_cast(assert_cast &>(*column).getElement(row_num)); + builder_impl.setDataElement(array_index, value); } - void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + void readRow(IColumn & column, const capnp::DynamicStruct::Reader & parent_struct_reader, UInt32 slot_offset) override { - assert_cast &>(column).insertValue(reader.as()); + const auto & reader_impl = parent_struct_reader.getReaderImpl(); + CapnProtoFloatType value = reader_impl.getDataField(slot_offset); + assert_cast &>(column).insertValue(static_cast(value)); + } + + void readRow(IColumn & column, const capnp::DynamicList::Reader & parent_list_reader, UInt32 array_index) override + { + const auto & reader_impl = parent_list_reader.getReaderImpl(); + CapnProtoFloatType value = reader_impl.getDataElement(array_index); + assert_cast &>(column).insertValue(static_cast(value)); } }; + template + std::unique_ptr createFloatSerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type) + { + switch (capnp_type.which()) + { + case capnp::schema::Type::FLOAT32: + return std::make_unique>(); + case capnp::schema::Type::FLOAT64: + return std::make_unique>(); + default: + throwCannotConvert(data_type, column_name, capnp_type); + } + } + template class CapnProtoEnumSerializer : public ICapnProtoSerializer { @@ -267,86 +270,90 @@ namespace } else { - auto names = enum_values.getSetOfAllNames(to_lower); - std::unordered_set capn_enum_names; - - for (auto enumerant : enumerants) - { - String name = enumerant.getProto().getName(); - capn_enum_names.insert(to_lower ? boost::algorithm::to_lower_copy(name) : name); - } - - if (names != capn_enum_names) + auto all_values = enum_values.getValues(); + if (all_values.size() != enumerants.size()) throw Exception( ErrorCodes::CAPN_PROTO_BAD_CAST, "The set of names in Enum from CapnProto schema is different from the set of names in ClickHouse Enum"); + + std::unordered_map ch_name_to_value; + for (auto & [name, value] : all_values) + ch_name_to_value[to_lower ? boost::algorithm::to_lower_copy(name) : name] = value; + + for (auto enumerant : enumerants) + { + String capnp_name = enumerant.getProto().getName(); + UInt16 capnp_value = enumerant.getOrdinal(); + auto it = ch_name_to_value.find(to_lower ? boost::algorithm::to_lower_copy(capnp_name) : capnp_name); + if (it == ch_name_to_value.end()) + throw Exception( + ErrorCodes::CAPN_PROTO_BAD_CAST, + "The set of names in Enum from CapnProto schema is different from the set of names in ClickHouse Enum"); + + ch_to_capnp_values[it->second] = capnp_value; + capnp_to_ch_values[capnp_value] = it->second; + } } } - std::optional writeRow(const ColumnPtr & column, FieldBuilder *, size_t row_num) override + void writeRow(const ColumnPtr & column, std::unique_ptr &, capnp::DynamicStruct::Builder & parent_struct_builder, UInt32 slot_offset, size_t row_num) override { - const auto * enum_data_type = assert_cast *>(data_type.get()); + auto & builder_impl = parent_struct_builder.getBuilderImpl(); EnumType enum_value = assert_cast &>(*column).getElement(row_num); + UInt16 capnp_value; if (enum_comparing_mode == FormatSettings::CapnProtoEnumComparingMode::BY_VALUES) - return capnp::DynamicValue::Reader(capnp::DynamicEnum(enum_schema, enum_value)); + capnp_value = static_cast(enum_value); + else + capnp_value = ch_to_capnp_values[enum_value]; - auto enum_name = enum_data_type->getNameForValue(enum_value); - for (const auto enumerant : enum_schema.getEnumerants()) - { - if (compareEnumNames(String(enum_name), enumerant.getProto().getName(), enum_comparing_mode)) - return capnp::DynamicValue::Reader(capnp::DynamicEnum(enumerant)); - } - - throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot convert ClickHouse Enum value to CapnProto Enum"); + builder_impl.setDataField(slot_offset, capnp_value); } - void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + void writeRow(const ColumnPtr & column, std::unique_ptr &, capnp::DynamicList::Builder & parent_list_builder, UInt32 array_index, size_t row_num) override { - auto enum_value = reader.as(); - auto enumerant = *kj::_::readMaybe(enum_value.getEnumerant()); - auto enum_type = assert_cast *>(data_type.get()); - DataTypePtr nested_type = std::make_shared>(); - switch (enum_comparing_mode) - { - case FormatSettings::CapnProtoEnumComparingMode::BY_VALUES: - { - assert_cast &>(column).insertValue(static_cast(enumerant.getOrdinal())); - return; - } - case FormatSettings::CapnProtoEnumComparingMode::BY_NAMES: - { - auto value = enum_type->getValue(String(enumerant.getProto().getName())); - assert_cast &>(column).insertValue(value); - return; - } - case FormatSettings::CapnProtoEnumComparingMode::BY_NAMES_CASE_INSENSITIVE: - { - /// Find the same enum name case insensitive. - String enum_name = enumerant.getProto().getName(); - for (auto & name : enum_type->getAllRegisteredNames()) - { - if (compareEnumNames(name, enum_name, enum_comparing_mode)) - { - assert_cast &>(column).insertValue(enum_type->getValue(name)); - break; - } - } - return; - } - } + auto & builder_impl = parent_list_builder.getBuilderImpl(); + EnumType enum_value = assert_cast &>(*column).getElement(row_num); + UInt16 capnp_value; + if (enum_comparing_mode == FormatSettings::CapnProtoEnumComparingMode::BY_VALUES) + capnp_value = static_cast(enum_value); + else + capnp_value = ch_to_capnp_values[enum_value]; + + builder_impl.setDataElement(array_index, capnp_value); + } + + void readRow(IColumn & column, const capnp::DynamicStruct::Reader & parent_struct_reader, UInt32 slot_offset) override + { + const auto & reader_impl = parent_struct_reader.getReaderImpl(); + UInt16 capnp_value = reader_impl.getDataField(slot_offset); + EnumType value; + if (enum_comparing_mode == FormatSettings::CapnProtoEnumComparingMode::BY_VALUES) + value = static_cast(capnp_value); + else + value = capnp_to_ch_values[capnp_value]; + + assert_cast &>(column).insertValue(value); + } + + void readRow(IColumn & column, const capnp::DynamicList::Reader & parent_list_reader, UInt32 array_index) override + { + const auto & reader_impl = parent_list_reader.getReaderImpl(); + UInt16 capnp_value = reader_impl.getDataElement(array_index); + EnumType value; + if (enum_comparing_mode == FormatSettings::CapnProtoEnumComparingMode::BY_VALUES) + value = static_cast(capnp_value); + else + value = capnp_to_ch_values[capnp_value]; + + assert_cast &>(column).insertValue(value); } private: - bool compareEnumNames(const String & first, const String & second, const FormatSettings::CapnProtoEnumComparingMode mode) - { - if (mode == FormatSettings::CapnProtoEnumComparingMode::BY_NAMES_CASE_INSENSITIVE) - return boost::algorithm::to_lower_copy(first) == boost::algorithm::to_lower_copy(second); - return first == second; - } - DataTypePtr data_type; capnp::EnumSchema enum_schema; const FormatSettings::CapnProtoEnumComparingMode enum_comparing_mode; + std::unordered_map ch_to_capnp_values; + std::unordered_map capnp_to_ch_values; }; class CapnProtoDateSerializer : public ICapnProtoSerializer @@ -358,14 +365,32 @@ namespace throwCannotConvert(data_type, column_name, capnp_type); } - std::optional writeRow(const ColumnPtr & column, FieldBuilder *, size_t row_num) override + void writeRow(const ColumnPtr & column, std::unique_ptr &, capnp::DynamicStruct::Builder & parent_struct_builder, UInt32 slot_offset, size_t row_num) override { - return capnp::DynamicValue::Reader(column->getUInt(row_num)); + auto & builder_impl = parent_struct_builder.getBuilderImpl(); + UInt16 value = assert_cast(*column).getElement(row_num); + builder_impl.setDataField(slot_offset, value); + } + + void writeRow(const ColumnPtr & column, std::unique_ptr &, capnp::DynamicList::Builder & parent_list_builder, UInt32 array_index, size_t row_num) override + { + auto & builder_impl = parent_list_builder.getBuilderImpl(); + UInt16 value = assert_cast(*column).getElement(row_num); + builder_impl.setDataElement(array_index, value); } - void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + void readRow(IColumn & column, const capnp::DynamicStruct::Reader & parent_struct_reader, UInt32 slot_offset) override { - assert_cast(column).insertValue(reader.as()); + const auto & reader_impl = parent_struct_reader.getReaderImpl(); + UInt16 value = reader_impl.getDataField(slot_offset); + assert_cast(column).insertValue(value); + } + + void readRow(IColumn & column, const capnp::DynamicList::Reader & parent_list_reader, UInt32 array_index) override + { + const auto & reader_impl = parent_list_reader.getReaderImpl(); + UInt16 value = reader_impl.getDataElement(array_index); + assert_cast(column).insertValue(value); } }; @@ -378,14 +403,32 @@ namespace throwCannotConvert(data_type, column_name, capnp_type); } - std::optional writeRow(const ColumnPtr & column, FieldBuilder *, size_t row_num) override + void writeRow(const ColumnPtr & column, std::unique_ptr &, capnp::DynamicStruct::Builder & parent_struct_builder, UInt32 slot_offset, size_t row_num) override { - return capnp::DynamicValue::Reader(column->getInt(row_num)); + auto & builder_impl = parent_struct_builder.getBuilderImpl(); + Int32 value = assert_cast(*column).getElement(row_num); + builder_impl.setDataField(slot_offset, value); } - void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + void writeRow(const ColumnPtr & column, std::unique_ptr &, capnp::DynamicList::Builder & parent_list_builder, UInt32 array_index, size_t row_num) override { - assert_cast(column).insertValue(reader.as()); + auto & builder_impl = parent_list_builder.getBuilderImpl(); + Int32 value = assert_cast(*column).getElement(row_num); + builder_impl.setDataElement(array_index, value); + } + + void readRow(IColumn & column, const capnp::DynamicStruct::Reader & parent_struct_reader, UInt32 slot_offset) override + { + const auto & reader_impl = parent_struct_reader.getReaderImpl(); + Int32 value = reader_impl.getDataField(slot_offset); + assert_cast(column).insertValue(value); + } + + void readRow(IColumn & column, const capnp::DynamicList::Reader & parent_list_reader, UInt32 array_index) override + { + const auto & reader_impl = parent_list_reader.getReaderImpl(); + Int32 value = reader_impl.getDataElement(array_index); + assert_cast(column).insertValue(value); } }; @@ -398,14 +441,32 @@ namespace throwCannotConvert(data_type, column_name, capnp_type); } - std::optional writeRow(const ColumnPtr & column, FieldBuilder *, size_t row_num) override + void writeRow(const ColumnPtr & column, std::unique_ptr &, capnp::DynamicStruct::Builder & parent_struct_builder, UInt32 slot_offset, size_t row_num) override { - return capnp::DynamicValue::Reader(column->getInt(row_num)); + auto & builder_impl = parent_struct_builder.getBuilderImpl(); + UInt32 value = assert_cast(*column).getElement(row_num); + builder_impl.setDataField(slot_offset, value); } - void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + void writeRow(const ColumnPtr & column, std::unique_ptr &, capnp::DynamicList::Builder & parent_list_builder, UInt32 array_index, size_t row_num) override { - assert_cast(column).insertValue(reader.as()); + auto & builder_impl = parent_list_builder.getBuilderImpl(); + UInt32 value = assert_cast(*column).getElement(row_num); + builder_impl.setDataElement(array_index, value); + } + + void readRow(IColumn & column, const capnp::DynamicStruct::Reader & parent_struct_reader, UInt32 slot_offset) override + { + const auto & reader_impl = parent_struct_reader.getReaderImpl(); + UInt32 value = reader_impl.getDataField(slot_offset); + assert_cast(column).insertValue(value); + } + + void readRow(IColumn & column, const capnp::DynamicList::Reader & parent_list_reader, UInt32 array_index) override + { + const auto & reader_impl = parent_list_reader.getReaderImpl(); + UInt32 value = reader_impl.getDataElement(array_index); + assert_cast(column).insertValue(value); } }; @@ -418,14 +479,32 @@ namespace throwCannotConvert(data_type, column_name, capnp_type); } - std::optional writeRow(const ColumnPtr & column, FieldBuilder *, size_t row_num) override + void writeRow(const ColumnPtr & column, std::unique_ptr &, capnp::DynamicStruct::Builder & parent_struct_builder, UInt32 slot_offset, size_t row_num) override { - return capnp::DynamicValue::Reader(column->getInt(row_num)); + auto & builder_impl = parent_struct_builder.getBuilderImpl(); + Int64 value = assert_cast(*column).getElement(row_num); + builder_impl.setDataField(slot_offset, value); } - void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + void writeRow(const ColumnPtr & column, std::unique_ptr &, capnp::DynamicList::Builder & parent_list_builder, UInt32 array_index, size_t row_num) override { - assert_cast(column).insertValue(reader.as()); + auto & builder_impl = parent_list_builder.getBuilderImpl(); + Int64 value = assert_cast(*column).getElement(row_num); + builder_impl.setDataElement(array_index, value); + } + + void readRow(IColumn & column, const capnp::DynamicStruct::Reader & parent_struct_reader, UInt32 slot_offset) override + { + const auto & reader_impl = parent_struct_reader.getReaderImpl(); + Int64 value = reader_impl.getDataField(slot_offset); + assert_cast(column).insertValue(value); + } + + void readRow(IColumn & column, const capnp::DynamicList::Reader & parent_list_reader, UInt32 array_index) override + { + const auto & reader_impl = parent_list_reader.getReaderImpl(); + Int64 value = reader_impl.getDataElement(array_index); + assert_cast(column).insertValue(value); } }; @@ -433,6 +512,8 @@ namespace class CapnProtoDecimalSerializer : public ICapnProtoSerializer { public: + using NativeType = typename DecimalType::NativeType; + CapnProtoDecimalSerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type) { auto which = WhichDataType(data_type); @@ -440,37 +521,79 @@ namespace throwCannotConvert(data_type, column_name, capnp_type); } - std::optional writeRow(const ColumnPtr & column, FieldBuilder *, size_t row_num) override + void writeRow(const ColumnPtr & column, std::unique_ptr &, capnp::DynamicStruct::Builder & parent_struct_builder, UInt32 slot_offset, size_t row_num) override { - return capnp::DynamicValue::Reader(column->getInt(row_num)); + auto & builder_impl = parent_struct_builder.getBuilderImpl(); + DecimalType value = assert_cast &>(*column).getElement(row_num); + builder_impl.setDataField(slot_offset, value); } - void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + void writeRow(const ColumnPtr & column, std::unique_ptr &, capnp::DynamicList::Builder & parent_list_builder, UInt32 array_index, size_t row_num) override { - assert_cast &>(column).insertValue(reader.as()); + auto & builder_impl = parent_list_builder.getBuilderImpl(); + DecimalType value = assert_cast &>(*column).getElement(row_num); + builder_impl.setDataElement(array_index, value); + } + + void readRow(IColumn & column, const capnp::DynamicStruct::Reader & parent_struct_reader, UInt32 slot_offset) override + { + const auto & reader_impl = parent_struct_reader.getReaderImpl(); + NativeType value = reader_impl.getDataField(slot_offset); + assert_cast &>(column).insertValue(value); + } + + void readRow(IColumn & column, const capnp::DynamicList::Reader & parent_list_reader, UInt32 array_index) override + { + const auto & reader_impl = parent_list_reader.getReaderImpl(); + NativeType value = reader_impl.getDataElement(array_index); + assert_cast &>(column).insertValue(value); } }; - template - class CapnProtoBigDecimalSerializer : public ICapnProtoSerializer + template + class CapnProtoFixedSizeRawDataSerializer : public ICapnProtoSerializer { + private: + static constexpr size_t value_size = sizeof(T); + public: - CapnProtoBigDecimalSerializer(const DataTypePtr & data_type_, const String & column_name, const capnp::Type & capnp_type) : data_type(data_type_) + CapnProtoFixedSizeRawDataSerializer(const DataTypePtr & data_type_, const String & column_name, const capnp::Type & capnp_type) : data_type(data_type_) { if (!capnp_type.isData()) throwCannotConvert(data_type, column_name, capnp_type); } - std::optional writeRow(const ColumnPtr & column, FieldBuilder *, size_t row_num) override + void writeRow(const ColumnPtr & column, std::unique_ptr &, capnp::DynamicStruct::Builder & parent_struct_builder, UInt32 slot_offset, size_t row_num) override { + auto & builder_impl = parent_struct_builder.getBuilderImpl(); auto data = column->getDataAt(row_num); - return capnp::DynamicValue::Reader(capnp::Data::Reader(reinterpret_cast(data.data), data.size)); + capnp::Data::Reader value = capnp::Data::Reader(reinterpret_cast(data.data), data.size); + builder_impl.getPointerField(slot_offset).template setBlob(value); } - void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + void writeRow(const ColumnPtr & column, std::unique_ptr &, capnp::DynamicList::Builder & parent_struct_builder, UInt32 array_index, size_t row_num) override { - auto value = reader.as(); - if (value.size() != sizeof(DecimalType)) + auto & builder_impl = parent_struct_builder.getBuilderImpl(); + auto data = column->getDataAt(row_num); + capnp::Data::Reader value = capnp::Data::Reader(reinterpret_cast(data.data), data.size); + builder_impl.getPointerElement(array_index).setBlob(value); + } + + void readRow(IColumn & column, const capnp::DynamicStruct::Reader & parent_struct_reader, UInt32 slot_offset) override + { + const auto & reader_impl = parent_struct_reader.getReaderImpl(); + capnp::Data::Reader value = reader_impl.getPointerField(slot_offset).template getBlob(nullptr, 0); + if (value.size() != value_size) + throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected size of {} value: {}", data_type->getName(), value.size()); + + column.insertData(reinterpret_cast(value.begin()), value.size()); + } + + void readRow(IColumn & column, const capnp::DynamicList::Reader & parent_list_reader, UInt32 array_index) override + { + const auto & reader_impl = parent_list_reader.getReaderImpl(); + capnp::Data::Reader value = reader_impl.getPointerElement(array_index).getBlob(nullptr, 0); + if (value.size() != value_size) throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected size of {} value: {}", data_type->getName(), value.size()); column.insertData(reinterpret_cast(value.begin()), value.size()); @@ -484,39 +607,73 @@ namespace class CapnProtoStringSerializer : public ICapnProtoSerializer { public: - CapnProtoStringSerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type_) : capnp_type(capnp_type_) + CapnProtoStringSerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type) { if (!capnp_type.isData() && !capnp_type.isText()) throwCannotConvert(data_type, column_name, capnp_type); } - std::optional writeRow(const ColumnPtr & column, FieldBuilder *, size_t row_num) override + void writeRow(const ColumnPtr & column, std::unique_ptr &, capnp::DynamicStruct::Builder & parent_struct_builder, UInt32 slot_offset, size_t row_num) override { + auto & builder_impl = parent_struct_builder.getBuilderImpl(); auto data = column->getDataAt(row_num); - - if constexpr (is_binary) - return capnp::DynamicValue::Reader(capnp::Data::Reader(reinterpret_cast(data.data), data.size)); - - /// For type TEXT data must be null-terminated, but in String column we always have 0 byte at the end of each value. - return capnp::DynamicValue::Reader(capnp::Text::Reader(data.data, data.size)); - } - - void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override - { if constexpr (is_binary) { - auto value = reader.as(); + capnp::Data::Reader value = capnp::Data::Reader(reinterpret_cast(data.data), data.size); + builder_impl.getPointerField(slot_offset).setBlob(value); + } + else + { + capnp::Text::Reader value = capnp::Text::Reader(data.data, data.size); + builder_impl.getPointerField(slot_offset).setBlob(value); + } + } + + void writeRow(const ColumnPtr & column, std::unique_ptr &, capnp::DynamicList::Builder & parent_struct_builder, UInt32 array_index, size_t row_num) override + { + auto & builder_impl = parent_struct_builder.getBuilderImpl(); + auto data = column->getDataAt(row_num); + if constexpr (is_binary) + { + capnp::Data::Reader value = capnp::Data::Reader(reinterpret_cast(data.data), data.size); + builder_impl.getPointerElement(array_index).setBlob(value); + } + else + { + capnp::Text::Reader value = capnp::Text::Reader(data.data, data.size); + builder_impl.getPointerElement(array_index).setBlob(value); + } + } + + void readRow(IColumn & column, const capnp::DynamicStruct::Reader & parent_struct_reader, UInt32 slot_offset) override + { + const auto & reader_impl = parent_struct_reader.getReaderImpl(); + if constexpr (is_binary) + { + capnp::Data::Reader value = reader_impl.getPointerField(slot_offset).getBlob(nullptr, 0); column.insertData(reinterpret_cast(value.begin()), value.size()); } else { - auto value = reader.as(); + capnp::Text::Reader value = reader_impl.getPointerField(slot_offset).getBlob(nullptr, 0); column.insertData(reinterpret_cast(value.begin()), value.size()); } } - private: - capnp::Type capnp_type; + void readRow(IColumn & column, const capnp::DynamicList::Reader & parent_list_reader, UInt32 array_index) override + { + const auto & reader_impl = parent_list_reader.getReaderImpl(); + if constexpr (is_binary) + { + capnp::Data::Reader value = reader_impl.getPointerElement(array_index).getBlob(nullptr, 0); + column.insertData(reinterpret_cast(value.begin()), value.size()); + } + else + { + capnp::Text::Reader value = reader_impl.getPointerElement(array_index).getBlob(nullptr, 0); + column.insertData(reinterpret_cast(value.begin()), value.size()); + } + } }; template @@ -529,29 +686,71 @@ namespace throwCannotConvert(data_type, column_name, capnp_type); } - std::optional writeRow(const ColumnPtr & column, FieldBuilder *, size_t row_num) override + void writeRow(const ColumnPtr & column, std::unique_ptr &, capnp::DynamicStruct::Builder & parent_struct_builder, UInt32 slot_offset, size_t row_num) override { + auto & builder_impl = parent_struct_builder.getBuilderImpl(); auto data = column->getDataAt(row_num); if constexpr (is_binary) - return capnp::DynamicValue::Reader(capnp::Data::Reader(reinterpret_cast(data.data), data.size)); - - if (data.data[data.size - 1] == 0) - return capnp::DynamicValue::Reader(capnp::Text::Reader(reinterpret_cast(data.data), data.size)); - - /// In TEXT type data should be null-terminated, but ClickHouse FixedString data could not be. - /// To make data null-terminated we should copy it to temporary String object and use it in capnp::Text::Reader. - /// Note that capnp::Text::Reader works only with pointer to the data and it's size, so we should - /// guarantee that new String object life time is longer than capnp::Text::Reader life time. - tmp_string = data.toString(); - return capnp::DynamicValue::Reader(capnp::Text::Reader(reinterpret_cast(tmp_string.data()), tmp_string.size())); + { + capnp::Data::Reader value = capnp::Data::Reader(reinterpret_cast(data.data), data.size); + builder_impl.getPointerField(slot_offset).setBlob(value); + } + else + { + if (data.data[data.size - 1] == 0) + { + capnp::Text::Reader value = capnp::Text::Reader(data.data, data.size); + builder_impl.getPointerField(slot_offset).setBlob(value); + } + else + { + /// In TEXT type data should be null-terminated, but ClickHouse FixedString data could not be. + /// To make data null-terminated we should copy it to temporary String object and use it in capnp::Text::Reader. + /// Note that capnp::Text::Reader works only with pointer to the data and it's size, so we should + /// guarantee that new String object life time is longer than capnp::Text::Reader life time. + tmp_string = data.toString(); + capnp::Text::Reader value = capnp::Text::Reader(tmp_string.data(), tmp_string.size()); + builder_impl.getPointerField(slot_offset).setBlob(value); + } + } } - void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + void writeRow(const ColumnPtr & column, std::unique_ptr &, capnp::DynamicList::Builder & parent_struct_builder, UInt32 array_index, size_t row_num) override { + auto & builder_impl = parent_struct_builder.getBuilderImpl(); + auto data = column->getDataAt(row_num); + if constexpr (is_binary) + { + capnp::Data::Reader value = capnp::Data::Reader(reinterpret_cast(data.data), data.size); + builder_impl.getPointerElement(array_index).setBlob(value); + } + else + { + if (data.data[data.size - 1] == 0) + { + capnp::Text::Reader value = capnp::Text::Reader(data.data, data.size); + builder_impl.getPointerElement(array_index).setBlob(value); + } + else + { + /// In TEXT type data should be null-terminated, but ClickHouse FixedString data could not be. + /// To make data null-terminated we should copy it to temporary String object and use it in capnp::Text::Reader. + /// Note that capnp::Text::Reader works only with pointer to the data and it's size, so we should + /// guarantee that new String object life time is longer than capnp::Text::Reader life time. + tmp_string = data.toString(); + capnp::Text::Reader value = capnp::Text::Reader(tmp_string.data(), tmp_string.size()); + builder_impl.getPointerElement(array_index).setBlob(value); + } + } + } + + void readRow(IColumn & column, const capnp::DynamicStruct::Reader & parent_struct_reader, UInt32 slot_offset) override + { + const auto & reader_impl = parent_struct_reader.getReaderImpl(); auto & fixed_string_column = assert_cast(column); if constexpr (is_binary) { - auto value = reader.as(); + capnp::Data::Reader value = reader_impl.getPointerField(slot_offset).getBlob(nullptr, 0); if (value.size() > fixed_string_column.getN()) throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot read data with size {} to FixedString with size {}", value.size(), fixed_string_column.getN()); @@ -559,7 +758,29 @@ namespace } else { - auto value = reader.as(); + capnp::Text::Reader value = reader_impl.getPointerField(slot_offset).getBlob(nullptr, 0); + if (value.size() > fixed_string_column.getN()) + throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot read data with size {} to FixedString with size {}", value.size(), fixed_string_column.getN()); + + fixed_string_column.insertData(reinterpret_cast(value.begin()), value.size()); + } + } + + void readRow(IColumn & column, const capnp::DynamicList::Reader & parent_list_reader, UInt32 array_index) override + { + const auto & reader_impl = parent_list_reader.getReaderImpl(); + auto & fixed_string_column = assert_cast(column); + if constexpr (is_binary) + { + capnp::Data::Reader value = reader_impl.getPointerElement(array_index).getBlob(nullptr, 0); + if (value.size() > fixed_string_column.getN()) + throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot read data with size {} to FixedString with size {}", value.size(), fixed_string_column.getN()); + + fixed_string_column.insertData(reinterpret_cast(value.begin()), value.size()); + } + else + { + capnp::Text::Reader value = reader_impl.getPointerElement(array_index).getBlob(nullptr, 0); if (value.size() > fixed_string_column.getN()) throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot read data with size {} to FixedString with size {}", value.size(), fixed_string_column.getN()); @@ -581,64 +802,32 @@ namespace throwCannotConvert(data_type, column_name, capnp_type); } - std::optional writeRow(const ColumnPtr & column, FieldBuilder *, size_t row_num) override + void writeRow(const ColumnPtr & column, std::unique_ptr &, capnp::DynamicStruct::Builder & parent_struct_builder, UInt32 slot_offset, size_t row_num) override { - return capnp::DynamicValue::Reader(assert_cast(*column).getElement(row_num).toUnderType()); + auto & builder_impl = parent_struct_builder.getBuilderImpl(); + UInt32 value = assert_cast(*column).getElement(row_num); + builder_impl.setDataField(slot_offset, value); } - void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + void writeRow(const ColumnPtr & column, std::unique_ptr &, capnp::DynamicList::Builder & parent_list_builder, UInt32 array_index, size_t row_num) override { - assert_cast(column).insertValue(IPv4(reader.as())); - } - }; - - class CapnProtoIPv6Serializer : public ICapnProtoSerializer - { - public: - CapnProtoIPv6Serializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type) - { - if (!capnp_type.isData()) - throwCannotConvert(data_type, column_name, capnp_type); + auto & builder_impl = parent_list_builder.getBuilderImpl(); + UInt32 value = assert_cast(*column).getElement(row_num); + builder_impl.setDataElement(array_index, value); } - std::optional writeRow(const ColumnPtr & column, FieldBuilder *, size_t row_num) override + void readRow(IColumn & column, const capnp::DynamicStruct::Reader & parent_struct_reader, UInt32 slot_offset) override { - auto data = column->getDataAt(row_num); - return capnp::DynamicValue::Reader(capnp::Data::Reader(reinterpret_cast(data.data), data.size)); + const auto & reader_impl = parent_struct_reader.getReaderImpl(); + UInt32 value = reader_impl.getDataField(slot_offset); + assert_cast(column).insertValue(IPv4(value)); } - void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + void readRow(IColumn & column, const capnp::DynamicList::Reader & parent_list_reader, UInt32 array_index) override { - auto value = reader.as(); - if (value.size() != sizeof(IPv6)) - throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected size of IPv6 value: {}", value.size()); - - column.insertData(reinterpret_cast(value.begin()), value.size()); - } - }; - - class CapnProtoUUIDSerializer : public ICapnProtoSerializer - { - public: - CapnProtoUUIDSerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type) - { - if (!capnp_type.isData()) - throwCannotConvert(data_type, column_name, capnp_type); - } - - std::optional writeRow(const ColumnPtr & column, FieldBuilder *, size_t row_num) override - { - auto data = column->getDataAt(row_num); - return capnp::DynamicValue::Reader(capnp::Data::Reader(reinterpret_cast(data.data), data.size)); - } - - void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override - { - auto value = reader.as(); - if (value.size() != sizeof(UUID)) - throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected size of UUID value: {}", value.size()); - - column.insertData(reinterpret_cast(value.begin()), value.size()); + const auto & reader_impl = parent_list_reader.getReaderImpl(); + UInt32 value = reader_impl.getDataElement(array_index); + assert_cast(column).insertValue(IPv4(value)); } }; @@ -652,19 +841,35 @@ namespace nested_serializer = createSerializer(assert_cast(*data_type).getDictionaryType(), column_name, capnp_type, settings); } - std::optional writeRow(const ColumnPtr & column, FieldBuilder * field_builder, size_t row_num) override + void writeRow(const ColumnPtr & column, std::unique_ptr & field_builder, capnp::DynamicStruct::Builder & parent_struct_builder, UInt32 slot_offset, size_t row_num) override { const auto & low_cardinality_column = assert_cast(*column); size_t index = low_cardinality_column.getIndexAt(row_num); const auto & dict_column = low_cardinality_column.getDictionary().getNestedColumn(); - return nested_serializer->writeRow(dict_column, field_builder, index); + nested_serializer->writeRow(dict_column, field_builder, parent_struct_builder, slot_offset, index); } - void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + void writeRow(const ColumnPtr & column, std::unique_ptr & field_builder, capnp::DynamicList::Builder & parent_list_builder, UInt32 array_index, size_t row_num) override + { + const auto & low_cardinality_column = assert_cast(*column); + size_t index = low_cardinality_column.getIndexAt(row_num); + const auto & dict_column = low_cardinality_column.getDictionary().getNestedColumn(); + nested_serializer->writeRow(dict_column, field_builder, parent_list_builder, array_index, index); + } + + void readRow(IColumn & column, const capnp::DynamicStruct::Reader & parent_struct_reader, UInt32 slot_offset) override { auto & low_cardinality_column = assert_cast(column); auto tmp_column = low_cardinality_column.getDictionary().getNestedColumn()->cloneEmpty(); - nested_serializer->readRow(*tmp_column, reader); + nested_serializer->readRow(*tmp_column, parent_struct_reader, slot_offset); + low_cardinality_column.insertFromFullColumn(*tmp_column, 0); + } + + void readRow(IColumn & column, const capnp::DynamicList::Reader & parent_list_reader, UInt32 array_index) override + { + auto & low_cardinality_column = assert_cast(column); + auto tmp_column = low_cardinality_column.getDictionary().getNestedColumn()->cloneEmpty(); + nested_serializer->readRow(*tmp_column, parent_list_reader, array_index); low_cardinality_column.insertFromFullColumn(*tmp_column, 0); } @@ -685,7 +890,10 @@ namespace getCapnProtoFullTypeName(capnp_type)); /// Check that struct is a named union of type VOID and one arbitrary type. - auto struct_schema = capnp_type.asStruct(); + struct_schema = capnp_type.asStruct(); + auto node = struct_schema.getProto().getStruct(); + struct_size = capnp::_::StructSize(node.getDataWordCount(), node.getPointerCount()); + discriminant_offset = node.getDiscriminantOffset(); if (!checkIfStructIsNamedUnion(struct_schema)) throw Exception( ErrorCodes::CAPN_PROTO_BAD_CAST, @@ -706,23 +914,18 @@ namespace auto first = union_fields[0]; auto second = union_fields[1]; auto nested_type = assert_cast(data_type.get())->getNestedType(); + nested_slot_offset = first.getProto().getSlot().getOffset(); /// Both fields have the same offset. if (first.getType().isVoid()) { - null_field = first; - nested_field = second; - nested_capnp_type = second.getType(); - if (nested_capnp_type.isStruct()) - nested_fields_size = nested_capnp_type.asStruct().getFields().size(); - nested_serializer = createSerializer(nested_type, column_name, nested_capnp_type, settings); + nested_serializer = createSerializer(nested_type, column_name, second.getType(), settings); + null_discriminant = 0; + nested_discriminant = 1; } else if (second.getType().isVoid()) { - null_field = second; - nested_field = first; - nested_capnp_type = first.getType(); - if (nested_capnp_type.isStruct()) - nested_fields_size = nested_capnp_type.asStruct().getFields().size(); - nested_serializer = createSerializer(nested_type, column_name, nested_capnp_type, settings); + nested_serializer = createSerializer(nested_type, column_name, first.getType(), settings); + null_discriminant = 1; + nested_discriminant = 0; } else throw Exception( @@ -733,50 +936,102 @@ namespace getCapnProtoFullTypeName(capnp_type)); } - std::optional writeRow(const ColumnPtr & column, FieldBuilder * field_builder, size_t row_num) override + void writeRow(const ColumnPtr & column, std::unique_ptr & field_builder, capnp::DynamicStruct::Builder & parent_struct_builder, UInt32 slot_offset, size_t row_num) override { - assert(field_builder); + if (!field_builder) + { + auto builder_impl = parent_struct_builder.getBuilderImpl(); + auto struct_builder_impl = capnp::DynamicStruct::Builder(struct_schema, builder_impl.getPointerField(slot_offset).initStruct(struct_size)); + field_builder = std::make_unique(std::move(struct_builder_impl), 1); + } + auto & struct_builder = assert_cast(*field_builder); + const auto & nullable_column = assert_cast(*column); if (nullable_column.isNullAt(row_num)) { - struct_builder.impl.set(null_field, capnp::Void()); + auto struct_builder_impl = struct_builder.impl.getBuilderImpl(); + struct_builder_impl.setDataField(discriminant_offset, null_discriminant); + struct_builder_impl.setDataField(nested_slot_offset, capnp::Void()); } else { - struct_builder.impl.clear(nested_field); const auto & nested_column = nullable_column.getNestedColumnPtr(); - auto nested_field_builder = initStructFieldBuilderIfNeeded(nested_column, row_num, struct_builder.impl, nested_field, nested_capnp_type, nested_fields_size); - auto value = nested_serializer->writeRow(nested_column, nested_field_builder.get(), row_num); - if (value) - struct_builder.impl.set(nested_field, *value); + struct_builder.impl.getBuilderImpl().setDataField(discriminant_offset, nested_discriminant); + nested_serializer->writeRow(nested_column, struct_builder.field_builders[0], struct_builder.impl, nested_slot_offset, row_num); } - - return std::nullopt; } - void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + void writeRow(const ColumnPtr & column, std::unique_ptr & field_builder, capnp::DynamicList::Builder & parent_list_builder, UInt32 array_index, size_t row_num) override + { + if (!field_builder) + { + auto builder_impl = parent_list_builder.getBuilderImpl(); + auto struct_builder_impl = capnp::DynamicStruct::Builder(struct_schema, builder_impl.getStructElement(array_index)); + field_builder = std::make_unique(std::move(struct_builder_impl), 1); + } + + auto & struct_builder = assert_cast(*field_builder); + + const auto & nullable_column = assert_cast(*column); + if (nullable_column.isNullAt(row_num)) + { + auto struct_builder_impl = struct_builder.impl.getBuilderImpl(); + struct_builder_impl.setDataField(discriminant_offset, null_discriminant); + struct_builder_impl.setDataField(nested_slot_offset, capnp::Void()); + } + else + { + const auto & nested_column = nullable_column.getNestedColumnPtr(); + struct_builder.impl.getBuilderImpl().setDataField(discriminant_offset, nested_discriminant); + nested_serializer->writeRow(nested_column, struct_builder.field_builders[0], struct_builder.impl, nested_slot_offset, row_num); + } + } + + void readRow(IColumn & column, const capnp::DynamicStruct::Reader & parent_struct_reader, UInt32 slot_offset) override { - auto struct_reader = reader.as(); auto & nullable_column = assert_cast(column); - auto field = *kj::_::readMaybe(struct_reader.which()); - if (field.getType().isVoid()) + auto reader_impl = parent_struct_reader.getReaderImpl(); + auto struct_reader = capnp::DynamicStruct::Reader(struct_schema, reader_impl.getPointerField(slot_offset).getStruct(nullptr)); + + auto discriminant = struct_reader.getReaderImpl().getDataField(discriminant_offset); + + if (discriminant == null_discriminant) nullable_column.insertDefault(); else { auto & nested_column = nullable_column.getNestedColumn(); - auto nested_reader = struct_reader.get(field); - nested_serializer->readRow(nested_column, nested_reader); + nested_serializer->readRow(nested_column, struct_reader, nested_slot_offset); + nullable_column.getNullMapData().push_back(0); + } + } + + void readRow(IColumn & column, const capnp::DynamicList::Reader & parent_list_reader, UInt32 array_index) override + { + auto & nullable_column = assert_cast(column); + auto reader_impl = parent_list_reader.getReaderImpl(); + auto struct_reader = capnp::DynamicStruct::Reader(struct_schema, reader_impl.getStructElement(array_index)); + + auto discriminant = struct_reader.getReaderImpl().getDataField(discriminant_offset); + + if (discriminant == null_discriminant) + nullable_column.insertDefault(); + else + { + auto & nested_column = nullable_column.getNestedColumn(); + nested_serializer->readRow(nested_column, struct_reader, nested_slot_offset); nullable_column.getNullMapData().push_back(0); } } private: std::unique_ptr nested_serializer; - capnp::StructSchema::Field null_field; - capnp::StructSchema::Field nested_field; - size_t nested_fields_size = 0; - capnp::Type nested_capnp_type; + capnp::StructSchema struct_schema; + capnp::_::StructSize struct_size; + UInt32 discriminant_offset; + UInt16 null_discriminant; + UInt16 nested_discriminant; + UInt32 nested_slot_offset; }; class CapnProtoArraySerializer : public ICapnProtoSerializer @@ -788,67 +1043,102 @@ namespace throwCannotConvert(data_type, column_name, capnp_type); auto nested_type = assert_cast(data_type.get())->getNestedType(); - element_type = capnp_type.asList().getElementType(); + list_schema = capnp_type.asList(); + auto element_type = list_schema.getElementType(); + element_size = capnp::elementSizeFor(element_type.which()); if (element_type.isStruct()) - element_struct_fields = element_type.asStruct().getFields().size(); + { + element_is_struct = true; + auto node = element_type.asStruct().getProto().getStruct(); + element_struct_size = capnp::_::StructSize(node.getDataWordCount(), node.getPointerCount()); + } + nested_serializer = createSerializer(nested_type, column_name, capnp_type.asList().getElementType(), settings); } - std::optional writeRow(const ColumnPtr & column, FieldBuilder * field_builder, size_t row_num) override + void writeRow(const ColumnPtr & column, std::unique_ptr & field_builder, capnp::DynamicStruct::Builder & parent_struct_builder, UInt32 slot_offset, size_t row_num) override { - assert(field_builder); - auto & list_builder = assert_cast(*field_builder); const auto * array_column = assert_cast(column.get()); const auto & nested_column = array_column->getDataPtr(); const auto & offsets = array_column->getOffsets(); auto offset = offsets[row_num - 1]; - size_t size = offsets[row_num] - offset; - bool need_nested_builders = list_builder.nested_builders.empty(); - for (unsigned i = 0; i != static_cast(size); ++i) - { - if (need_nested_builders) - { - /// For nested lists we need to initialize nested list builder. - if (element_type.isList()) - { - const auto & nested_offset = checkAndGetColumn(*nested_column)->getOffsets(); - size_t nested_array_size = nested_offset[offset + i] - nested_offset[offset + i - 1]; - list_builder.nested_builders.emplace_back(std::make_unique(list_builder.impl.init(i, static_cast(nested_array_size)))); - } - else if (element_type.isStruct()) - { - list_builder.nested_builders.emplace_back(std::make_unique(list_builder.impl[i], element_struct_fields)); - } - else - { - list_builder.nested_builders.emplace_back(); - } - } + UInt32 size = static_cast(offsets[row_num] - offset); - auto value = nested_serializer->writeRow(nested_column, list_builder.nested_builders[i].get(), offset + i); - if (value) - list_builder.impl.set(i, *value); + if (!field_builder) + { + auto builder_impl = parent_struct_builder.getBuilderImpl(); + capnp::DynamicList::Builder list_builder_impl; + if (element_is_struct) + list_builder_impl = capnp::DynamicList::Builder(list_schema, builder_impl.getPointerField(slot_offset).initStructList(size, element_struct_size)); + else + list_builder_impl = capnp::DynamicList::Builder(list_schema, builder_impl.getPointerField(slot_offset).initList(element_size, size)); + field_builder = std::make_unique(std::move(list_builder_impl), size); } - return std::nullopt; + auto & list_builder = assert_cast(*field_builder); + for (UInt32 i = 0; i != size; ++i) + nested_serializer->writeRow(nested_column, list_builder.nested_builders[i], list_builder.impl, i, offset + i); } - void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + void writeRow(const ColumnPtr & column, std::unique_ptr & field_builder, capnp::DynamicList::Builder & parent_list_builder, UInt32 array_index, size_t row_num) override { - auto list_reader = reader.as(); + const auto * array_column = assert_cast(column.get()); + const auto & nested_column = array_column->getDataPtr(); + const auto & offsets = array_column->getOffsets(); + auto offset = offsets[row_num - 1]; + UInt32 size = static_cast(offsets[row_num] - offset); + + if (!field_builder) + { + auto builder_impl = parent_list_builder.getBuilderImpl(); + capnp::DynamicList::Builder list_builder_impl; + if (element_is_struct) + list_builder_impl = capnp::DynamicList::Builder(list_schema, builder_impl.getPointerElement(array_index).initStructList(size, element_struct_size)); + else + list_builder_impl = capnp::DynamicList::Builder(list_schema, builder_impl.getPointerElement(array_index).initList(element_size, size)); + field_builder = std::make_unique(std::move(list_builder_impl), size); + } + + auto & list_builder = assert_cast(*field_builder); + for (UInt32 i = 0; i != size; ++i) + nested_serializer->writeRow(nested_column, list_builder.nested_builders[i], list_builder.impl, i, offset + i); + } + + void readRow(IColumn & column, const capnp::DynamicStruct::Reader & parent_struct_reader, UInt32 slot_offset) override + { + const auto & reader_impl = parent_struct_reader.getReaderImpl(); + auto list_reader = capnp::DynamicList::Reader(list_schema, reader_impl.getPointerField(slot_offset).getList(element_size, nullptr)); + UInt32 size = list_reader.size(); auto & column_array = assert_cast(column); auto & offsets = column_array.getOffsets(); offsets.push_back(offsets.back() + list_reader.size()); auto & nested_column = column_array.getData(); - for (const auto & nested_reader : list_reader) - nested_serializer->readRow(nested_column, nested_reader); + for (UInt32 i = 0; i != size; ++i) + nested_serializer->readRow(nested_column, list_reader, i); + } + + void readRow(IColumn & column, const capnp::DynamicList::Reader & parent_list_reader, UInt32 array_index) override + { + const auto & reader_impl = parent_list_reader.getReaderImpl(); + auto list_reader = capnp::DynamicList::Reader(list_schema, reader_impl.getPointerElement(array_index).getList(element_size, nullptr)); + UInt32 size = list_reader.size(); + auto & column_array = assert_cast(column); + auto & offsets = column_array.getOffsets(); + offsets.push_back(offsets.back() + list_reader.size()); + + auto & nested_column = column_array.getData(); + for (UInt32 i = 0; i != size; ++i) + nested_serializer->readRow(nested_column, list_reader, i); } private: + capnp::ListSchema list_schema; std::unique_ptr nested_serializer; - capnp::Type element_type; - size_t element_struct_fields; + capnp::ElementSize element_size; + capnp::_::StructSize element_struct_size; + bool element_is_struct = false; + }; class CapnProtoMapSerializer : public ICapnProtoSerializer @@ -869,7 +1159,9 @@ namespace if (!capnp_type.isStruct()) throwCannotConvert(data_type, column_name, capnp_type); - auto struct_schema = capnp_type.asStruct(); + struct_schema = capnp_type.asStruct(); + auto node = struct_schema.getProto().getStruct(); + struct_size = capnp::_::StructSize(node.getDataWordCount(), node.getPointerCount()); if (checkIfStructContainsUnnamedUnion(struct_schema)) throw Exception( @@ -921,43 +1213,70 @@ namespace DataTypes types = {map_type.getKeyType(), map_type.getValueType()}; Names names = {"key", "value"}; auto entries_type = std::make_shared(std::make_shared(types, names)); - entries_field = struct_schema.getFields()[0]; - entries_capnp_type = entries_field.getType(); nested_serializer = createSerializer(entries_type, column_name, field_type, settings); + entries_slot_offset = struct_schema.getFields()[0].getProto().getSlot().getOffset(); } - std::optional writeRow(const ColumnPtr & column, FieldBuilder * field_builder, size_t row_num) override + void writeRow(const ColumnPtr & column, std::unique_ptr & field_builder, capnp::DynamicStruct::Builder & parent_struct_builder, UInt32 slot_offset, size_t row_num) override { - assert(field_builder); + if (!field_builder) + { + auto builder_impl = parent_struct_builder.getBuilderImpl(); + auto struct_builder_impl = capnp::DynamicStruct::Builder(struct_schema, builder_impl.getPointerField(slot_offset).initStruct(struct_size)); + field_builder = std::make_unique(std::move(struct_builder_impl), 1); + } + auto & struct_builder = assert_cast(*field_builder); const auto & entries_column = assert_cast(column.get())->getNestedColumnPtr(); - auto entries_builder = initStructFieldBuilderIfNeeded(entries_column, row_num, struct_builder.impl, entries_field, entries_capnp_type, 0); - nested_serializer->writeRow(entries_column, entries_builder.get(), row_num); - return std::nullopt; + nested_serializer->writeRow(entries_column, struct_builder.field_builders[0], struct_builder.impl, entries_slot_offset, row_num); } - void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + void writeRow(const ColumnPtr & column, std::unique_ptr & field_builder, capnp::DynamicList::Builder & parent_list_builder, UInt32 array_index, size_t row_num) override { - auto struct_reader = reader.as(); + if (!field_builder) + { + auto builder_impl = parent_list_builder.getBuilderImpl(); + auto struct_builder_impl = capnp::DynamicStruct::Builder(struct_schema, builder_impl.getStructElement(array_index)); + field_builder = std::make_unique(std::move(struct_builder_impl), 1); + } + + auto & struct_builder = assert_cast(*field_builder); + const auto & entries_column = assert_cast(column.get())->getNestedColumnPtr(); + nested_serializer->writeRow(entries_column, struct_builder.field_builders[0], struct_builder.impl, entries_slot_offset, row_num); + } + + void readRow(IColumn & column, const capnp::DynamicStruct::Reader & parent_struct_reader, UInt32 slot_offset) override + { + auto reader_impl = parent_struct_reader.getReaderImpl(); + auto struct_reader = capnp::DynamicStruct::Reader(struct_schema, reader_impl.getPointerField(slot_offset).getStruct(nullptr)); auto & entries_column = assert_cast(column).getNestedColumn(); - nested_serializer->readRow(entries_column, struct_reader.get(entries_field)); + nested_serializer->readRow(entries_column, struct_reader, entries_slot_offset); + } + + void readRow(IColumn & column, const capnp::DynamicList::Reader & parent_list_reader, UInt32 array_index) override + { + auto reader_impl = parent_list_reader.getReaderImpl(); + auto struct_reader = capnp::DynamicStruct::Reader(struct_schema, reader_impl.getStructElement(array_index)); + auto & entries_column = assert_cast(column).getNestedColumn(); + nested_serializer->readRow(entries_column, struct_reader, entries_slot_offset); } private: std::unique_ptr nested_serializer; - capnp::StructSchema::Field entries_field; - capnp::Type entries_capnp_type; + capnp::StructSchema struct_schema; + capnp::_::StructSize struct_size; + UInt32 entries_slot_offset; }; class CapnProtoStructureSerializer : public ICapnProtoSerializer { public: - CapnProtoStructureSerializer(const DataTypes & data_types, const Names & names, const capnp::StructSchema & schema, const FormatSettings::CapnProto & settings) + CapnProtoStructureSerializer(const DataTypes & data_types, const Names & names, const capnp::StructSchema & schema, const FormatSettings::CapnProto & settings) : struct_schema(schema) { if (checkIfStructIsNamedUnion(schema) || checkIfStructContainsUnnamedUnion(schema)) throw Exception(ErrorCodes::CAPN_PROTO_BAD_CAST, "Root CapnProto Struct cannot be named union/struct with unnamed union"); - initialize(data_types, names, schema, settings); + initialize(data_types, names, settings); } CapnProtoStructureSerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type, const FormatSettings::CapnProto & settings) @@ -965,7 +1284,7 @@ namespace if (!capnp_type.isStruct()) throwCannotConvert(data_type, column_name, capnp_type); - auto struct_schema = capnp_type.asStruct(); + struct_schema = capnp_type.asStruct(); if (checkIfStructIsNamedUnion(struct_schema) || checkIfStructContainsUnnamedUnion(struct_schema)) throw Exception( @@ -1002,7 +1321,7 @@ namespace try { - initialize(nested_types, nested_names, struct_schema, settings); + initialize(nested_types, nested_names, settings); } catch (Exception & e) { @@ -1011,77 +1330,118 @@ namespace } } - std::optional writeRow(const ColumnPtr & column, FieldBuilder * builder, size_t row_num) override + void writeRow(const ColumnPtr & column, std::unique_ptr & field_builder, capnp::DynamicStruct::Builder & parent_struct_builder, UInt32 slot_offset, size_t row_num) override { - assert(builder); - auto & struct_builder = assert_cast(*builder); + if (!field_builder) + { + auto builder_impl = parent_struct_builder.getBuilderImpl(); + auto struct_builder_impl = capnp::DynamicStruct::Builder(struct_schema, builder_impl.getPointerField(slot_offset).initStruct(struct_size)); + field_builder = std::make_unique(std::move(struct_builder_impl), fields_count); + } + + auto & struct_builder = assert_cast(*field_builder); if (const auto * tuple_column = typeid_cast(column.get())) - writeRow(tuple_column->getColumnsCopy(), struct_builder, row_num); + { + const auto & columns = tuple_column->getColumns(); + for (size_t i = 0; i != columns.size(); ++i) + fields_serializers[i]->writeRow(columns[i], struct_builder.field_builders[fields_indexes[i]], struct_builder.impl, fields_offsets[i], row_num); + } else - writeRow(Columns{column}, struct_builder, row_num); - return std::nullopt; + { + fields_serializers[0]->writeRow(column, struct_builder.field_builders[fields_indexes[0]], struct_builder.impl, fields_offsets[0], row_num); + } + } + + void writeRow(const ColumnPtr & column, std::unique_ptr & field_builder, capnp::DynamicList::Builder & parent_list_builder, UInt32 array_index, size_t row_num) override + { + if (!field_builder) + { + auto builder_impl = parent_list_builder.getBuilderImpl(); + auto struct_builder_impl = capnp::DynamicStruct::Builder(struct_schema, builder_impl.getStructElement(array_index)); + field_builder = std::make_unique(std::move(struct_builder_impl), fields_count); + } + + auto & struct_builder = assert_cast(*field_builder); + if (const auto * tuple_column = typeid_cast(column.get())) + { + const auto & columns = tuple_column->getColumns(); + for (size_t i = 0; i != columns.size(); ++i) + fields_serializers[i]->writeRow(columns[i], struct_builder.field_builders[fields_indexes[i]], struct_builder.impl, fields_offsets[i], row_num); + } + else + { + fields_serializers[0]->writeRow(column, struct_builder.field_builders[fields_indexes[0]], struct_builder.impl, fields_offsets[0], row_num); + } } void writeRow(const Columns & columns, StructBuilder & struct_builder, size_t row_num) { for (size_t i = 0; i != columns.size(); ++i) - { - const auto & field = fields[i]; - size_t field_index = field.getIndex(); - if (likely(!struct_builder.field_builders[field_index])) - struct_builder.field_builders[field_index] = initStructFieldBuilderIfNeeded( - columns[i], row_num, struct_builder.impl, field, fields_types[i], nested_field_sizes[i]); - - auto value = field_serializers[i]->writeRow(columns[i], struct_builder.field_builders[field_index].get(), row_num); - if (value) - struct_builder.impl.set(field, *value); - } + fields_serializers[i]->writeRow(columns[i], struct_builder.field_builders[fields_indexes[i]], struct_builder.impl, fields_offsets[i], row_num); } - void readRow(IColumn & column, const capnp::DynamicValue::Reader & reader) override + void readRow(IColumn & column, const capnp::DynamicStruct::Reader & parent_struct_reader, UInt32 slot_offset) override { - auto struct_reader = reader.as(); + auto reader_impl = parent_struct_reader.getReaderImpl(); + auto struct_reader = capnp::DynamicStruct::Reader(struct_schema, reader_impl.getPointerField(slot_offset).getStruct(nullptr)); if (auto * tuple_column = typeid_cast(&column)) { for (size_t i = 0; i != tuple_column->tupleSize(); ++i) - field_serializers[i]->readRow(tuple_column->getColumn(i), struct_reader.get(fields[i])); + fields_serializers[i]->readRow(tuple_column->getColumn(i), struct_reader, fields_offsets[i]); } else - field_serializers[0]->readRow(column, struct_reader.get(fields[0])); + fields_serializers[0]->readRow(column, struct_reader, fields_offsets[0]); + } + + void readRow(IColumn & column, const capnp::DynamicList::Reader & parent_list_reader, UInt32 array_index) override + { + auto reader_impl = parent_list_reader.getReaderImpl(); + auto struct_reader = capnp::DynamicStruct::Reader(struct_schema, reader_impl.getStructElement(array_index)); + if (auto * tuple_column = typeid_cast(&column)) + { + for (size_t i = 0; i != tuple_column->tupleSize(); ++i) + fields_serializers[i]->readRow(tuple_column->getColumn(i), struct_reader, fields_offsets[i]); + } + else + fields_serializers[0]->readRow(column, struct_reader, fields_offsets[0]); } void readRow(MutableColumns & columns, const capnp::DynamicStruct::Reader & reader) { for (size_t i = 0; i != columns.size(); ++i) - field_serializers[i]->readRow(*columns[i], reader.get(fields[i])); + fields_serializers[i]->readRow(*columns[i], reader, fields_offsets[i]); } private: - void initialize(const DataTypes & data_types, const Names & names, const capnp::StructSchema & schema, const FormatSettings::CapnProto & settings) + void initialize(const DataTypes & data_types, const Names & names, const FormatSettings::CapnProto & settings) { - field_serializers.reserve(data_types.size()); - fields.reserve(data_types.size()); - fields_types.reserve(data_types.size()); - nested_field_sizes.reserve(data_types.size()); + auto node = struct_schema.getProto().getStruct(); + struct_size = capnp::_::StructSize(node.getDataWordCount(), node.getPointerCount()); + fields_count = struct_schema.getFields().size(); + fields_serializers.reserve(data_types.size()); + fields_offsets.reserve(data_types.size()); + fields_indexes.reserve(data_types.size()); for (size_t i = 0; i != data_types.size(); ++i) { auto [field_name, _] = splitFieldName(names[i]); - auto field = findFieldByName(schema, field_name); + auto field = findFieldByName(struct_schema, field_name); if (!field) throw Exception(ErrorCodes::THERE_IS_NO_COLUMN, "Capnproto schema doesn't contain field with name {}", field_name); - fields.push_back(*field); auto capnp_type = field->getType(); - fields_types.push_back(capnp_type); - nested_field_sizes.push_back(capnp_type.isStruct() ? capnp_type.asStruct().getFields().size() : 0); - field_serializers.push_back(createSerializer(data_types[i], names[i], capnp_type, settings)); + fields_serializers.push_back(createSerializer(data_types[i], names[i], capnp_type, settings)); + fields_offsets.push_back(field->getProto().getSlot().getOffset()); + fields_indexes.push_back(field->getIndex()); } } - std::vector> field_serializers; - std::vector fields; - std::vector nested_field_sizes; - std::vector fields_types; + capnp::StructSchema struct_schema; + capnp::_::StructSize struct_size; + size_t fields_count; + std::vector> fields_serializers; + std::vector fields_offsets; + std::vector fields_indexes; + }; std::unique_ptr createSerializer(const DataTypePtr & type, const String & name, const capnp::Type & capnp_type, const FormatSettings::CapnProto & settings) @@ -1116,17 +1476,17 @@ namespace case TypeIndex::UInt64: return createIntegerSerializer(type, name, capnp_type); case TypeIndex::Int128: - return std::make_unique>(type, name, capnp_type); + return std::make_unique>(type, name, capnp_type); case TypeIndex::UInt128: - return std::make_unique>(type, name, capnp_type); + return std::make_unique>(type, name, capnp_type); case TypeIndex::Int256: - return std::make_unique>(type, name, capnp_type); + return std::make_unique>(type, name, capnp_type); case TypeIndex::UInt256: - return std::make_unique>(type, name, capnp_type); + return std::make_unique>(type, name, capnp_type); case TypeIndex::Float32: - return std::make_unique>(type, name, capnp_type); + return createFloatSerializer(type, name, capnp_type); case TypeIndex::Float64: - return std::make_unique>(type, name, capnp_type); + return createFloatSerializer(type, name, capnp_type); case TypeIndex::Date: return std::make_unique(type, name, capnp_type); case TypeIndex::Date32: @@ -1140,15 +1500,15 @@ namespace case TypeIndex::Decimal64: return std::make_unique>(type, name, capnp_type); case TypeIndex::Decimal128: - return std::make_unique>(type, name, capnp_type); + return std::make_unique>(type, name, capnp_type); case TypeIndex::Decimal256: - return std::make_unique>(type, name, capnp_type); + return std::make_unique>(type, name, capnp_type); case TypeIndex::IPv4: return std::make_unique(type, name, capnp_type); case TypeIndex::IPv6: - return std::make_unique(type, name, capnp_type); + return std::make_unique>(type, name, capnp_type); case TypeIndex::UUID: - return std::make_unique(type, name, capnp_type); + return std::make_unique>(type, name, capnp_type); case TypeIndex::Enum8: return std::make_unique>(type, name, capnp_type, settings.enum_comparing_mode); case TypeIndex::Enum16: diff --git a/src/Formats/CapnProtoSerializer.h b/src/Formats/CapnProtoSerializer.h index 692f5e5301f..5bdd1a0e554 100644 --- a/src/Formats/CapnProtoSerializer.h +++ b/src/Formats/CapnProtoSerializer.h @@ -4,6 +4,7 @@ #include #include +#include namespace DB { From 36e8f13242edfd83e3650369b6b71bc9e8fc2e64 Mon Sep 17 00:00:00 2001 From: alekseygolub Date: Thu, 25 May 2023 20:10:02 +0000 Subject: [PATCH 0139/1072] Added docs for feature --- docs/en/operations/settings/settings.md | 26 +++++++++++++++++++ docs/en/sql-reference/table-functions/file.md | 5 ++-- docs/ru/operations/settings/settings.md | 23 ++++++++++++++++ docs/ru/sql-reference/table-functions/file.md | 1 + 4 files changed, 53 insertions(+), 2 deletions(-) diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 2239084a429..df9e8eb2fe2 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -4200,6 +4200,32 @@ Possible values: Default value: `false`. +## rename_files_after_processing + +- **Type:** String + +- **Default value:** Empty string + +This setting allows to specify renaming pattern for files processed by `file` table function. When option is set, all files read by `file` table function will be renamed according to specified pattern with placeholders, only if files processing was successful. + +### Placeholders + +- `%f` — Original filename without extension (e.g., "sample"). +- `%e` — Original file extension with dot (e.g., ".csv"). +- `%t` — Timestamp (in microseconds). +- `%%` — Percentage sign ("%"). + +### Example +- Option: `--rename_files_after_processing="processed_%f_%t%e"` + +- Query: `SELECT * FROM file('sample.csv')` + + +If reading `sample.csv` is successful, file will be renamed to `processed_sample_1683473210851438.csv` + + + + ## function_json_value_return_type_allow_complex Control whether allow to return complex type (such as: struct, array, map) for json_value function. diff --git a/docs/en/sql-reference/table-functions/file.md b/docs/en/sql-reference/table-functions/file.md index 28c2dc9f1f3..577e2e6aa1d 100644 --- a/docs/en/sql-reference/table-functions/file.md +++ b/docs/en/sql-reference/table-functions/file.md @@ -40,7 +40,7 @@ VALUES (1, 2, 3), (3, 2, 1), (1, 3, 2) As a result, the data is written into the file `test.tsv`: ```bash -# cat /var/lib/clickhouse/user_files/test.tsv +# cat /var/lib/clickhouse/user_files/test.tsv 1 2 3 3 2 1 1 3 2 @@ -163,7 +163,7 @@ Query the number of rows in all files of these two directories: SELECT count(*) FROM file('{some,another}_dir/*', 'TSV', 'name String, value UInt32'); ``` -:::note +:::note If your listing of files contains number ranges with leading zeros, use the construction with braces for each digit separately or use `?`. ::: @@ -199,3 +199,4 @@ SELECT count(*) FROM file('big_dir/**/file002', 'CSV', 'name String, value UInt3 **See Also** - [Virtual columns](/docs/en/engines/table-engines/index.md#table_engines-virtual_columns) +- [Rename files after processing](/docs/en/operations/settings/settings.md#rename_files_after_processing) diff --git a/docs/ru/operations/settings/settings.md b/docs/ru/operations/settings/settings.md index fa3ea582c55..7cab607bb3b 100644 --- a/docs/ru/operations/settings/settings.md +++ b/docs/ru/operations/settings/settings.md @@ -4064,3 +4064,26 @@ SELECT sum(number) FROM numbers(10000000000) SETTINGS partial_result_on_first_ca Возможные значения:: `true`, `false` Значение по умолчанию: `false` + +## rename_files_after_processing + +- **Тип:** Строка + +- **Значение по умолчанию:** Пустая строка + +Этот параметр позволяет задать паттерн для переименования файлов, обрабатываемых табличной функцией `file`. Когда опция установлена, все файлы, прочитанные табличной функцией `file`, будут переименованы в соответствии с указанным шаблоном, если обработка и чтение файла завершились успешно. + +### Шаблон +Шаблон поддерживает следующие виды плейсхолдеров: + +- `%f` — Исходное имя файла без расширения (например "sample"). +- `%e` — Оригинальное расширение файла с точкой (например ".csv"). +- `%t` — Текущее время (в микросекундах). +- `%%` — Знак процента ("%"). + +### Пример +- Значение аргумента: `--rename_files_after_processing="processed_%f_%t%e"` + +- Запрос: `SELECT * FROM file('sample.csv')` + +Если чтение и обработка `sample.csv` прошли успешно, файл будет переименован в `processed_sample_1683473210851438.csv`. diff --git a/docs/ru/sql-reference/table-functions/file.md b/docs/ru/sql-reference/table-functions/file.md index 94bc734a8fb..0983c51d954 100644 --- a/docs/ru/sql-reference/table-functions/file.md +++ b/docs/ru/sql-reference/table-functions/file.md @@ -126,3 +126,4 @@ SELECT count(*) FROM file('big_dir/file{0..9}{0..9}{0..9}', 'CSV', 'name String, **Смотрите также** - [Виртуальные столбцы](index.md#table_engines-virtual_columns) +- [Переименование файлов после обработки](/docs/ru/operations/settings/settings.md#rename_files_after_processing) From 42e1e3ae208ed6488b7f30e6e87c88be38b07b17 Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 25 May 2023 20:24:03 +0000 Subject: [PATCH 0140/1072] Fix working with secure socket after async connection --- src/Client/Connection.cpp | 5 ----- src/IO/ReadBufferFromPocoSocket.cpp | 28 +++++++++++++++++++++------- src/IO/ReadBufferFromPocoSocket.h | 2 ++ src/IO/WriteBufferFromPocoSocket.cpp | 27 +++++++++++++++++++++------ src/IO/WriteBufferFromPocoSocket.h | 2 ++ 5 files changed, 46 insertions(+), 18 deletions(-) diff --git a/src/Client/Connection.cpp b/src/Client/Connection.cpp index d39148d3016..451d29d4091 100644 --- a/src/Client/Connection.cpp +++ b/src/Client/Connection.cpp @@ -138,11 +138,6 @@ void Connection::connect(const ConnectionTimeouts & timeouts) socket->impl()->error(err); // Throws an exception socket->setBlocking(true); - -#if USE_SSL - if (static_cast(secure)) - static_cast(socket.get())->completeHandshake(); -#endif } else { diff --git a/src/IO/ReadBufferFromPocoSocket.cpp b/src/IO/ReadBufferFromPocoSocket.cpp index d0fba2c28e8..4ceba347707 100644 --- a/src/IO/ReadBufferFromPocoSocket.cpp +++ b/src/IO/ReadBufferFromPocoSocket.cpp @@ -9,6 +9,7 @@ #include #include #include +#include namespace ProfileEvents { @@ -49,16 +50,18 @@ bool ReadBufferFromPocoSocket::nextImpl() { CurrentMetrics::Increment metric_increment(CurrentMetrics::NetworkReceive); - /// If async_callback is specified, and read will block, run async_callback and try again later. - /// It is expected that file descriptor may be polled externally. - /// Note that receive timeout is not checked here. External code should check it while polling. - while (async_callback && !socket.poll(0, Poco::Net::Socket::SELECT_READ | Poco::Net::Socket::SELECT_ERROR)) - async_callback(socket.impl()->sockfd(), socket.getReceiveTimeout(), AsyncEventTimeoutType::RECEIVE, socket_description, AsyncTaskExecutor::Event::READ | AsyncTaskExecutor::Event::ERROR); - if (internal_buffer.size() > INT_MAX) throw Exception(ErrorCodes::LOGICAL_ERROR, "Buffer overflow"); - bytes_read = socket.impl()->receiveBytes(internal_buffer.begin(), static_cast(internal_buffer.size())); + bytes_read = readFromSocket(); + + /// In case of non-blocking connect for secure socket receiveBytes can return ERR_SSL_WANT_READ, + /// in this case we should call receiveBytes again when socket is ready. + if (socket.secure()) + { + while (bytes_read == Poco::Net::SecureStreamSocket::ERR_SSL_WANT_READ) + bytes_read = readFromSocket(); + } } catch (const Poco::Net::NetException & e) { @@ -86,6 +89,17 @@ bool ReadBufferFromPocoSocket::nextImpl() return true; } +ssize_t ReadBufferFromPocoSocket::readFromSocket() +{ + /// If async_callback is specified, and read will block, run async_callback and try again later. + /// It is expected that file descriptor may be polled externally. + /// Note that receive timeout is not checked here. External code should check it while polling. + while (async_callback && !socket.poll(0, Poco::Net::Socket::SELECT_READ | Poco::Net::Socket::SELECT_ERROR)) + async_callback(socket.impl()->sockfd(), socket.getReceiveTimeout(), AsyncEventTimeoutType::RECEIVE, socket_description, AsyncTaskExecutor::Event::READ | AsyncTaskExecutor::Event::ERROR); + + return socket.impl()->receiveBytes(internal_buffer.begin(), static_cast(internal_buffer.size())); +} + ReadBufferFromPocoSocket::ReadBufferFromPocoSocket(Poco::Net::Socket & socket_, size_t buf_size) : BufferWithOwnMemory(buf_size) , socket(socket_) diff --git a/src/IO/ReadBufferFromPocoSocket.h b/src/IO/ReadBufferFromPocoSocket.h index dab4ac86295..3c4bc424334 100644 --- a/src/IO/ReadBufferFromPocoSocket.h +++ b/src/IO/ReadBufferFromPocoSocket.h @@ -30,6 +30,8 @@ public: void setAsyncCallback(AsyncCallback async_callback_) { async_callback = std::move(async_callback_); } private: + ssize_t readFromSocket(); + AsyncCallback async_callback; std::string socket_description; }; diff --git a/src/IO/WriteBufferFromPocoSocket.cpp b/src/IO/WriteBufferFromPocoSocket.cpp index 039110dfb62..27fc78e5fe3 100644 --- a/src/IO/WriteBufferFromPocoSocket.cpp +++ b/src/IO/WriteBufferFromPocoSocket.cpp @@ -10,6 +10,7 @@ #include #include #include +#include namespace ProfileEvents @@ -62,13 +63,15 @@ void WriteBufferFromPocoSocket::nextImpl() if (size > INT_MAX) throw Exception(ErrorCodes::LOGICAL_ERROR, "Buffer overflow"); - /// If async_callback is specified, and write will block, run async_callback and try again later. - /// It is expected that file descriptor may be polled externally. - /// Note that send timeout is not checked here. External code should check it while polling. - while (async_callback && !socket.poll(0, Poco::Net::Socket::SELECT_WRITE | Poco::Net::Socket::SELECT_ERROR)) - async_callback(socket.impl()->sockfd(), socket.getSendTimeout(), AsyncEventTimeoutType::SEND, socket_description, AsyncTaskExecutor::Event::WRITE | AsyncTaskExecutor::Event::ERROR); + res = writeToSocket(pos, size); - res = socket.impl()->sendBytes(pos, static_cast(size)); + /// In case of non-blocking connect for secure socket sendBytes can return ERR_SSL_WANT_READ, + /// in this case we should call sendBytes again when socket is ready. + if (socket.secure()) + { + while (res == Poco::Net::SecureStreamSocket::ERR_SSL_WANT_WRITE) + res = writeToSocket(pos, size); + } } catch (const Poco::Net::NetException & e) { @@ -95,6 +98,18 @@ void WriteBufferFromPocoSocket::nextImpl() } } +ssize_t WriteBufferFromPocoSocket::writeToSocket(char * data, size_t size) +{ + /// If async_callback is specified, and write will block, run async_callback and try again later. + /// It is expected that file descriptor may be polled externally. + /// Note that send timeout is not checked here. External code should check it while polling. + while (async_callback && !socket.poll(0, Poco::Net::Socket::SELECT_WRITE | Poco::Net::Socket::SELECT_ERROR)) + async_callback(socket.impl()->sockfd(), socket.getSendTimeout(), AsyncEventTimeoutType::SEND, socket_description, AsyncTaskExecutor::Event::WRITE | AsyncTaskExecutor::Event::ERROR); + + return socket.impl()->sendBytes(data, static_cast(size)); + +} + WriteBufferFromPocoSocket::WriteBufferFromPocoSocket(Poco::Net::Socket & socket_, size_t buf_size) : BufferWithOwnMemory(buf_size) , socket(socket_) diff --git a/src/IO/WriteBufferFromPocoSocket.h b/src/IO/WriteBufferFromPocoSocket.h index ecb61020357..0f03e816af5 100644 --- a/src/IO/WriteBufferFromPocoSocket.h +++ b/src/IO/WriteBufferFromPocoSocket.h @@ -35,6 +35,8 @@ protected: Poco::Net::SocketAddress our_address; private: + ssize_t writeToSocket(char * data, size_t size); + AsyncCallback async_callback; std::string socket_description; }; From 1964d1bb7e55c5827837aa4ac083da3c2c8f39db Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Thu, 25 May 2023 22:30:16 +0200 Subject: [PATCH 0141/1072] Fix comment --- src/IO/WriteBufferFromPocoSocket.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/IO/WriteBufferFromPocoSocket.cpp b/src/IO/WriteBufferFromPocoSocket.cpp index 27fc78e5fe3..ed58e37ddee 100644 --- a/src/IO/WriteBufferFromPocoSocket.cpp +++ b/src/IO/WriteBufferFromPocoSocket.cpp @@ -65,7 +65,7 @@ void WriteBufferFromPocoSocket::nextImpl() res = writeToSocket(pos, size); - /// In case of non-blocking connect for secure socket sendBytes can return ERR_SSL_WANT_READ, + /// In case of non-blocking connect for secure socket sendBytes can return ERR_SSL_WANT_WRITE, /// in this case we should call sendBytes again when socket is ready. if (socket.secure()) { From 613568423d7b34a80d9c5a1688865122f1136a07 Mon Sep 17 00:00:00 2001 From: Alexey Gerasimchuk Date: Fri, 26 May 2023 07:49:45 +1000 Subject: [PATCH 0142/1072] Update src/Processors/Formats/Impl/CSVRowInputFormat.cpp Co-authored-by: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> --- src/Processors/Formats/Impl/CSVRowInputFormat.cpp | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp index 9922bd41442..6593567a581 100644 --- a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp @@ -282,15 +282,7 @@ bool CSVFormatReader::readField( bool is_last_file_column, const String & /*column_name*/) { - if (format_settings.csv.trim_whitespaces) [[likely]] - skipWhitespacesAndTabs(*buf); - else if (type->isNullable()) - { - auto nested_type = typeid_cast(type.get())->getNestedType(); - if (!isStringOrFixedString(nested_type)) - skipWhitespacesAndTabs(*buf); - } - else if (!isStringOrFixedString(type)) + if (format_settings.csv.trim_whitespaces || !isStringOrFixedString(RemoveNullable(type))) [[likely]] skipWhitespacesAndTabs(*buf); const bool at_delimiter = !buf->eof() && *buf->position() == format_settings.csv.delimiter; From a9082b24b40a9b09ce057836b4b5d55bf7bb491b Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Fri, 26 May 2023 00:20:20 +0200 Subject: [PATCH 0143/1072] Fix build --- src/IO/ReadBufferFromPocoSocket.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/IO/ReadBufferFromPocoSocket.cpp b/src/IO/ReadBufferFromPocoSocket.cpp index 4ceba347707..11acd3a9136 100644 --- a/src/IO/ReadBufferFromPocoSocket.cpp +++ b/src/IO/ReadBufferFromPocoSocket.cpp @@ -9,7 +9,10 @@ #include #include #include + +#if USE_SSL #include +#endif namespace ProfileEvents { From 67b78829fcc74407418db922f1ef58cfe8e1b6ad Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Fri, 26 May 2023 00:21:14 +0200 Subject: [PATCH 0144/1072] Fix build --- src/IO/ReadBufferFromPocoSocket.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/IO/ReadBufferFromPocoSocket.cpp b/src/IO/ReadBufferFromPocoSocket.cpp index 11acd3a9136..c051478afc5 100644 --- a/src/IO/ReadBufferFromPocoSocket.cpp +++ b/src/IO/ReadBufferFromPocoSocket.cpp @@ -58,6 +58,7 @@ bool ReadBufferFromPocoSocket::nextImpl() bytes_read = readFromSocket(); +#if USE_SSL /// In case of non-blocking connect for secure socket receiveBytes can return ERR_SSL_WANT_READ, /// in this case we should call receiveBytes again when socket is ready. if (socket.secure()) @@ -65,6 +66,7 @@ bool ReadBufferFromPocoSocket::nextImpl() while (bytes_read == Poco::Net::SecureStreamSocket::ERR_SSL_WANT_READ) bytes_read = readFromSocket(); } +#endif } catch (const Poco::Net::NetException & e) { From f03ca41b081630061f57d3c3b1b1938c4ddcb759 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Fri, 26 May 2023 00:21:46 +0200 Subject: [PATCH 0145/1072] Fix build --- src/IO/WriteBufferFromPocoSocket.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/IO/WriteBufferFromPocoSocket.cpp b/src/IO/WriteBufferFromPocoSocket.cpp index ed58e37ddee..e566c228534 100644 --- a/src/IO/WriteBufferFromPocoSocket.cpp +++ b/src/IO/WriteBufferFromPocoSocket.cpp @@ -10,8 +10,10 @@ #include #include #include -#include +#if USE_SSL +#include +#endif namespace ProfileEvents { @@ -65,6 +67,7 @@ void WriteBufferFromPocoSocket::nextImpl() res = writeToSocket(pos, size); +#if USE_SSL /// In case of non-blocking connect for secure socket sendBytes can return ERR_SSL_WANT_WRITE, /// in this case we should call sendBytes again when socket is ready. if (socket.secure()) @@ -72,6 +75,7 @@ void WriteBufferFromPocoSocket::nextImpl() while (res == Poco::Net::SecureStreamSocket::ERR_SSL_WANT_WRITE) res = writeToSocket(pos, size); } +#endif } catch (const Poco::Net::NetException & e) { From 01f3a46cf06d168452a8cbbca76b1c083355d63b Mon Sep 17 00:00:00 2001 From: Alexey Gerasimchuck Date: Thu, 25 May 2023 22:49:36 +0000 Subject: [PATCH 0146/1072] fixed wrong case in removeNullable --- src/Processors/Formats/Impl/CSVRowInputFormat.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp index 6593567a581..8b4dbbffe1d 100644 --- a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp @@ -282,7 +282,7 @@ bool CSVFormatReader::readField( bool is_last_file_column, const String & /*column_name*/) { - if (format_settings.csv.trim_whitespaces || !isStringOrFixedString(RemoveNullable(type))) [[likely]] + if (format_settings.csv.trim_whitespaces || !isStringOrFixedString(removeNullable(type))) [[likely]] skipWhitespacesAndTabs(*buf); const bool at_delimiter = !buf->eof() && *buf->position() == format_settings.csv.delimiter; From 2a2c35e4c11fe1ef245577cb8c1f2cc1db5f9284 Mon Sep 17 00:00:00 2001 From: Aleksei Filatov Date: Thu, 25 May 2023 18:14:16 +0300 Subject: [PATCH 0147/1072] Fix changed IP for https session --- .../Net/include/Poco/Net/HTTPClientSession.h | 3 + src/IO/HTTPCommon.cpp | 29 ++++-- .../configs/listen_host.xml | 3 + .../test_https_replication/test_change_ip.py | 96 +++++++++++++++++++ 4 files changed, 123 insertions(+), 8 deletions(-) create mode 100644 tests/integration/test_https_replication/configs/listen_host.xml create mode 100644 tests/integration/test_https_replication/test_change_ip.py diff --git a/base/poco/Net/include/Poco/Net/HTTPClientSession.h b/base/poco/Net/include/Poco/Net/HTTPClientSession.h index 6f67918d2c8..d495d662f75 100644 --- a/base/poco/Net/include/Poco/Net/HTTPClientSession.h +++ b/base/poco/Net/include/Poco/Net/HTTPClientSession.h @@ -127,6 +127,9 @@ namespace Net void setResolvedHost(std::string resolved_host) { _resolved_host.swap(resolved_host); } + std::string getResolvedHost() const { return _resolved_host; } + /// Returns the resolved IP address of the target HTTP server. + Poco::UInt16 getPort() const; /// Returns the port number of the target HTTP server. diff --git a/src/IO/HTTPCommon.cpp b/src/IO/HTTPCommon.cpp index 4bea646a42b..3ec9b3d0a83 100644 --- a/src/IO/HTTPCommon.cpp +++ b/src/IO/HTTPCommon.cpp @@ -68,7 +68,8 @@ namespace if (https) { #if USE_SSL - String resolved_host = resolve_host ? DNSResolver::instance().resolveHost(host).toString() : host; + /// Cannot resolve host in advance, otherwise SNI won't work in Poco. + /// For more information about SNI, see the https://en.wikipedia.org/wiki/Server_Name_Indication auto https_session = std::make_shared(host, port); if (resolve_host) https_session->setResolvedHost(DNSResolver::instance().resolveHost(host).toString()); @@ -184,6 +185,24 @@ namespace std::mutex mutex; std::unordered_map endpoints_pool; + void updateHostIfIpChanged(Entry & session, const String & new_ip) + { + const auto old_ip = session->getResolvedHost().empty() ? session->getHost() : session->getResolvedHost(); + + if (new_ip != old_ip) + { + session->reset(); + if (session->getResolvedHost().empty()) + { + session->setHost(new_ip); + } + else + { + session->setResolvedHost(new_ip); + } + } + } + protected: HTTPSessionPool() = default; @@ -238,13 +257,7 @@ namespace if (resolve_host) { - /// Host can change IP - const auto ip = DNSResolver::instance().resolveHost(host).toString(); - if (ip != session->getHost()) - { - session->reset(); - session->setHost(ip); - } + updateHostIfIpChanged(session, DNSResolver::instance().resolveHost(host).toString()); } } /// Reset the message, once it has been printed, diff --git a/tests/integration/test_https_replication/configs/listen_host.xml b/tests/integration/test_https_replication/configs/listen_host.xml new file mode 100644 index 00000000000..f94e5c88568 --- /dev/null +++ b/tests/integration/test_https_replication/configs/listen_host.xml @@ -0,0 +1,3 @@ + + :: + diff --git a/tests/integration/test_https_replication/test_change_ip.py b/tests/integration/test_https_replication/test_change_ip.py new file mode 100644 index 00000000000..14fe5351c8d --- /dev/null +++ b/tests/integration/test_https_replication/test_change_ip.py @@ -0,0 +1,96 @@ +import pytest +from helpers.cluster import ClickHouseCluster +from helpers.test_tools import assert_eq_with_retry + +""" +Both ssl_conf.xml and no_ssl_conf.xml have the same port +""" + + +def _fill_nodes(nodes, shard): + for node in nodes: + node.query( + """ + CREATE DATABASE test; + + CREATE TABLE test_table(date Date, id UInt32) + ENGINE = ReplicatedMergeTree('/clickhouse/tables/test{shard}/replicated', '{replica}') PARTITION BY toYYYYMM(date) ORDER BY id; + """.format( + shard=shard, replica=node.name + ) + ) + + +cluster = ClickHouseCluster(__file__) +node1 = cluster.add_instance( + "node1", + main_configs=[ + "configs/remote_servers.xml", + "configs/listen_host.xml", + "configs/ssl_conf.xml", + "configs/server.crt", + "configs/server.key", + "configs/dhparam.pem", + ], + with_zookeeper=True, + ipv6_address="2001:3984:3989::1:1111", +) +node2 = cluster.add_instance( + "node2", + main_configs=[ + "configs/remote_servers.xml", + "configs/listen_host.xml", + "configs/ssl_conf.xml", + "configs/server.crt", + "configs/server.key", + "configs/dhparam.pem", + ], + with_zookeeper=True, + ipv6_address="2001:3984:3989::1:1112", +) + + +@pytest.fixture(scope="module") +def both_https_cluster(): + try: + cluster.start() + + _fill_nodes([node1, node2], 1) + + yield cluster + + finally: + cluster.shutdown() + + +def test_replication_when_node_ip_changed(both_https_cluster): + """ + Test for a bug when replication over HTTPS stops working when the IP of the source replica was changed. + + node1 is a source node + node2 fethes data from node1 + """ + node1.query("truncate table test_table") + node2.query("truncate table test_table") + + # First we check, that normal replication works + node1.query( + "INSERT INTO test_table VALUES ('2022-10-01', 1), ('2022-10-02', 2), ('2022-10-03', 3)" + ) + assert node1.query("SELECT count(*) from test_table") == "3\n" + assert_eq_with_retry(node2, "SELECT count(*) from test_table", "3") + + # We change source node ip + cluster.restart_instance_with_ip_change(node1, "2001:3984:3989::1:7777") + + # Put some data to source node1 + node1.query( + "INSERT INTO test_table VALUES ('2018-10-01', 4), ('2018-10-02', 4), ('2018-10-03', 6)" + ) + # Check that data is placed on node1 + assert node1.query("SELECT count(*) from test_table") == "6\n" + + # drop DNS cache + node2.query("SYSTEM DROP DNS CACHE") + # Data is fetched + assert_eq_with_retry(node2, "SELECT count(*) from test_table", "6") From 03652efe58469351f87e87c0ca47a6789776710f Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Fri, 26 May 2023 12:44:28 +0200 Subject: [PATCH 0148/1072] Add missing include --- src/IO/ReadBufferFromPocoSocket.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/IO/ReadBufferFromPocoSocket.cpp b/src/IO/ReadBufferFromPocoSocket.cpp index c051478afc5..d6790439683 100644 --- a/src/IO/ReadBufferFromPocoSocket.cpp +++ b/src/IO/ReadBufferFromPocoSocket.cpp @@ -10,6 +10,8 @@ #include #include +#include "config.h" + #if USE_SSL #include #endif From 67c8c5c561668f0fca8d5ab2545d2ee82178fbdb Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Fri, 26 May 2023 12:44:43 +0200 Subject: [PATCH 0149/1072] Add missing include --- src/IO/WriteBufferFromPocoSocket.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/IO/WriteBufferFromPocoSocket.cpp b/src/IO/WriteBufferFromPocoSocket.cpp index e566c228534..6e7c67cc054 100644 --- a/src/IO/WriteBufferFromPocoSocket.cpp +++ b/src/IO/WriteBufferFromPocoSocket.cpp @@ -11,6 +11,8 @@ #include #include +#include "config.h" + #if USE_SSL #include #endif From ef9bae50b9cc83a885a6e0f8c0d82a88ee2c791b Mon Sep 17 00:00:00 2001 From: avogar Date: Fri, 26 May 2023 23:11:57 +0000 Subject: [PATCH 0150/1072] Fix bugs in Poco, use true non-blocking IO --- base/poco/Net/src/SocketImpl.cpp | 10 +++- .../include/Poco/Net/SecureSocketImpl.h | 10 ++++ .../include/Poco/Net/SecureStreamSocketImpl.h | 10 ++++ .../NetSSL_OpenSSL/src/SecureSocketImpl.cpp | 10 ++++ .../src/SecureStreamSocketImpl.cpp | 10 ++++ src/Common/checkSSLError.h | 8 +++ src/Common/checkSSLReturnCode.cpp | 29 ++++++++++ src/Common/checkSSLReturnCode.h | 12 ++++ src/IO/ReadBufferFromPocoSocket.cpp | 58 +++++++++---------- src/IO/ReadBufferFromPocoSocket.h | 2 - src/IO/WriteBufferFromPocoSocket.cpp | 58 +++++++++---------- src/IO/WriteBufferFromPocoSocket.h | 2 - 12 files changed, 154 insertions(+), 65 deletions(-) create mode 100644 src/Common/checkSSLError.h create mode 100644 src/Common/checkSSLReturnCode.cpp create mode 100644 src/Common/checkSSLReturnCode.h diff --git a/base/poco/Net/src/SocketImpl.cpp b/base/poco/Net/src/SocketImpl.cpp index 2aba413b322..484b8cfeec3 100644 --- a/base/poco/Net/src/SocketImpl.cpp +++ b/base/poco/Net/src/SocketImpl.cpp @@ -274,7 +274,9 @@ void SocketImpl::shutdown() int SocketImpl::sendBytes(const void* buffer, int length, int flags) { - if (_isBrokenTimeout) + bool blocking = _blocking && (flags & MSG_DONTWAIT) == 0; + + if (_isBrokenTimeout && blocking) { if (_sndTimeout.totalMicroseconds() != 0) { @@ -289,11 +291,13 @@ int SocketImpl::sendBytes(const void* buffer, int length, int flags) if (_sockfd == POCO_INVALID_SOCKET) throw InvalidSocketException(); rc = ::send(_sockfd, reinterpret_cast(buffer), length, flags); } - while (_blocking && rc < 0 && lastError() == POCO_EINTR); + while (blocking && rc < 0 && lastError() == POCO_EINTR); if (rc < 0) { int err = lastError(); - if (err == POCO_EAGAIN || err == POCO_ETIMEDOUT) + if ((err == POCO_EAGAIN || err == POCO_EWOULDBLOCK) && !blocking) + ; + else if (err == POCO_EAGAIN || err == POCO_ETIMEDOUT) throw TimeoutException(); else error(err); diff --git a/base/poco/NetSSL_OpenSSL/include/Poco/Net/SecureSocketImpl.h b/base/poco/NetSSL_OpenSSL/include/Poco/Net/SecureSocketImpl.h index 56c550decfe..49c12b6b45f 100644 --- a/base/poco/NetSSL_OpenSSL/include/Poco/Net/SecureSocketImpl.h +++ b/base/poco/NetSSL_OpenSSL/include/Poco/Net/SecureSocketImpl.h @@ -183,6 +183,16 @@ namespace Net /// Returns true iff a reused session was negotiated during /// the handshake. + virtual void setBlocking(bool flag); + /// Sets the socket in blocking mode if flag is true, + /// disables blocking mode if flag is false. + + virtual bool getBlocking() const; + /// Returns the blocking mode of the socket. + /// This method will only work if the blocking modes of + /// the socket are changed via the setBlocking method! + + protected: void acceptSSL(); /// Assume per-object mutex is locked. diff --git a/base/poco/NetSSL_OpenSSL/include/Poco/Net/SecureStreamSocketImpl.h b/base/poco/NetSSL_OpenSSL/include/Poco/Net/SecureStreamSocketImpl.h index b41043769fe..99e2130d673 100644 --- a/base/poco/NetSSL_OpenSSL/include/Poco/Net/SecureStreamSocketImpl.h +++ b/base/poco/NetSSL_OpenSSL/include/Poco/Net/SecureStreamSocketImpl.h @@ -201,6 +201,16 @@ namespace Net /// Returns true iff a reused session was negotiated during /// the handshake. + virtual void setBlocking(bool flag); + /// Sets the socket in blocking mode if flag is true, + /// disables blocking mode if flag is false. + + virtual bool getBlocking() const; + /// Returns the blocking mode of the socket. + /// This method will only work if the blocking modes of + /// the socket are changed via the setBlocking method! + + protected: void acceptSSL(); /// Performs a SSL server-side handshake. diff --git a/base/poco/NetSSL_OpenSSL/src/SecureSocketImpl.cpp b/base/poco/NetSSL_OpenSSL/src/SecureSocketImpl.cpp index 9631c7a401a..efe25f65909 100644 --- a/base/poco/NetSSL_OpenSSL/src/SecureSocketImpl.cpp +++ b/base/poco/NetSSL_OpenSSL/src/SecureSocketImpl.cpp @@ -629,5 +629,15 @@ bool SecureSocketImpl::sessionWasReused() return false; } +void SecureSocketImpl::setBlocking(bool flag) +{ + _pSocket->setBlocking(flag); +} + +bool SecureSocketImpl::getBlocking() const +{ + return _pSocket->getBlocking(); +} + } } // namespace Poco::Net diff --git a/base/poco/NetSSL_OpenSSL/src/SecureStreamSocketImpl.cpp b/base/poco/NetSSL_OpenSSL/src/SecureStreamSocketImpl.cpp index aa1a96e1585..c00dd43b2ed 100644 --- a/base/poco/NetSSL_OpenSSL/src/SecureStreamSocketImpl.cpp +++ b/base/poco/NetSSL_OpenSSL/src/SecureStreamSocketImpl.cpp @@ -237,5 +237,15 @@ int SecureStreamSocketImpl::completeHandshake() return _impl.completeHandshake(); } +bool SecureStreamSocketImpl::getBlocking() const +{ + return _impl.getBlocking(); +} + +void SecureStreamSocketImpl::setBlocking(bool flag) +{ + _impl.setBlocking(flag); +} + } } // namespace Poco::Net diff --git a/src/Common/checkSSLError.h b/src/Common/checkSSLError.h new file mode 100644 index 00000000000..05bca9f8b5f --- /dev/null +++ b/src/Common/checkSSLError.h @@ -0,0 +1,8 @@ +// +// Created by Павел Круглов on 27/05/2023. +// + +#ifndef CLICKHOUSE_CHECKSSLERROR_H +#define CLICKHOUSE_CHECKSSLERROR_H + +#endif //CLICKHOUSE_CHECKSSLERROR_H diff --git a/src/Common/checkSSLReturnCode.cpp b/src/Common/checkSSLReturnCode.cpp new file mode 100644 index 00000000000..8916a25e19c --- /dev/null +++ b/src/Common/checkSSLReturnCode.cpp @@ -0,0 +1,29 @@ +#include +#include "config.h" + +#if USE_SSL +#include +#endif + +namespace DB +{ + +bool checkSSLWantRead(ssize_t res) +{ +#if USE_SSL + return res == Poco::Net::SecureStreamSocket::ERR_SSL_WANT_READ; +#else + return false; +#endif +} + +bool checkSSLWantWrite(ssize_t res) +{ +#if USE_SSL + return res == Poco::Net::SecureStreamSocket::ERR_SSL_WANT_WRITE; +#else + return false; +#endif +} + +} diff --git a/src/Common/checkSSLReturnCode.h b/src/Common/checkSSLReturnCode.h new file mode 100644 index 00000000000..f30564137aa --- /dev/null +++ b/src/Common/checkSSLReturnCode.h @@ -0,0 +1,12 @@ +#pragma once + +namespace DB +{ + +/// Check if ret is ERR_SSL_WANT_READ. +bool checkSSLWantRead(ssize_t ret); + +/// CHeck if ret is ERR_SSL_WANT_WRITE. +bool checkSSLWantWrite(ssize_t ret); + +} diff --git a/src/IO/ReadBufferFromPocoSocket.cpp b/src/IO/ReadBufferFromPocoSocket.cpp index d6790439683..ff72dc5386c 100644 --- a/src/IO/ReadBufferFromPocoSocket.cpp +++ b/src/IO/ReadBufferFromPocoSocket.cpp @@ -9,12 +9,7 @@ #include #include #include - -#include "config.h" - -#if USE_SSL -#include -#endif +#include namespace ProfileEvents { @@ -27,7 +22,6 @@ namespace CurrentMetrics extern const Metric NetworkReceive; } - namespace DB { namespace ErrorCodes @@ -38,14 +32,13 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } - bool ReadBufferFromPocoSocket::nextImpl() { ssize_t bytes_read = 0; Stopwatch watch; SCOPE_EXIT({ - // / NOTE: it is quite inaccurate on high loads since the thread could be replaced by another one + /// NOTE: it is quite inaccurate on high loads since the thread could be replaced by another one ProfileEvents::increment(ProfileEvents::NetworkReceiveElapsedMicroseconds, watch.elapsedMicroseconds()); ProfileEvents::increment(ProfileEvents::NetworkReceiveBytes, bytes_read); }); @@ -58,17 +51,35 @@ bool ReadBufferFromPocoSocket::nextImpl() if (internal_buffer.size() > INT_MAX) throw Exception(ErrorCodes::LOGICAL_ERROR, "Buffer overflow"); - bytes_read = readFromSocket(); - -#if USE_SSL - /// In case of non-blocking connect for secure socket receiveBytes can return ERR_SSL_WANT_READ, - /// in this case we should call receiveBytes again when socket is ready. - if (socket.secure()) + /// If async_callback is specified, set socket to non-blocking mode + /// and try to read data from it, if socket is not ready for reading, + /// run async_callback and try again later. + /// It is expected that file descriptor may be polled externally. + /// Note that send timeout is not checked here. External code should check it while polling. + if (async_callback) { - while (bytes_read == Poco::Net::SecureStreamSocket::ERR_SSL_WANT_READ) - bytes_read = readFromSocket(); + socket.setBlocking(false); + SCOPE_EXIT(socket.setBlocking(true)); + bool secure = socket.secure(); + bytes_read = socket.impl()->receiveBytes(internal_buffer.begin(), static_cast(internal_buffer.size())); + + /// Check EAGAIN and ERR_SSL_WANT_READ/ERR_SSL_WANT_WRITE for secure socket (reading from secure socket can write too). + while (bytes_read < 0 && (errno == EAGAIN || (secure && (checkSSLWantRead(bytes_read) || checkSSLWantWrite(bytes_read))))) + { + /// In case of ERR_SSL_WANT_WRITE we should wait for socket to be ready for writing, otherwise - for reading. + if (secure && checkSSLWantWrite(bytes_read)) + async_callback(socket.impl()->sockfd(), socket.getSendTimeout(), AsyncEventTimeoutType::SEND, socket_description, AsyncTaskExecutor::Event::WRITE | AsyncTaskExecutor::Event::ERROR); + else + async_callback(socket.impl()->sockfd(), socket.getReceiveTimeout(), AsyncEventTimeoutType::RECEIVE, socket_description, AsyncTaskExecutor::Event::READ | AsyncTaskExecutor::Event::ERROR); + + /// Try to read again. + bytes_read = socket.impl()->receiveBytes(internal_buffer.begin(), static_cast(internal_buffer.size())); + } + } + else + { + bytes_read = socket.impl()->receiveBytes(internal_buffer.begin(), static_cast(internal_buffer.size())); } -#endif } catch (const Poco::Net::NetException & e) { @@ -96,17 +107,6 @@ bool ReadBufferFromPocoSocket::nextImpl() return true; } -ssize_t ReadBufferFromPocoSocket::readFromSocket() -{ - /// If async_callback is specified, and read will block, run async_callback and try again later. - /// It is expected that file descriptor may be polled externally. - /// Note that receive timeout is not checked here. External code should check it while polling. - while (async_callback && !socket.poll(0, Poco::Net::Socket::SELECT_READ | Poco::Net::Socket::SELECT_ERROR)) - async_callback(socket.impl()->sockfd(), socket.getReceiveTimeout(), AsyncEventTimeoutType::RECEIVE, socket_description, AsyncTaskExecutor::Event::READ | AsyncTaskExecutor::Event::ERROR); - - return socket.impl()->receiveBytes(internal_buffer.begin(), static_cast(internal_buffer.size())); -} - ReadBufferFromPocoSocket::ReadBufferFromPocoSocket(Poco::Net::Socket & socket_, size_t buf_size) : BufferWithOwnMemory(buf_size) , socket(socket_) diff --git a/src/IO/ReadBufferFromPocoSocket.h b/src/IO/ReadBufferFromPocoSocket.h index 3c4bc424334..dab4ac86295 100644 --- a/src/IO/ReadBufferFromPocoSocket.h +++ b/src/IO/ReadBufferFromPocoSocket.h @@ -30,8 +30,6 @@ public: void setAsyncCallback(AsyncCallback async_callback_) { async_callback = std::move(async_callback_); } private: - ssize_t readFromSocket(); - AsyncCallback async_callback; std::string socket_description; }; diff --git a/src/IO/WriteBufferFromPocoSocket.cpp b/src/IO/WriteBufferFromPocoSocket.cpp index 6e7c67cc054..df1041f0056 100644 --- a/src/IO/WriteBufferFromPocoSocket.cpp +++ b/src/IO/WriteBufferFromPocoSocket.cpp @@ -10,12 +10,7 @@ #include #include #include - -#include "config.h" - -#if USE_SSL -#include -#endif +#include namespace ProfileEvents { @@ -28,7 +23,6 @@ namespace CurrentMetrics extern const Metric NetworkSend; } - namespace DB { @@ -40,7 +34,6 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } - void WriteBufferFromPocoSocket::nextImpl() { if (!offset()) @@ -67,17 +60,36 @@ void WriteBufferFromPocoSocket::nextImpl() if (size > INT_MAX) throw Exception(ErrorCodes::LOGICAL_ERROR, "Buffer overflow"); - res = writeToSocket(pos, size); - -#if USE_SSL - /// In case of non-blocking connect for secure socket sendBytes can return ERR_SSL_WANT_WRITE, - /// in this case we should call sendBytes again when socket is ready. - if (socket.secure()) + /// If async_callback is specified, set socket to non-blocking mode + /// and try to write data to it, if socket is not ready for writing, + /// run async_callback and try again later. + /// It is expected that file descriptor may be polled externally. + /// Note that send timeout is not checked here. External code should check it while polling. + if (async_callback) { - while (res == Poco::Net::SecureStreamSocket::ERR_SSL_WANT_WRITE) - res = writeToSocket(pos, size); + socket.setBlocking(false); + /// Set socket to blocking mode at the end. + SCOPE_EXIT(socket.setBlocking(true)); + bool secure = socket.secure(); + res = socket.impl()->sendBytes(pos, static_cast(size)); + + /// Check EAGAIN and ERR_SSL_WANT_WRITE/ERR_SSL_WANT_READ for secure socket (writing to secure socket can read too). + while (res < 0 && (errno == EAGAIN || (secure && (checkSSLWantRead(res) || checkSSLWantWrite(res))))) + { + /// In case of ERR_SSL_WANT_READ we should wait for socket to be ready for reading, otherwise - for writing. + if (secure && checkSSLWantRead(res)) + async_callback(socket.impl()->sockfd(), socket.getReceiveTimeout(), AsyncEventTimeoutType::RECEIVE, socket_description, AsyncTaskExecutor::Event::READ | AsyncTaskExecutor::Event::ERROR); + else + async_callback(socket.impl()->sockfd(), socket.getSendTimeout(), AsyncEventTimeoutType::SEND, socket_description, AsyncTaskExecutor::Event::WRITE | AsyncTaskExecutor::Event::ERROR); + + /// Try to write again. + res = socket.impl()->sendBytes(pos, static_cast(size)); + } + } + else + { + res = socket.impl()->sendBytes(pos, static_cast(size)); } -#endif } catch (const Poco::Net::NetException & e) { @@ -104,18 +116,6 @@ void WriteBufferFromPocoSocket::nextImpl() } } -ssize_t WriteBufferFromPocoSocket::writeToSocket(char * data, size_t size) -{ - /// If async_callback is specified, and write will block, run async_callback and try again later. - /// It is expected that file descriptor may be polled externally. - /// Note that send timeout is not checked here. External code should check it while polling. - while (async_callback && !socket.poll(0, Poco::Net::Socket::SELECT_WRITE | Poco::Net::Socket::SELECT_ERROR)) - async_callback(socket.impl()->sockfd(), socket.getSendTimeout(), AsyncEventTimeoutType::SEND, socket_description, AsyncTaskExecutor::Event::WRITE | AsyncTaskExecutor::Event::ERROR); - - return socket.impl()->sendBytes(data, static_cast(size)); - -} - WriteBufferFromPocoSocket::WriteBufferFromPocoSocket(Poco::Net::Socket & socket_, size_t buf_size) : BufferWithOwnMemory(buf_size) , socket(socket_) diff --git a/src/IO/WriteBufferFromPocoSocket.h b/src/IO/WriteBufferFromPocoSocket.h index 0f03e816af5..ecb61020357 100644 --- a/src/IO/WriteBufferFromPocoSocket.h +++ b/src/IO/WriteBufferFromPocoSocket.h @@ -35,8 +35,6 @@ protected: Poco::Net::SocketAddress our_address; private: - ssize_t writeToSocket(char * data, size_t size); - AsyncCallback async_callback; std::string socket_description; }; From 806b7fc18b0e526f39f6b9f04faa2ee90d4136ee Mon Sep 17 00:00:00 2001 From: Lucas Chang Date: Sat, 27 May 2023 17:39:21 +0800 Subject: [PATCH 0151/1072] fix datetime64 index querying --- src/Interpreters/convertFieldToType.cpp | 8 ++++++++ .../1_stateful/00178_query_datetime64_index.reference | 1 + tests/queries/1_stateful/00178_query_datetime64_index.sql | 8 ++++++++ 3 files changed, 17 insertions(+) create mode 100644 tests/queries/1_stateful/00178_query_datetime64_index.reference create mode 100644 tests/queries/1_stateful/00178_query_datetime64_index.sql diff --git a/src/Interpreters/convertFieldToType.cpp b/src/Interpreters/convertFieldToType.cpp index dc61e748db6..9d82cade814 100644 --- a/src/Interpreters/convertFieldToType.cpp +++ b/src/Interpreters/convertFieldToType.cpp @@ -192,6 +192,14 @@ Field convertFieldToTypeImpl(const Field & src, const IDataType & type, const ID { return static_cast(type).getTimeZone().fromDayNum(DayNum(src.get())); } + else if(which_type.isDateTime64() && which_from_type.isDate()) { + const auto value = static_cast(type).getTimeZone().fromDayNum(DayNum(src.get())); + return DecimalUtils::decimalFromComponentsWithMultiplier(value, 0, 1); + } + else if(which_type.isDateTime64() && which_from_type.isDate()) { + const auto value = static_cast(type).getTimeZone().fromDayNum(DayNum(src.get())); + return DecimalUtils::decimalFromComponentsWithMultiplier(value, 0, 1); + } else if (type.isValueRepresentedByNumber() && src.getType() != Field::Types::String) { if (which_type.isUInt8()) return convertNumericType(src, type); diff --git a/tests/queries/1_stateful/00178_query_datetime64_index.reference b/tests/queries/1_stateful/00178_query_datetime64_index.reference new file mode 100644 index 00000000000..3d26cfc1d66 --- /dev/null +++ b/tests/queries/1_stateful/00178_query_datetime64_index.reference @@ -0,0 +1 @@ +2023-05-27 00:00:00.000 diff --git a/tests/queries/1_stateful/00178_query_datetime64_index.sql b/tests/queries/1_stateful/00178_query_datetime64_index.sql new file mode 100644 index 00000000000..0e46401c2d3 --- /dev/null +++ b/tests/queries/1_stateful/00178_query_datetime64_index.sql @@ -0,0 +1,8 @@ +DROP TABLE IF EXISTS datetime64_index_tbl; + +CREATE TABLE datetime64_index_tbl(ts DateTime64(3, 'UTC')) ENGINE=MergeTree ORDER BY ts; +INSERT INTO datetime64_index_tbl(ts) VALUES(toDateTime64('2023-05-27 00:00:00', 3, 'UTC')); + +SELECT ts from datetime64_index_tbl where ts < toDate('2023-05-28'); + +DROP TABLE datetime64_index_tbl; \ No newline at end of file From 41c599b86252d7219adc8e0adc455d93d5e7a5ee Mon Sep 17 00:00:00 2001 From: Lucas Chang Date: Sat, 27 May 2023 17:59:41 +0800 Subject: [PATCH 0152/1072] fix date32 --- src/Interpreters/convertFieldToType.cpp | 2 +- tests/queries/1_stateful/00178_query_datetime64_index.reference | 1 + tests/queries/1_stateful/00178_query_datetime64_index.sql | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/Interpreters/convertFieldToType.cpp b/src/Interpreters/convertFieldToType.cpp index 9d82cade814..9bea9f6851f 100644 --- a/src/Interpreters/convertFieldToType.cpp +++ b/src/Interpreters/convertFieldToType.cpp @@ -196,7 +196,7 @@ Field convertFieldToTypeImpl(const Field & src, const IDataType & type, const ID const auto value = static_cast(type).getTimeZone().fromDayNum(DayNum(src.get())); return DecimalUtils::decimalFromComponentsWithMultiplier(value, 0, 1); } - else if(which_type.isDateTime64() && which_from_type.isDate()) { + else if(which_type.isDateTime64() && which_from_type.isDate32()) { const auto value = static_cast(type).getTimeZone().fromDayNum(DayNum(src.get())); return DecimalUtils::decimalFromComponentsWithMultiplier(value, 0, 1); } diff --git a/tests/queries/1_stateful/00178_query_datetime64_index.reference b/tests/queries/1_stateful/00178_query_datetime64_index.reference index 3d26cfc1d66..f24667b36da 100644 --- a/tests/queries/1_stateful/00178_query_datetime64_index.reference +++ b/tests/queries/1_stateful/00178_query_datetime64_index.reference @@ -1 +1,2 @@ 2023-05-27 00:00:00.000 +2023-05-27 00:00:00.000 diff --git a/tests/queries/1_stateful/00178_query_datetime64_index.sql b/tests/queries/1_stateful/00178_query_datetime64_index.sql index 0e46401c2d3..1e1d1561af7 100644 --- a/tests/queries/1_stateful/00178_query_datetime64_index.sql +++ b/tests/queries/1_stateful/00178_query_datetime64_index.sql @@ -4,5 +4,6 @@ CREATE TABLE datetime64_index_tbl(ts DateTime64(3, 'UTC')) ENGINE=MergeTree ORDE INSERT INTO datetime64_index_tbl(ts) VALUES(toDateTime64('2023-05-27 00:00:00', 3, 'UTC')); SELECT ts from datetime64_index_tbl where ts < toDate('2023-05-28'); +SELECT ts from datetime64_index_tbl where ts < toDate32('2023-05-28'); DROP TABLE datetime64_index_tbl; \ No newline at end of file From a8122861213071026430af636801587679b3e710 Mon Sep 17 00:00:00 2001 From: Lucas Chang Date: Sat, 27 May 2023 18:04:19 +0800 Subject: [PATCH 0153/1072] new line eof --- tests/queries/1_stateful/00178_query_datetime64_index.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/1_stateful/00178_query_datetime64_index.sql b/tests/queries/1_stateful/00178_query_datetime64_index.sql index 1e1d1561af7..68542bc0524 100644 --- a/tests/queries/1_stateful/00178_query_datetime64_index.sql +++ b/tests/queries/1_stateful/00178_query_datetime64_index.sql @@ -6,4 +6,4 @@ INSERT INTO datetime64_index_tbl(ts) VALUES(toDateTime64('2023-05-27 00:00:00', SELECT ts from datetime64_index_tbl where ts < toDate('2023-05-28'); SELECT ts from datetime64_index_tbl where ts < toDate32('2023-05-28'); -DROP TABLE datetime64_index_tbl; \ No newline at end of file +DROP TABLE datetime64_index_tbl; From 93415789d7e69c2874171941f56eb92a6bc80bbf Mon Sep 17 00:00:00 2001 From: Lucas Chang Date: Sun, 28 May 2023 10:40:34 +0800 Subject: [PATCH 0154/1072] format --- src/Interpreters/convertFieldToType.cpp | 4 ++-- tests/queries/1_stateful/00178_query_datetime64_index.sql | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Interpreters/convertFieldToType.cpp b/src/Interpreters/convertFieldToType.cpp index 9bea9f6851f..ba02eefabc9 100644 --- a/src/Interpreters/convertFieldToType.cpp +++ b/src/Interpreters/convertFieldToType.cpp @@ -192,11 +192,11 @@ Field convertFieldToTypeImpl(const Field & src, const IDataType & type, const ID { return static_cast(type).getTimeZone().fromDayNum(DayNum(src.get())); } - else if(which_type.isDateTime64() && which_from_type.isDate()) { + else if (which_type.isDateTime64() && which_from_type.isDate()) { const auto value = static_cast(type).getTimeZone().fromDayNum(DayNum(src.get())); return DecimalUtils::decimalFromComponentsWithMultiplier(value, 0, 1); } - else if(which_type.isDateTime64() && which_from_type.isDate32()) { + else if (which_type.isDateTime64() && which_from_type.isDate32()) { const auto value = static_cast(type).getTimeZone().fromDayNum(DayNum(src.get())); return DecimalUtils::decimalFromComponentsWithMultiplier(value, 0, 1); } diff --git a/tests/queries/1_stateful/00178_query_datetime64_index.sql b/tests/queries/1_stateful/00178_query_datetime64_index.sql index 68542bc0524..a3fb594db73 100644 --- a/tests/queries/1_stateful/00178_query_datetime64_index.sql +++ b/tests/queries/1_stateful/00178_query_datetime64_index.sql @@ -3,7 +3,7 @@ DROP TABLE IF EXISTS datetime64_index_tbl; CREATE TABLE datetime64_index_tbl(ts DateTime64(3, 'UTC')) ENGINE=MergeTree ORDER BY ts; INSERT INTO datetime64_index_tbl(ts) VALUES(toDateTime64('2023-05-27 00:00:00', 3, 'UTC')); -SELECT ts from datetime64_index_tbl where ts < toDate('2023-05-28'); -SELECT ts from datetime64_index_tbl where ts < toDate32('2023-05-28'); +SELECT ts FROM datetime64_index_tbl WHERE ts < toDate('2023-05-28'); +SELECT ts FROM datetime64_index_tbl WHERE ts < toDate32('2023-05-28'); DROP TABLE datetime64_index_tbl; From 0d873d50a33d4837d4010be0385df23c8be9b204 Mon Sep 17 00:00:00 2001 From: Lucas Chang Date: Sun, 28 May 2023 10:49:36 +0800 Subject: [PATCH 0155/1072] format --- src/Interpreters/convertFieldToType.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/Interpreters/convertFieldToType.cpp b/src/Interpreters/convertFieldToType.cpp index ba02eefabc9..79251d944c8 100644 --- a/src/Interpreters/convertFieldToType.cpp +++ b/src/Interpreters/convertFieldToType.cpp @@ -192,11 +192,13 @@ Field convertFieldToTypeImpl(const Field & src, const IDataType & type, const ID { return static_cast(type).getTimeZone().fromDayNum(DayNum(src.get())); } - else if (which_type.isDateTime64() && which_from_type.isDate()) { + else if (which_type.isDateTime64() && which_from_type.isDate()) + { const auto value = static_cast(type).getTimeZone().fromDayNum(DayNum(src.get())); return DecimalUtils::decimalFromComponentsWithMultiplier(value, 0, 1); } - else if (which_type.isDateTime64() && which_from_type.isDate32()) { + else if (which_type.isDateTime64() && which_from_type.isDate32()) + { const auto value = static_cast(type).getTimeZone().fromDayNum(DayNum(src.get())); return DecimalUtils::decimalFromComponentsWithMultiplier(value, 0, 1); } From 65c34e73e7010bfb5c6298ed2a5bedcc434e7aa7 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Sun, 28 May 2023 15:45:40 +0200 Subject: [PATCH 0156/1072] Clean --- src/Common/checkSSLError.h | 8 -------- 1 file changed, 8 deletions(-) delete mode 100644 src/Common/checkSSLError.h diff --git a/src/Common/checkSSLError.h b/src/Common/checkSSLError.h deleted file mode 100644 index 05bca9f8b5f..00000000000 --- a/src/Common/checkSSLError.h +++ /dev/null @@ -1,8 +0,0 @@ -// -// Created by Павел Круглов on 27/05/2023. -// - -#ifndef CLICKHOUSE_CHECKSSLERROR_H -#define CLICKHOUSE_CHECKSSLERROR_H - -#endif //CLICKHOUSE_CHECKSSLERROR_H From 3aa795355f946e6638804124b03d0724bf1ba37b Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Sun, 28 May 2023 15:46:32 +0200 Subject: [PATCH 0157/1072] Fix build --- src/Common/checkSSLReturnCode.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Common/checkSSLReturnCode.h b/src/Common/checkSSLReturnCode.h index f30564137aa..77e51625daf 100644 --- a/src/Common/checkSSLReturnCode.h +++ b/src/Common/checkSSLReturnCode.h @@ -1,4 +1,5 @@ #pragma once +#include namespace DB { From be62c8faba774c8c594a6d0368efba0c7a540576 Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Mon, 29 May 2023 15:27:39 +0200 Subject: [PATCH 0158/1072] fix build --- src/Storages/MergeTree/MergeTreeReadPool.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/MergeTreeReadPool.cpp b/src/Storages/MergeTree/MergeTreeReadPool.cpp index 05e6dad157b..ba8c2c6385f 100644 --- a/src/Storages/MergeTree/MergeTreeReadPool.cpp +++ b/src/Storages/MergeTree/MergeTreeReadPool.cpp @@ -82,7 +82,7 @@ MergeTreeReadPool::MergeTreeReadPool( const auto min_bytes_per_task = settings.merge_tree_min_bytes_per_task_for_remote_reading; const auto avg_mark_bytes = std::max(total_compressed_bytes / total_marks, 1); /// We're taking min here because number of tasks shouldn't be too low - it will make task stealing impossible. - const auto heuristic_min_marks = std::min(total_marks / threads_, min_bytes_per_task / avg_mark_bytes); + const auto heuristic_min_marks = std::min(total_marks / threads_, min_bytes_per_task / avg_mark_bytes); if (heuristic_min_marks > min_marks_for_concurrent_read) { min_marks_for_concurrent_read = heuristic_min_marks; From 437880d4c15a5cc9e3a541db58939ee7c507e10c Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Mon, 29 May 2023 15:52:50 +0200 Subject: [PATCH 0159/1072] Enable `enable_memory_bound_merging_of_aggregation_results` by default --- src/Core/Settings.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 607be1522db..10d94f8d218 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -143,7 +143,7 @@ class IColumn; M(UInt64, group_by_two_level_threshold_bytes, 50000000, "From what size of the aggregation state in bytes, a two-level aggregation begins to be used. 0 - the threshold is not set. Two-level aggregation is used when at least one of the thresholds is triggered.", 0) \ M(Bool, distributed_aggregation_memory_efficient, true, "Is the memory-saving mode of distributed aggregation enabled.", 0) \ M(UInt64, aggregation_memory_efficient_merge_threads, 0, "Number of threads to use for merge intermediate aggregation results in memory efficient mode. When bigger, then more memory is consumed. 0 means - same as 'max_threads'.", 0) \ - M(Bool, enable_memory_bound_merging_of_aggregation_results, false, "Enable memory bound merging strategy for aggregation. Set it to true only if all nodes of your clusters have versions >= 22.12.", 0) \ + M(Bool, enable_memory_bound_merging_of_aggregation_results, true, "Enable memory bound merging strategy for aggregation.", 0) \ M(Bool, enable_positional_arguments, true, "Enable positional arguments in ORDER BY, GROUP BY and LIMIT BY", 0) \ M(Bool, enable_extended_results_for_datetime_functions, false, "Enable date functions like toLastDayOfMonth return Date32 results (instead of Date results) for Date32/DateTime64 arguments.", 0) \ \ From e10f951467390f5deb67b7fe12e38f546059ec9a Mon Sep 17 00:00:00 2001 From: Nikolay Degterinsky Date: Mon, 29 May 2023 19:09:07 +0000 Subject: [PATCH 0160/1072] Fix IS (NOT) NULL operator priority --- src/Parsers/ExpressionListParsers.cpp | 66 +++++++++---------- .../02752_is_null_priority.reference | 16 +++++ .../0_stateless/02752_is_null_priority.sql | 1 + 3 files changed, 50 insertions(+), 33 deletions(-) create mode 100644 tests/queries/0_stateless/02752_is_null_priority.reference create mode 100644 tests/queries/0_stateless/02752_is_null_priority.sql diff --git a/src/Parsers/ExpressionListParsers.cpp b/src/Parsers/ExpressionListParsers.cpp index 1477f3d000d..8903b2b02b0 100644 --- a/src/Parsers/ExpressionListParsers.cpp +++ b/src/Parsers/ExpressionListParsers.cpp @@ -2332,44 +2332,44 @@ const std::vector> ParserExpressionImpl::o {":", Operator("if", 3, 3, OperatorType::FinishIf)}, {"OR", Operator("or", 3, 2, OperatorType::Mergeable)}, {"AND", Operator("and", 4, 2, OperatorType::Mergeable)}, - {"BETWEEN", Operator("", 6, 0, OperatorType::StartBetween)}, - {"NOT BETWEEN", Operator("", 6, 0, OperatorType::StartNotBetween)}, - {"==", Operator("equals", 8, 2, OperatorType::Comparison)}, - {"!=", Operator("notEquals", 8, 2, OperatorType::Comparison)}, - {"<>", Operator("notEquals", 8, 2, OperatorType::Comparison)}, - {"<=", Operator("lessOrEquals", 8, 2, OperatorType::Comparison)}, - {">=", Operator("greaterOrEquals", 8, 2, OperatorType::Comparison)}, - {"<", Operator("less", 8, 2, OperatorType::Comparison)}, - {">", Operator("greater", 8, 2, OperatorType::Comparison)}, - {"=", Operator("equals", 8, 2, OperatorType::Comparison)}, - {"LIKE", Operator("like", 8, 2)}, - {"ILIKE", Operator("ilike", 8, 2)}, - {"NOT LIKE", Operator("notLike", 8, 2)}, - {"NOT ILIKE", Operator("notILike", 8, 2)}, - {"REGEXP", Operator("match", 8, 2)}, - {"IN", Operator("in", 8, 2)}, - {"NOT IN", Operator("notIn", 8, 2)}, - {"GLOBAL IN", Operator("globalIn", 8, 2)}, - {"GLOBAL NOT IN", Operator("globalNotIn", 8, 2)}, - {"||", Operator("concat", 9, 2, OperatorType::Mergeable)}, - {"+", Operator("plus", 10, 2)}, - {"-", Operator("minus", 10, 2)}, - {"*", Operator("multiply", 11, 2)}, - {"/", Operator("divide", 11, 2)}, - {"%", Operator("modulo", 11, 2)}, - {"MOD", Operator("modulo", 11, 2)}, - {"DIV", Operator("intDiv", 11, 2)}, - {".", Operator("tupleElement", 13, 2, OperatorType::TupleElement)}, - {"[", Operator("arrayElement", 13, 2, OperatorType::ArrayElement)}, - {"::", Operator("CAST", 13, 2, OperatorType::Cast)}, - {"IS NULL", Operator("isNull", 13, 1, OperatorType::IsNull)}, - {"IS NOT NULL", Operator("isNotNull", 13, 1, OperatorType::IsNull)}, + {"IS NULL", Operator("isNull", 6, 1, OperatorType::IsNull)}, + {"IS NOT NULL", Operator("isNotNull", 6, 1, OperatorType::IsNull)}, + {"BETWEEN", Operator("", 7, 0, OperatorType::StartBetween)}, + {"NOT BETWEEN", Operator("", 7, 0, OperatorType::StartNotBetween)}, + {"==", Operator("equals", 9, 2, OperatorType::Comparison)}, + {"!=", Operator("notEquals", 9, 2, OperatorType::Comparison)}, + {"<>", Operator("notEquals", 9, 2, OperatorType::Comparison)}, + {"<=", Operator("lessOrEquals", 9, 2, OperatorType::Comparison)}, + {">=", Operator("greaterOrEquals", 9, 2, OperatorType::Comparison)}, + {"<", Operator("less", 9, 2, OperatorType::Comparison)}, + {">", Operator("greater", 9, 2, OperatorType::Comparison)}, + {"=", Operator("equals", 9, 2, OperatorType::Comparison)}, + {"LIKE", Operator("like", 9, 2)}, + {"ILIKE", Operator("ilike", 9, 2)}, + {"NOT LIKE", Operator("notLike", 9, 2)}, + {"NOT ILIKE", Operator("notILike", 9, 2)}, + {"REGEXP", Operator("match", 9, 2)}, + {"IN", Operator("in", 9, 2)}, + {"NOT IN", Operator("notIn", 9, 2)}, + {"GLOBAL IN", Operator("globalIn", 9, 2)}, + {"GLOBAL NOT IN", Operator("globalNotIn", 9, 2)}, + {"||", Operator("concat", 10, 2, OperatorType::Mergeable)}, + {"+", Operator("plus", 11, 2)}, + {"-", Operator("minus", 11, 2)}, + {"*", Operator("multiply", 12, 2)}, + {"/", Operator("divide", 12, 2)}, + {"%", Operator("modulo", 12, 2)}, + {"MOD", Operator("modulo", 12, 2)}, + {"DIV", Operator("intDiv", 12, 2)}, + {".", Operator("tupleElement", 14, 2, OperatorType::TupleElement)}, + {"[", Operator("arrayElement", 14, 2, OperatorType::ArrayElement)}, + {"::", Operator("CAST", 14, 2, OperatorType::Cast)}, }; const std::vector> ParserExpressionImpl::unary_operators_table { {"NOT", Operator("not", 5, 1)}, - {"-", Operator("negate", 12, 1)} + {"-", Operator("negate", 13, 1)} }; const Operator ParserExpressionImpl::finish_between_operator("", 7, 0, OperatorType::FinishBetween); diff --git a/tests/queries/0_stateless/02752_is_null_priority.reference b/tests/queries/0_stateless/02752_is_null_priority.reference new file mode 100644 index 00000000000..54606882ddc --- /dev/null +++ b/tests/queries/0_stateless/02752_is_null_priority.reference @@ -0,0 +1,16 @@ +SelectWithUnionQuery (children 1) + ExpressionList (children 1) + SelectQuery (children 1) + ExpressionList (children 2) + Function isNull (children 1) + ExpressionList (children 1) + Function multiply (children 1) + ExpressionList (children 2) + Identifier a + Identifier b + Function isNotNull (children 1) + ExpressionList (children 1) + Function multiply (children 1) + ExpressionList (children 2) + Identifier a + Identifier b diff --git a/tests/queries/0_stateless/02752_is_null_priority.sql b/tests/queries/0_stateless/02752_is_null_priority.sql new file mode 100644 index 00000000000..a0a9741e752 --- /dev/null +++ b/tests/queries/0_stateless/02752_is_null_priority.sql @@ -0,0 +1 @@ +EXPLAIN AST SELECT a * b IS NULL, a * b IS NOT NULL; From 954e76cb7a432081df8865a2988749fec3b950e4 Mon Sep 17 00:00:00 2001 From: Lucas Chang Date: Tue, 30 May 2023 09:38:50 +0800 Subject: [PATCH 0161/1072] use getScaleMultiplier --- src/Interpreters/convertFieldToType.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/Interpreters/convertFieldToType.cpp b/src/Interpreters/convertFieldToType.cpp index 79251d944c8..1ea9ecd7e00 100644 --- a/src/Interpreters/convertFieldToType.cpp +++ b/src/Interpreters/convertFieldToType.cpp @@ -194,13 +194,15 @@ Field convertFieldToTypeImpl(const Field & src, const IDataType & type, const ID } else if (which_type.isDateTime64() && which_from_type.isDate()) { - const auto value = static_cast(type).getTimeZone().fromDayNum(DayNum(src.get())); - return DecimalUtils::decimalFromComponentsWithMultiplier(value, 0, 1); + const DataTypeDateTime64 & data_type_date_time64 = static_cast(type); + const Int64 value = data_type_date_time64.getTimeZone().fromDayNum(DayNum(src.get())); + return DecimalUtils::decimalFromComponentsWithMultiplier(value, 0, data_type_date_time64.getScaleMultiplier()); } else if (which_type.isDateTime64() && which_from_type.isDate32()) { - const auto value = static_cast(type).getTimeZone().fromDayNum(DayNum(src.get())); - return DecimalUtils::decimalFromComponentsWithMultiplier(value, 0, 1); + const DataTypeDateTime64 & data_type_date_time64 = static_cast(type); + const Int64 value = data_type_date_time64.getTimeZone().fromDayNum(DayNum(src.get())); + return DecimalUtils::decimalFromComponentsWithMultiplier(value, 0, data_type_date_time64.getScaleMultiplier()); } else if (type.isValueRepresentedByNumber() && src.getType() != Field::Types::String) { From ecea1ac090c9ac1749951b409f32e4911a9dc005 Mon Sep 17 00:00:00 2001 From: Val Doroshchuk Date: Thu, 25 May 2023 14:25:08 +0200 Subject: [PATCH 0162/1072] Fix crash when Pool::Entry::disconnect() is called Many Pool::Entry objects can keep the same pointer to Pool::Connection. If Pool::Entry::disconnect() is called on one such object, Pool::removeConnection() is called to remove Pool::Connection from the pool, where connection->ref_count is cleared and connection->removed_from_pool is set. Next Pool::Entry::~Entry() calls decrementRefCount() with 1. const auto ref_count = data->ref_count.fetch_sub(1); where data->ref_count will be negative, since it was cleared 2. checks removed_from_pool and deletes Pool::Connection but there might be multiple Entry objects still keep pointer to this Pool::Connection Suggesting not to clear ref_count on disconnect() and delete Pool::Connection only on the last Pool::Entry is being destroyed. Fixes ea375ef9890f18be7038f66da2d731010ed4462f --- src/Common/mysqlxx/Pool.cpp | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/src/Common/mysqlxx/Pool.cpp b/src/Common/mysqlxx/Pool.cpp index 6438d76cc3a..d10889d1f97 100644 --- a/src/Common/mysqlxx/Pool.cpp +++ b/src/Common/mysqlxx/Pool.cpp @@ -40,14 +40,10 @@ void Pool::Entry::decrementRefCount() { /// We were the last user of this thread, deinitialize it mysql_thread_end(); - } - else if (data->removed_from_pool) - { - /// data->ref_count == 0 in case we removed connection from pool (see Pool::removeConnection). - chassert(ref_count == 0); /// In Pool::Entry::disconnect() we remove connection from the list of pool's connections. /// So now we must deallocate the memory. - ::delete data; + if (data->removed_from_pool) + ::delete data; } } @@ -234,11 +230,8 @@ void Pool::removeConnection(Connection* connection) std::lock_guard lock(mutex); if (connection) { - if (connection->ref_count > 0) - { + if (!connection->removed_from_pool) connection->conn.disconnect(); - connection->ref_count = 0; - } connections.remove(connection); connection->removed_from_pool = true; } From fe82d2bbe24a98bf2192796f47a9b1e2b5d40b3a Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Tue, 30 May 2023 12:58:37 +0200 Subject: [PATCH 0163/1072] Fix build --- src/Common/checkSSLReturnCode.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Common/checkSSLReturnCode.cpp b/src/Common/checkSSLReturnCode.cpp index 8916a25e19c..bc87af1d37d 100644 --- a/src/Common/checkSSLReturnCode.cpp +++ b/src/Common/checkSSLReturnCode.cpp @@ -8,7 +8,7 @@ namespace DB { -bool checkSSLWantRead(ssize_t res) +bool checkSSLWantRead([[maybe_unused]] ssize_t res) { #if USE_SSL return res == Poco::Net::SecureStreamSocket::ERR_SSL_WANT_READ; @@ -17,7 +17,7 @@ bool checkSSLWantRead(ssize_t res) #endif } -bool checkSSLWantWrite(ssize_t res) +bool checkSSLWantWrite([[maybe_unused]] ssize_t res) { #if USE_SSL return res == Poco::Net::SecureStreamSocket::ERR_SSL_WANT_WRITE; From c25980bcf4ecae8aaec15b75421b3a187b410ab2 Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Tue, 30 May 2023 12:02:44 +0000 Subject: [PATCH 0164/1072] Trying to fix toDateOrDefault() --- src/Functions/FunctionsConversion.h | 7 +++++++ .../01746_convert_type_with_default.reference | 4 ++++ .../0_stateless/01746_convert_type_with_default.sql | 10 ++++++++++ 3 files changed, 21 insertions(+) diff --git a/src/Functions/FunctionsConversion.h b/src/Functions/FunctionsConversion.h index 940585d6d57..d3676349318 100644 --- a/src/Functions/FunctionsConversion.h +++ b/src/Functions/FunctionsConversion.h @@ -2882,6 +2882,13 @@ private: return true; } + if constexpr (IsDataTypeNumber && IsDataTypeDateOrDateTime) + { + result_column = ConvertImpl::execute( + arguments, result_type, input_rows_count); + return true; + } + return false; }); diff --git a/tests/queries/0_stateless/01746_convert_type_with_default.reference b/tests/queries/0_stateless/01746_convert_type_with_default.reference index 9ebef9c4a8d..ec2a826982f 100644 --- a/tests/queries/0_stateless/01746_convert_type_with_default.reference +++ b/tests/queries/0_stateless/01746_convert_type_with_default.reference @@ -20,5 +20,9 @@ 2 -1 -2 +2023-05-30 +2023-05-30 +2023-05-30 14:38:20 +2023-05-30 14:38:20 61f0c404-5cb3-11e7-907b-a6006ad3dba0 59f0c404-5cb3-11e7-907b-a6006ad3dba0 diff --git a/tests/queries/0_stateless/01746_convert_type_with_default.sql b/tests/queries/0_stateless/01746_convert_type_with_default.sql index 0881e911466..9d7873081e5 100644 --- a/tests/queries/0_stateless/01746_convert_type_with_default.sql +++ b/tests/queries/0_stateless/01746_convert_type_with_default.sql @@ -26,5 +26,15 @@ select toUInt256OrDefault('1xx', cast(2 as UInt256)); select toInt256OrDefault('-1', cast(-2 as Int256)); select toInt256OrDefault('-1xx', cast(-2 as Int256)); +select toDateOrDefault('2023-05-30'); +select toDateOrDefault(19507); + +select toDateTimeOrDefault('2023-05-30 14:38:20'); +select toDateTimeOrDefault(1685457500); + SELECT toUUIDOrDefault('61f0c404-5cb3-11e7-907b-a6006ad3dba0', cast('59f0c404-5cb3-11e7-907b-a6006ad3dba0' as UUID)); SELECT toUUIDOrDefault('-----61f0c404-5cb3-11e7-907b-a6006ad3dba0', cast('59f0c404-5cb3-11e7-907b-a6006ad3dba0' as UUID)); + + + + From 391e61549d59d338d475732704023092e34faf4a Mon Sep 17 00:00:00 2001 From: Nikolay Degterinsky Date: Tue, 30 May 2023 12:18:56 +0000 Subject: [PATCH 0165/1072] Fixes --- src/Parsers/ExpressionListParsers.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/Parsers/ExpressionListParsers.cpp b/src/Parsers/ExpressionListParsers.cpp index 8903b2b02b0..18a6de5b4f6 100644 --- a/src/Parsers/ExpressionListParsers.cpp +++ b/src/Parsers/ExpressionListParsers.cpp @@ -2332,8 +2332,8 @@ const std::vector> ParserExpressionImpl::o {":", Operator("if", 3, 3, OperatorType::FinishIf)}, {"OR", Operator("or", 3, 2, OperatorType::Mergeable)}, {"AND", Operator("and", 4, 2, OperatorType::Mergeable)}, - {"IS NULL", Operator("isNull", 6, 1, OperatorType::IsNull)}, - {"IS NOT NULL", Operator("isNotNull", 6, 1, OperatorType::IsNull)}, + {"IS NULL", Operator("isNull", 6, 1, OperatorType::IsNull)}, + {"IS NOT NULL", Operator("isNotNull", 6, 1, OperatorType::IsNull)}, {"BETWEEN", Operator("", 7, 0, OperatorType::StartBetween)}, {"NOT BETWEEN", Operator("", 7, 0, OperatorType::StartNotBetween)}, {"==", Operator("equals", 9, 2, OperatorType::Comparison)}, @@ -2353,7 +2353,7 @@ const std::vector> ParserExpressionImpl::o {"NOT IN", Operator("notIn", 9, 2)}, {"GLOBAL IN", Operator("globalIn", 9, 2)}, {"GLOBAL NOT IN", Operator("globalNotIn", 9, 2)}, - {"||", Operator("concat", 10, 2, OperatorType::Mergeable)}, + {"||", Operator("concat", 10, 2, OperatorType::Mergeable)}, {"+", Operator("plus", 11, 2)}, {"-", Operator("minus", 11, 2)}, {"*", Operator("multiply", 12, 2)}, @@ -2372,7 +2372,7 @@ const std::vector> ParserExpressionImpl::u {"-", Operator("negate", 13, 1)} }; -const Operator ParserExpressionImpl::finish_between_operator("", 7, 0, OperatorType::FinishBetween); +const Operator ParserExpressionImpl::finish_between_operator("", 8, 0, OperatorType::FinishBetween); const std::array ParserExpressionImpl::overlapping_operators_to_skip { @@ -2392,6 +2392,7 @@ bool ParserExpressionImpl::parse(std::unique_ptr start, IParser::Pos & po { if (!layers.back()->parse(pos, expected, next)) break; + if (layers.back()->isFinished()) { if (layers.size() == 1) From 092b06f32eb7e4e8b0c4f379632d560810393299 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Tue, 30 May 2023 14:36:47 +0200 Subject: [PATCH 0166/1072] Fix special build --- src/Common/checkSSLReturnCode.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Common/checkSSLReturnCode.cpp b/src/Common/checkSSLReturnCode.cpp index bc87af1d37d..353c287813d 100644 --- a/src/Common/checkSSLReturnCode.cpp +++ b/src/Common/checkSSLReturnCode.cpp @@ -8,7 +8,7 @@ namespace DB { -bool checkSSLWantRead([[maybe_unused]] ssize_t res) +bool checkSSLWantRead([[maybe_unused]] ssize_t ret) { #if USE_SSL return res == Poco::Net::SecureStreamSocket::ERR_SSL_WANT_READ; @@ -17,7 +17,7 @@ bool checkSSLWantRead([[maybe_unused]] ssize_t res) #endif } -bool checkSSLWantWrite([[maybe_unused]] ssize_t res) +bool checkSSLWantWrite([[maybe_unused]] ssize_t ret) { #if USE_SSL return res == Poco::Net::SecureStreamSocket::ERR_SSL_WANT_WRITE; From 93cf34320c12d17554d047497928cac1c177937b Mon Sep 17 00:00:00 2001 From: Vasily Nemkov Date: Tue, 30 May 2023 16:10:52 +0200 Subject: [PATCH 0167/1072] Fixed convertFieldToType case of converting Date and Date32 to DateTime64 Field Also added a unit-test that clarifies implementation --- src/Interpreters/convertFieldToType.cpp | 16 +- .../tests/gtest_convertFieldToType.cpp | 185 ++++++++++++++++++ 2 files changed, 195 insertions(+), 6 deletions(-) create mode 100644 src/Interpreters/tests/gtest_convertFieldToType.cpp diff --git a/src/Interpreters/convertFieldToType.cpp b/src/Interpreters/convertFieldToType.cpp index 1ea9ecd7e00..ff09175f898 100644 --- a/src/Interpreters/convertFieldToType.cpp +++ b/src/Interpreters/convertFieldToType.cpp @@ -194,15 +194,19 @@ Field convertFieldToTypeImpl(const Field & src, const IDataType & type, const ID } else if (which_type.isDateTime64() && which_from_type.isDate()) { - const DataTypeDateTime64 & data_type_date_time64 = static_cast(type); - const Int64 value = data_type_date_time64.getTimeZone().fromDayNum(DayNum(src.get())); - return DecimalUtils::decimalFromComponentsWithMultiplier(value, 0, data_type_date_time64.getScaleMultiplier()); + const auto & date_time64_type = static_cast(type); + const auto value = date_time64_type.getTimeZone().fromDayNum(DayNum(src.get())); + return DecimalField( + DecimalUtils::decimalFromComponentsWithMultiplier(value, 0, date_time64_type.getScaleMultiplier()), + date_time64_type.getScale()); } else if (which_type.isDateTime64() && which_from_type.isDate32()) { - const DataTypeDateTime64 & data_type_date_time64 = static_cast(type); - const Int64 value = data_type_date_time64.getTimeZone().fromDayNum(DayNum(src.get())); - return DecimalUtils::decimalFromComponentsWithMultiplier(value, 0, data_type_date_time64.getScaleMultiplier()); + const auto & date_time64_type = static_cast(type); + const auto value = date_time64_type.getTimeZone().fromDayNum(ExtendedDayNum(static_cast(src.get()))); + return DecimalField( + DecimalUtils::decimalFromComponentsWithMultiplier(value, 0, date_time64_type.getScaleMultiplier()), + date_time64_type.getScale()); } else if (type.isValueRepresentedByNumber() && src.getType() != Field::Types::String) { diff --git a/src/Interpreters/tests/gtest_convertFieldToType.cpp b/src/Interpreters/tests/gtest_convertFieldToType.cpp new file mode 100644 index 00000000000..5421c192ac7 --- /dev/null +++ b/src/Interpreters/tests/gtest_convertFieldToType.cpp @@ -0,0 +1,185 @@ +#include +#include +#include +#include +#include +#include +#include + +#include +#include "base/Decimal.h" +#include "base/types.h" +#include "gtest/gtest.h" + +using namespace DB; + +struct ConvertFieldToTypeTestParams +{ + const char * from_type; // MUST NOT BE NULL + const Field from_value; + const char * to_type; // MUST NOT BE NULL + const std::optional expected_value; +}; + +std::ostream & operator << (std::ostream & ostr, const ConvertFieldToTypeTestParams & params) +{ + return ostr << "{" + << "\n\tfrom_type : " << params.from_type + << "\n\tfrom_value : " << params.from_value + << "\n\tto_type : " << params.to_type + << "\n\texpected : " << (params.expected_value ? *params.expected_value : Field()) + << "\n}"; +} + +class ConvertFieldToTypeTest : public ::testing::TestWithParam +{}; + +TEST_P(ConvertFieldToTypeTest, convert) +{ + const auto & params = GetParam(); + + ASSERT_NE(nullptr, params.from_type); + ASSERT_NE(nullptr, params.to_type); + + const auto & type_factory = DataTypeFactory::instance(); + const auto from_type = type_factory.get(params.from_type); + const auto to_type = type_factory.get(params.to_type); + + if (params.expected_value) + { + const auto result = convertFieldToType(params.from_value, *to_type, from_type.get()); + EXPECT_EQ(*params.expected_value, result); + } + else + { + EXPECT_ANY_THROW(convertFieldToType(params.from_value, *to_type, from_type.get())); + } +} + +// Basically nuber of seconds in a day, works for UTC here +const long long int Day = 24 * 60 * 60; + +// 123 is arbitrary value here + +INSTANTIATE_TEST_SUITE_P( + DateToDateTime64, + ConvertFieldToTypeTest, + ::testing::ValuesIn(std::initializer_list{ + // min value of Date + { + "Date", + Field(0), + "DateTime64(0, 'UTC')", + DecimalField(DateTime64(0), 0) + }, + // Max value of Date + { + "Date", + Field(std::numeric_limits::max()), + "DateTime64(0, 'UTC')", + DecimalField(DateTime64(std::numeric_limits::max() * Day), 0) + }, + // check that scale is respected + { + "Date", + Field(123), + "DateTime64(0, 'UTC')", + DecimalField(DateTime64(123 * Day), 0) + }, + { + "Date", + Field(1), + "DateTime64(1, 'UTC')", + DecimalField(DateTime64(Day * 10), 1) + }, + { + "Date", + Field(123), + "DateTime64(3, 'UTC')", + DecimalField(DateTime64(123 * Day * 1000), 3) + }, + { + "Date", + Field(123), + "DateTime64(6, 'UTC')", + DecimalField(DateTime64(123 * Day * 1'000'000), 6) + }, + }) +); + +INSTANTIATE_TEST_SUITE_P( + Date32ToDateTime64, + ConvertFieldToTypeTest, + ::testing::ValuesIn(std::initializer_list{ + // min value of Date32: 1st Jan 1900 (see DATE_LUT_MIN_YEAR) + { + "Date32", + Field(-25'567), + "DateTime64(0, 'UTC')", + DecimalField(DateTime64(-25'567 * Day), 0) + }, + // max value of Date32: 31 Dec 2299 (see DATE_LUT_MAX_YEAR) + { + "Date32", + Field(120'529), + "DateTime64(0, 'UTC')", + DecimalField(DateTime64(120'529 * Day), 0) + }, + // check that scale is respected + { + "Date32", + Field(123), + "DateTime64(0, 'UTC')", + DecimalField(DateTime64(123 * Day), 0) + }, + { + "Date32", + Field(123), + "DateTime64(1, 'UTC')", + DecimalField(DateTime64(123 * Day * 10), 1) + }, + { + "Date32", + Field(123), + "DateTime64(3, 'UTC')", + DecimalField(DateTime64(123 * Day * 1000), 3) + }, + { + "Date32", + Field(123), + "DateTime64(6, 'UTC')", + DecimalField(DateTime64(123 * Day * 1'000'000), 6) + } + }) + ); + +INSTANTIATE_TEST_SUITE_P( + DateTimeToDateTime64, + ConvertFieldToTypeTest, + ::testing::ValuesIn(std::initializer_list{ + { + "DateTime", + Field(1), + "DateTime64(0, 'UTC')", + DecimalField(DateTime64(1), 0) + }, + { + "DateTime", + Field(1), + "DateTime64(1, 'UTC')", + DecimalField(DateTime64(1'0), 1) + }, + { + "DateTime", + Field(123), + "DateTime64(3, 'UTC')", + DecimalField(DateTime64(123'000), 3) + }, + { + "DateTime", + Field(123), + "DateTime64(6, 'UTC')", + DecimalField(DateTime64(123'000'000), 6) + }, + }) +); From 1c904ecc8ef5886ec90221f52c02758a26a5b81e Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Tue, 30 May 2023 16:25:17 +0200 Subject: [PATCH 0168/1072] Fix typo --- src/Common/checkSSLReturnCode.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Common/checkSSLReturnCode.cpp b/src/Common/checkSSLReturnCode.cpp index 353c287813d..ffb912da18a 100644 --- a/src/Common/checkSSLReturnCode.cpp +++ b/src/Common/checkSSLReturnCode.cpp @@ -11,7 +11,7 @@ namespace DB bool checkSSLWantRead([[maybe_unused]] ssize_t ret) { #if USE_SSL - return res == Poco::Net::SecureStreamSocket::ERR_SSL_WANT_READ; + return ret == Poco::Net::SecureStreamSocket::ERR_SSL_WANT_READ; #else return false; #endif @@ -20,7 +20,7 @@ bool checkSSLWantRead([[maybe_unused]] ssize_t ret) bool checkSSLWantWrite([[maybe_unused]] ssize_t ret) { #if USE_SSL - return res == Poco::Net::SecureStreamSocket::ERR_SSL_WANT_WRITE; + return ret == Poco::Net::SecureStreamSocket::ERR_SSL_WANT_WRITE; #else return false; #endif From 6c9b7a710c083041b25159109e8cf786c5875ba0 Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Tue, 30 May 2023 14:46:44 +0000 Subject: [PATCH 0169/1072] Added more tests for toDateOrDefault/toDateTimeOrDefault --- .../01746_convert_type_with_default.reference | 8 ++++++++ .../01746_convert_type_with_default.sql | 14 +++++++++----- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/tests/queries/0_stateless/01746_convert_type_with_default.reference b/tests/queries/0_stateless/01746_convert_type_with_default.reference index ec2a826982f..235a88157c8 100644 --- a/tests/queries/0_stateless/01746_convert_type_with_default.reference +++ b/tests/queries/0_stateless/01746_convert_type_with_default.reference @@ -20,8 +20,16 @@ 2 -1 -2 +1970-01-01 2023-05-30 2023-05-30 +2023-05-30 +1970-01-01 +2023-05-30 +2023-05-30 +1970-01-01 +2023-05-30 14:38:20 +2023-05-30 14:38:20 2023-05-30 14:38:20 2023-05-30 14:38:20 61f0c404-5cb3-11e7-907b-a6006ad3dba0 diff --git a/tests/queries/0_stateless/01746_convert_type_with_default.sql b/tests/queries/0_stateless/01746_convert_type_with_default.sql index 9d7873081e5..18b5ae60920 100644 --- a/tests/queries/0_stateless/01746_convert_type_with_default.sql +++ b/tests/queries/0_stateless/01746_convert_type_with_default.sql @@ -26,15 +26,19 @@ select toUInt256OrDefault('1xx', cast(2 as UInt256)); select toInt256OrDefault('-1', cast(-2 as Int256)); select toInt256OrDefault('-1xx', cast(-2 as Int256)); +select toDateOrDefault('2020-0x-02'); select toDateOrDefault('2023-05-30'); +select toDateOrDefault('2023-05-30', '2000-01-01'::Date); +select toDateOrDefault('2020-0x-02', '2023-05-30'::Date); +select toDateOrDefault(-1); select toDateOrDefault(19507); +select toDateOrDefault(19507, '2000-01-01'::Date); +select toDateOrDefault(-1, '2000-01-01'::Date); select toDateTimeOrDefault('2023-05-30 14:38:20'); +select toDateTimeOrDefault('2023-05-30 14:38:20', 'UTC'); +select toDateTimeOrDefault('s2023', 'UTC', '2023-05-30 14:38:20'::DateTime('UTC')); select toDateTimeOrDefault(1685457500); SELECT toUUIDOrDefault('61f0c404-5cb3-11e7-907b-a6006ad3dba0', cast('59f0c404-5cb3-11e7-907b-a6006ad3dba0' as UUID)); -SELECT toUUIDOrDefault('-----61f0c404-5cb3-11e7-907b-a6006ad3dba0', cast('59f0c404-5cb3-11e7-907b-a6006ad3dba0' as UUID)); - - - - +SELECT toUUIDOrDefault('-----61f0c404-5cb3-11e7-907b-a6006ad3dba0', cast('59f0c404-5cb3-11e7-907b-a6006ad3dba0' as UUID)); \ No newline at end of file From c8bb1f64ad21dea5ba63fa8f2ea8434d90f9e823 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Tue, 30 May 2023 18:46:49 +0200 Subject: [PATCH 0170/1072] fix --- src/Storages/StorageReplicatedMergeTree.cpp | 3 +++ tests/integration/test_lost_part/test.py | 12 ++++++++---- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index e71f5217c2b..35f75880ced 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -5469,6 +5469,7 @@ void StorageReplicatedMergeTree::alter( if (mutation_znode) { LOG_DEBUG(log, "Metadata changes applied. Will wait for data changes."); + merge_selecting_task->schedule(); waitMutation(*mutation_znode, query_context->getSettingsRef().alter_sync); LOG_DEBUG(log, "Data changes applied."); } @@ -6620,6 +6621,8 @@ void StorageReplicatedMergeTree::mutate(const MutationCommands & commands, Conte throw Coordination::Exception("Unable to create a mutation znode", rc); } + merge_selecting_task->schedule(); + waitMutation(mutation_entry.znode_name, query_context->getSettingsRef().mutations_sync); } diff --git a/tests/integration/test_lost_part/test.py b/tests/integration/test_lost_part/test.py index 44cd19fd1fb..0bc24268040 100644 --- a/tests/integration/test_lost_part/test.py +++ b/tests/integration/test_lost_part/test.py @@ -42,7 +42,8 @@ def test_lost_part_same_replica(start_cluster): for node in [node1, node2]: node.query( f"CREATE TABLE mt0 (id UInt64, date Date) ENGINE ReplicatedMergeTree('/clickhouse/tables/t', '{node.name}') ORDER BY tuple() PARTITION BY date " - "SETTINGS cleanup_delay_period=1, cleanup_delay_period_random_add=1, cleanup_thread_preferred_points_per_iteration=0" + "SETTINGS cleanup_delay_period=1, cleanup_delay_period_random_add=1, cleanup_thread_preferred_points_per_iteration=0," + "merge_selecting_sleep_ms=100, max_merge_selecting_sleep_ms=1000" ) node1.query("SYSTEM STOP MERGES mt0") @@ -109,7 +110,8 @@ def test_lost_part_other_replica(start_cluster): for node in [node1, node2]: node.query( f"CREATE TABLE mt1 (id UInt64) ENGINE ReplicatedMergeTree('/clickhouse/tables/t1', '{node.name}') ORDER BY tuple() " - "SETTINGS cleanup_delay_period=1, cleanup_delay_period_random_add=1, cleanup_thread_preferred_points_per_iteration=0" + "SETTINGS cleanup_delay_period=1, cleanup_delay_period_random_add=1, cleanup_thread_preferred_points_per_iteration=0," + "merge_selecting_sleep_ms=100, max_merge_selecting_sleep_ms=1000" ) node1.query("SYSTEM STOP MERGES mt1") @@ -178,7 +180,8 @@ def test_lost_part_mutation(start_cluster): for node in [node1, node2]: node.query( f"CREATE TABLE mt2 (id UInt64) ENGINE ReplicatedMergeTree('/clickhouse/tables/t2', '{node.name}') ORDER BY tuple() " - "SETTINGS cleanup_delay_period=1, cleanup_delay_period_random_add=1, cleanup_thread_preferred_points_per_iteration=0" + "SETTINGS cleanup_delay_period=1, cleanup_delay_period_random_add=1, cleanup_thread_preferred_points_per_iteration=0," + "merge_selecting_sleep_ms=100, max_merge_selecting_sleep_ms=1000" ) node1.query("SYSTEM STOP MERGES mt2") @@ -241,7 +244,8 @@ def test_lost_last_part(start_cluster): for node in [node1, node2]: node.query( f"CREATE TABLE mt3 (id UInt64, p String) ENGINE ReplicatedMergeTree('/clickhouse/tables/t3', '{node.name}') " - "ORDER BY tuple() PARTITION BY p SETTINGS cleanup_delay_period=1, cleanup_delay_period_random_add=1, cleanup_thread_preferred_points_per_iteration=0" + "ORDER BY tuple() PARTITION BY p SETTINGS cleanup_delay_period=1, cleanup_delay_period_random_add=1, cleanup_thread_preferred_points_per_iteration=0," + "merge_selecting_sleep_ms=100, max_merge_selecting_sleep_ms=1000" ) node1.query("SYSTEM STOP MERGES mt3") From 1a6517d4a0904908dbc09fc5537c1bf08a49a5c9 Mon Sep 17 00:00:00 2001 From: Nikolay Degterinsky Date: Tue, 30 May 2023 17:13:28 +0000 Subject: [PATCH 0171/1072] Fix priority of the operators after IS NULL --- src/Parsers/ExpressionListParsers.cpp | 12 ++++++++++-- .../0_stateless/02477_is_null_parser.reference | 4 ++-- tests/queries/0_stateless/02477_is_null_parser.sql | 4 ++-- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/src/Parsers/ExpressionListParsers.cpp b/src/Parsers/ExpressionListParsers.cpp index 18a6de5b4f6..cd399531064 100644 --- a/src/Parsers/ExpressionListParsers.cpp +++ b/src/Parsers/ExpressionListParsers.cpp @@ -2736,11 +2736,19 @@ Action ParserExpressionImpl::tryParseOperator(Layers & layers, IParser::Pos & po } } - layers.back()->pushOperator(op); - /// isNull & isNotNull are postfix unary operators if (op.type == OperatorType::IsNull) + { + ASTPtr function = makeASTFunction(op); + + if (!layers.back()->popLastNOperands(function->children[0]->children, 1)) + return Action::NONE; + + layers.back()->pushOperand(std::move(function)); return Action::OPERATOR; + } + + layers.back()->pushOperator(op); if (op.type == OperatorType::Cast) { diff --git a/tests/queries/0_stateless/02477_is_null_parser.reference b/tests/queries/0_stateless/02477_is_null_parser.reference index 57d96862011..2820f5ec2db 100644 --- a/tests/queries/0_stateless/02477_is_null_parser.reference +++ b/tests/queries/0_stateless/02477_is_null_parser.reference @@ -1,3 +1,3 @@ -SELECT (\'a\' IS NULL) + (\'b\' IS NOT NULL) -SELECT (\'a\' IS NULL) = 0 +SELECT ((1 IS NULL) + 1) IS NOT NULL +SELECT (1 IS NULL) = 0 SELECT CAST(1 IS NULL, \'Int32\') diff --git a/tests/queries/0_stateless/02477_is_null_parser.sql b/tests/queries/0_stateless/02477_is_null_parser.sql index b95a35fde21..f3ec0affd85 100644 --- a/tests/queries/0_stateless/02477_is_null_parser.sql +++ b/tests/queries/0_stateless/02477_is_null_parser.sql @@ -1,3 +1,3 @@ -EXPLAIN SYNTAX SELECT 'a' IS NULL + 'b' IS NOT NULL; -EXPLAIN SYNTAX SELECT 'a' IS NULL = 0; +EXPLAIN SYNTAX SELECT 1 IS NULL + 1 IS NOT NULL; +EXPLAIN SYNTAX SELECT 1 IS NULL = 0; EXPLAIN SYNTAX SELECT 1 IS NULL :: Int32; From daaae3f573c03be49c9e015c249642034113374d Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Tue, 30 May 2023 19:11:26 +0000 Subject: [PATCH 0172/1072] Add toString() to fix time zone error --- tests/queries/0_stateless/01746_convert_type_with_default.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/01746_convert_type_with_default.sql b/tests/queries/0_stateless/01746_convert_type_with_default.sql index 18b5ae60920..40e4798721b 100644 --- a/tests/queries/0_stateless/01746_convert_type_with_default.sql +++ b/tests/queries/0_stateless/01746_convert_type_with_default.sql @@ -37,7 +37,7 @@ select toDateOrDefault(-1, '2000-01-01'::Date); select toDateTimeOrDefault('2023-05-30 14:38:20'); select toDateTimeOrDefault('2023-05-30 14:38:20', 'UTC'); -select toDateTimeOrDefault('s2023', 'UTC', '2023-05-30 14:38:20'::DateTime('UTC')); +select toString(toDateTimeOrDefault('s2023', 'Asia/Istanbul', '2023-05-30 14:38:20'::DateTime('Asia/Istanbul')), 'Asia/Istanbul'); select toDateTimeOrDefault(1685457500); SELECT toUUIDOrDefault('61f0c404-5cb3-11e7-907b-a6006ad3dba0', cast('59f0c404-5cb3-11e7-907b-a6006ad3dba0' as UUID)); From d4efbbfbd3ca954b83391c60afffe760cc602361 Mon Sep 17 00:00:00 2001 From: avogar Date: Tue, 30 May 2023 19:32:24 +0000 Subject: [PATCH 0173/1072] Allow to skip empty files in file/s3/url/hdfs table functions --- src/Core/Settings.h | 4 + src/Storages/HDFS/StorageHDFS.cpp | 113 +++++++++++------- src/Storages/HDFS/StorageHDFS.h | 23 +++- src/Storages/HDFS/StorageHDFSCluster.cpp | 2 +- src/Storages/StorageFile.cpp | 59 ++++++--- src/Storages/StorageS3.cpp | 12 +- src/Storages/StorageURL.cpp | 43 ++++--- src/Storages/StorageURL.h | 2 +- tests/integration/test_storage_hdfs/test.py | 50 ++++++++ tests/integration/test_storage_s3/test.py | 55 +++++++++ .../02771_skip_empty_files.reference | 7 ++ .../0_stateless/02771_skip_empty_files.sh | 24 ++++ 12 files changed, 307 insertions(+), 87 deletions(-) create mode 100644 tests/queries/0_stateless/02771_skip_empty_files.reference create mode 100755 tests/queries/0_stateless/02771_skip_empty_files.sh diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 607be1522db..534cb629aa8 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -91,6 +91,7 @@ class IColumn; M(UInt64, s3_list_object_keys_size, 1000, "Maximum number of files that could be returned in batch by ListObject request", 0) \ M(Bool, s3_truncate_on_insert, false, "Enables or disables truncate before insert in s3 engine tables.", 0) \ M(Bool, s3_create_new_file_on_insert, false, "Enables or disables creating a new file on each insert in s3 engine tables", 0) \ + M(Bool, s3_skip_empty_files, false, "Allow to skip empty files in s3 table engine", 0) \ M(Bool, s3_check_objects_after_upload, false, "Check each uploaded object to s3 with head request to be sure that upload was successful", 0) \ M(Bool, s3_allow_parallel_part_upload, true, "Use multiple threads for s3 multipart upload. It may lead to slightly higher memory usage", 0) \ M(Bool, s3_throw_on_zero_files_match, false, "Throw an error, when ListObjects request cannot match any files", 0) \ @@ -99,6 +100,7 @@ class IColumn; M(UInt64, hdfs_replication, 0, "The actual number of replications can be specified when the hdfs file is created.", 0) \ M(Bool, hdfs_truncate_on_insert, false, "Enables or disables truncate before insert in s3 engine tables", 0) \ M(Bool, hdfs_create_new_file_on_insert, false, "Enables or disables creating a new file on each insert in hdfs engine tables", 0) \ + M(Bool, hdfs_skip_empty_files, false, "Allow to skip empty files in hdfs table engine", 0) \ M(UInt64, hsts_max_age, 0, "Expired time for hsts. 0 means disable HSTS.", 0) \ M(Bool, extremes, false, "Calculate minimums and maximums of the result columns. They can be output in JSON-formats.", IMPORTANT) \ M(Bool, use_uncompressed_cache, false, "Whether to use the cache of uncompressed blocks.", 0) \ @@ -602,6 +604,8 @@ class IColumn; M(Bool, engine_file_empty_if_not_exists, false, "Allows to select data from a file engine table without file", 0) \ M(Bool, engine_file_truncate_on_insert, false, "Enables or disables truncate before insert in file engine tables", 0) \ M(Bool, engine_file_allow_create_multiple_files, false, "Enables or disables creating a new file on each insert in file engine tables if format has suffix.", 0) \ + M(Bool, engine_file_skip_empty_files, false, "Allows to skip empty files in file table engine", 0) \ + M(Bool, engine_url_skip_empty_files, false, "Allows to skip empty files in url table engine", 0) \ M(Bool, allow_experimental_database_replicated, false, "Allow to create databases with Replicated engine", 0) \ M(UInt64, database_replicated_initial_query_timeout_sec, 300, "How long initial DDL query should wait for Replicated database to precess previous DDL queue entries", 0) \ M(Bool, database_replicated_enforce_synchronous_settings, false, "Enforces synchronous waiting for some queries (see also database_atomic_wait_for_drop_and_detach_synchronously, mutation_sync, alter_sync). Not recommended to enable these settings.", 0) \ diff --git a/src/Storages/HDFS/StorageHDFS.cpp b/src/Storages/HDFS/StorageHDFS.cpp index 19c0840149b..08114ed3cba 100644 --- a/src/Storages/HDFS/StorageHDFS.cpp +++ b/src/Storages/HDFS/StorageHDFS.cpp @@ -66,7 +66,7 @@ namespace /* Recursive directory listing with matched paths as a result. * Have the same method in StorageFile. */ - Strings LSWithRegexpMatching(const String & path_for_ls, const HDFSFSPtr & fs, const String & for_match, std::unordered_map * last_mod_times) + std::vector LSWithRegexpMatching(const String & path_for_ls, const HDFSFSPtr & fs, const String & for_match) { const size_t first_glob = for_match.find_first_of("*?{"); @@ -88,7 +88,7 @@ namespace throw Exception( ErrorCodes::ACCESS_DENIED, "Cannot list directory {}: {}", prefix_without_globs, String(hdfsGetLastError())); } - Strings result; + std::vector result; if (!ls.file_info && ls.length > 0) throw Exception(ErrorCodes::LOGICAL_ERROR, "file_info shouldn't be null"); for (int i = 0; i < ls.length; ++i) @@ -102,17 +102,15 @@ namespace if (!is_directory && !looking_for_directory) { if (re2::RE2::FullMatch(file_name, matcher)) - { - result.push_back(String(ls.file_info[i].mName)); - if (last_mod_times) - (*last_mod_times)[result.back()] = ls.file_info[i].mLastMod; - } + result.emplace_back( + String(ls.file_info[i].mName), + StorageHDFS::PathInfo{ls.file_info[i].mLastMod, static_cast(ls.file_info[i].mSize)}); } else if (is_directory && looking_for_directory) { if (re2::RE2::FullMatch(file_name, matcher)) { - Strings result_part = LSWithRegexpMatching(fs::path(full_path) / "", fs, suffix_with_globs.substr(next_slash), last_mod_times); + std::vector result_part = LSWithRegexpMatching(fs::path(full_path) / "", fs, suffix_with_globs.substr(next_slash)); /// Recursion depth is limited by pattern. '*' works only for depth = 1, for depth = 2 pattern path is '*/*'. So we do not need additional check. std::move(result_part.begin(), result_part.end(), std::back_inserter(result)); } @@ -135,12 +133,20 @@ namespace throw Exception(ErrorCodes::BAD_ARGUMENTS, "Storage HDFS requires valid URL to be set"); } - std::vector getPathsList(const String & path_from_uri, const String & uri_without_path, ContextPtr context, std::unordered_map * last_mod_times = nullptr) + std::vector getPathsList(const String & path_from_uri, const String & uri_without_path, ContextPtr context) { HDFSBuilderWrapper builder = createHDFSBuilder(uri_without_path + "/", context->getGlobalContext()->getConfigRef()); HDFSFSPtr fs = createHDFSFS(builder.get()); - return LSWithRegexpMatching("/", fs, path_from_uri, last_mod_times); + return LSWithRegexpMatching("/", fs, path_from_uri); + } + + size_t getFileSize(const String & path_from_uri, const String & uri_without_path, ContextPtr context) + { + HDFSBuilderWrapper builder = createHDFSBuilder(uri_without_path + "/", context->getGlobalContext()->getConfigRef()); + HDFSFSPtr fs = createHDFSFS(builder.get()); + auto * info = hdfsGetPathInfo(fs.get(), path_from_uri.data()); + return info->mSize; } } @@ -199,9 +205,8 @@ ColumnsDescription StorageHDFS::getTableStructureFromData( ContextPtr ctx) { const auto [path_from_uri, uri_without_path] = getPathFromUriAndUriWithoutPath(uri); - std::unordered_map last_mod_time; - auto paths = getPathsList(path_from_uri, uri, ctx, &last_mod_time); - if (paths.empty() && !FormatFactory::instance().checkIfFormatHasExternalSchemaReader(format)) + auto paths_with_info = getPathsList(path_from_uri, uri, ctx); + if (paths_with_info.empty() && !FormatFactory::instance().checkIfFormatHasExternalSchemaReader(format)) throw Exception( ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Cannot extract table structure from {} format file, because there are no files in HDFS with provided path." @@ -209,14 +214,27 @@ ColumnsDescription StorageHDFS::getTableStructureFromData( std::optional columns_from_cache; if (ctx->getSettingsRef().schema_inference_use_cache_for_hdfs) - columns_from_cache = tryGetColumnsFromCache(paths, path_from_uri, last_mod_time, format, ctx); + columns_from_cache = tryGetColumnsFromCache(paths_with_info, path_from_uri, format, ctx); - ReadBufferIterator read_buffer_iterator = [&, my_uri_without_path = uri_without_path, it = paths.begin()](ColumnsDescription &) mutable -> std::unique_ptr + ReadBufferIterator read_buffer_iterator + = [&, my_uri_without_path = uri_without_path, it = paths_with_info.begin(), first = true]( + ColumnsDescription & columns) mutable -> std::unique_ptr { - if (it == paths.end()) + if (it == paths_with_info.end()) + { + if (first) + throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "Cannot extract table structure from {} format file, because all files are empty. " + "You must specify table structure manually", format); return nullptr; - auto compression = chooseCompressionMethod(*it, compression_method); - auto impl = std::make_unique(my_uri_without_path, *it++, ctx->getGlobalContext()->getConfigRef(), ctx->getReadSettings()); + } + + auto path_with_info = *it++; + if (ctx->getSettingsRef().hdfs_skip_empty_files && path_with_info.info && path_with_info.info->size == 0) + return read_buffer_iterator(columns); + + auto compression = chooseCompressionMethod(path_with_info.path, compression_method); + auto impl = std::make_unique(my_uri_without_path, path_with_info.path, ctx->getGlobalContext()->getConfigRef(), ctx->getReadSettings()); const Int64 zstd_window_log_max = ctx->getSettingsRef().zstd_window_log_max; return wrapReadBufferWithCompressionMethod(std::move(impl), compression, static_cast(zstd_window_log_max)); }; @@ -225,10 +243,10 @@ ColumnsDescription StorageHDFS::getTableStructureFromData( if (columns_from_cache) columns = *columns_from_cache; else - columns = readSchemaFromFormat(format, std::nullopt, read_buffer_iterator, paths.size() > 1, ctx); + columns = readSchemaFromFormat(format, std::nullopt, read_buffer_iterator, paths_with_info.size() > 1, ctx); if (ctx->getSettingsRef().schema_inference_use_cache_for_hdfs) - addColumnsToCache(paths, path_from_uri, columns, format, ctx); + addColumnsToCache(paths_with_info, path_from_uri, columns, format, ctx); return columns; } @@ -241,11 +259,11 @@ public: const auto [path_from_uri, uri_without_path] = getPathFromUriAndUriWithoutPath(uri); uris = getPathsList(path_from_uri, uri_without_path, context_); for (auto & elem : uris) - elem = uri_without_path + elem; + elem.path = uri_without_path + elem.path; uris_iter = uris.begin(); } - String next() + StorageHDFS::PathWithInfo next() { std::lock_guard lock(mutex); if (uris_iter != uris.end()) @@ -258,8 +276,8 @@ public: } private: std::mutex mutex; - Strings uris; - Strings::iterator uris_iter; + std::vector uris; + std::vector::iterator uris_iter; }; class HDFSSource::URISIterator::Impl @@ -279,14 +297,14 @@ public: uris_iter = uris.begin(); } - String next() + StorageHDFS::PathWithInfo next() { std::lock_guard lock(mutex); if (uris_iter == uris.end()) - return ""; + return {"", {}}; auto key = *uris_iter; ++uris_iter; - return key; + return {key, {}}; } private: @@ -298,7 +316,7 @@ private: HDFSSource::DisclosedGlobIterator::DisclosedGlobIterator(ContextPtr context_, const String & uri) : pimpl(std::make_shared(context_, uri)) {} -String HDFSSource::DisclosedGlobIterator::next() +StorageHDFS::PathWithInfo HDFSSource::DisclosedGlobIterator::next() { return pimpl->next(); } @@ -308,7 +326,7 @@ HDFSSource::URISIterator::URISIterator(const std::vector & uris_, Contex { } -String HDFSSource::URISIterator::next() +StorageHDFS::PathWithInfo HDFSSource::URISIterator::next() { return pimpl->next(); } @@ -343,12 +361,21 @@ HDFSSource::HDFSSource( bool HDFSSource::initialize() { - current_path = (*file_iterator)(); - if (current_path.empty()) + auto path_with_info = (*file_iterator)(); + if (path_with_info.path.empty()) return false; + current_path = path_with_info.path; const auto [path_from_uri, uri_without_path] = getPathFromUriAndUriWithoutPath(current_path); + if (getContext()->getSettingsRef().hdfs_skip_empty_files) + { + auto file_size = path_with_info.info ? path_with_info.info->size : getFileSize(path_from_uri, uri_without_path, getContext()); + /// If file is empty and hdfs_skip_empty_files=1, skip it and go to the next file. + if (file_size == 0) + return initialize(); + } + auto compression = chooseCompressionMethod(path_from_uri, storage->compression_method); auto impl = std::make_unique( uri_without_path, path_from_uri, getContext()->getGlobalContext()->getConfigRef(), getContext()->getReadSettings()); @@ -553,8 +580,8 @@ Pipe StorageHDFS::read( if (distributed_processing) { iterator_wrapper = std::make_shared( - [callback = context_->getReadTaskCallback()]() -> String { - return callback(); + [callback = context_->getReadTaskCallback()]() -> StorageHDFS::PathWithInfo { + return StorageHDFS::PathWithInfo{callback(), std::nullopt}; }); } else if (is_path_with_globs) @@ -761,24 +788,22 @@ SchemaCache & StorageHDFS::getSchemaCache(const ContextPtr & ctx) } std::optional StorageHDFS::tryGetColumnsFromCache( - const Strings & paths, + const std::vector & paths_with_info, const String & uri_without_path, - std::unordered_map & last_mod_time, const String & format_name, const ContextPtr & ctx) { auto & schema_cache = getSchemaCache(ctx); - for (const auto & path : paths) + for (const auto & path_with_info : paths_with_info) { auto get_last_mod_time = [&]() -> std::optional { - auto it = last_mod_time.find(path); - if (it == last_mod_time.end()) - return std::nullopt; - return it->second; + if (path_with_info.info) + return path_with_info.info->last_mod_time; + return std::nullopt; }; - String url = fs::path(uri_without_path) / path; + String url = fs::path(uri_without_path) / path_with_info.path; auto cache_key = getKeyForSchemaCache(url, format_name, {}, ctx); auto columns = schema_cache.tryGet(cache_key, get_last_mod_time); if (columns) @@ -789,7 +814,7 @@ std::optional StorageHDFS::tryGetColumnsFromCache( } void StorageHDFS::addColumnsToCache( - const Strings & paths, + const std::vector & paths_with_info, const String & uri_without_path, const ColumnsDescription & columns, const String & format_name, @@ -797,8 +822,8 @@ void StorageHDFS::addColumnsToCache( { auto & schema_cache = getSchemaCache(ctx); Strings sources; - sources.reserve(paths.size()); - std::transform(paths.begin(), paths.end(), std::back_inserter(sources), [&](const String & path){ return fs::path(uri_without_path) / path; }); + sources.reserve(paths_with_info.size()); + std::transform(paths_with_info.begin(), paths_with_info.end(), std::back_inserter(sources), [&](const PathWithInfo & path_with_info){ return fs::path(uri_without_path) / path_with_info.path; }); auto cache_keys = getKeysForSchemaCache(sources, format_name, {}, ctx); schema_cache.addMany(cache_keys, columns); } diff --git a/src/Storages/HDFS/StorageHDFS.h b/src/Storages/HDFS/StorageHDFS.h index b123834e981..87ad5aee6a3 100644 --- a/src/Storages/HDFS/StorageHDFS.h +++ b/src/Storages/HDFS/StorageHDFS.h @@ -18,6 +18,18 @@ namespace DB class StorageHDFS final : public IStorage, WithContext { public: + struct PathInfo + { + time_t last_mod_time; + size_t size; + }; + + struct PathWithInfo + { + String path; + std::optional info; + }; + StorageHDFS( const String & uri_, const StorageID & table_id_, @@ -72,14 +84,13 @@ protected: private: static std::optional tryGetColumnsFromCache( - const Strings & paths, + const std::vector & paths_with_info, const String & uri_without_path, - std::unordered_map & last_mod_time, const String & format_name, const ContextPtr & ctx); static void addColumnsToCache( - const Strings & paths, + const std::vector & paths, const String & uri_without_path, const ColumnsDescription & columns, const String & format_name, @@ -105,7 +116,7 @@ public: { public: DisclosedGlobIterator(ContextPtr context_, const String & uri_); - String next(); + StorageHDFS::PathWithInfo next(); private: class Impl; /// shared_ptr to have copy constructor @@ -116,14 +127,14 @@ public: { public: URISIterator(const std::vector & uris_, ContextPtr context); - String next(); + StorageHDFS::PathWithInfo next(); private: class Impl; /// shared_ptr to have copy constructor std::shared_ptr pimpl; }; - using IteratorWrapper = std::function; + using IteratorWrapper = std::function; using StorageHDFSPtr = std::shared_ptr; static Block getHeader(Block sample_block, const std::vector & requested_virtual_columns); diff --git a/src/Storages/HDFS/StorageHDFSCluster.cpp b/src/Storages/HDFS/StorageHDFSCluster.cpp index 46e67b623e2..b98459aeee3 100644 --- a/src/Storages/HDFS/StorageHDFSCluster.cpp +++ b/src/Storages/HDFS/StorageHDFSCluster.cpp @@ -79,7 +79,7 @@ void StorageHDFSCluster::addColumnsStructureToQuery(ASTPtr & query, const String RemoteQueryExecutor::Extension StorageHDFSCluster::getTaskIteratorExtension(ASTPtr, const ContextPtr & context) const { auto iterator = std::make_shared(context, uri); - auto callback = std::make_shared([iter = std::move(iterator)]() mutable -> String { return iter->next(); }); + auto callback = std::make_shared>([iter = std::move(iterator)]() mutable -> String { return iter->next().path; }); return RemoteQueryExecutor::Extension{.task_iterator = std::move(callback)}; } diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index 647f9511052..a20a4e63ba6 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -257,35 +257,40 @@ std::unique_ptr selectReadBuffer( return res; } -std::unique_ptr createReadBuffer( - const String & current_path, - bool use_table_fd, - const String & storage_name, - int table_fd, - const String & compression_method, - ContextPtr context) +struct stat getFileStat(const String & current_path, bool use_table_fd, int table_fd, const String & storage_name) { - CompressionMethod method; - struct stat file_stat{}; - if (use_table_fd) { /// Check if file descriptor allows random reads (and reading it twice). if (0 != fstat(table_fd, &file_stat)) throwFromErrno("Cannot stat table file descriptor, inside " + storage_name, ErrorCodes::CANNOT_STAT); - - method = chooseCompressionMethod("", compression_method); } else { /// Check if file descriptor allows random reads (and reading it twice). if (0 != stat(current_path.c_str(), &file_stat)) throwFromErrno("Cannot stat file " + current_path, ErrorCodes::CANNOT_STAT); - - method = chooseCompressionMethod(current_path, compression_method); } + return file_stat; +} + +std::unique_ptr createReadBuffer( + const String & current_path, + const struct stat & file_stat, + bool use_table_fd, + int table_fd, + const String & compression_method, + ContextPtr context) +{ + CompressionMethod method; + + if (use_table_fd) + method = chooseCompressionMethod("", compression_method); + else + method = chooseCompressionMethod(current_path, compression_method); + std::unique_ptr nested_buffer = selectReadBuffer(current_path, use_table_fd, table_fd, file_stat, context); /// For clickhouse-local and clickhouse-client add progress callback to display progress bar. @@ -355,7 +360,8 @@ ColumnsDescription StorageFile::getTableStructureFromFileDescriptor(ContextPtr c { /// We will use PeekableReadBuffer to create a checkpoint, so we need a place /// where we can store the original read buffer. - read_buffer_from_fd = createReadBuffer("", true, getName(), table_fd, compression_method, context); + auto file_stat = getFileStat("", true, table_fd, getName()); + read_buffer_from_fd = createReadBuffer("", file_stat, true, table_fd, compression_method, context); auto read_buf = std::make_unique(*read_buffer_from_fd); read_buf->setCheckpoint(); return read_buf; @@ -396,12 +402,24 @@ ColumnsDescription StorageFile::getTableStructureFromFile( if (context->getSettingsRef().schema_inference_use_cache_for_file) columns_from_cache = tryGetColumnsFromCache(paths, format, format_settings, context); - ReadBufferIterator read_buffer_iterator = [&, it = paths.begin()](ColumnsDescription &) mutable -> std::unique_ptr + ReadBufferIterator read_buffer_iterator = [&, it = paths.begin(), first = true](ColumnsDescription & columns) mutable -> std::unique_ptr { if (it == paths.end()) + { + if (first) + throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "Cannot extract table structure from {} format file, because all files are empty. You must specify table structure manually", + format); return nullptr; + } - return createReadBuffer(*it++, false, "File", -1, compression_method, context); + auto path = *it++; + auto file_stat = getFileStat(path, false, -1, "File"); + if (context->getSettingsRef().engine_file_skip_empty_files && file_stat.st_size == 0) + return read_buffer_iterator(columns); + + first = false; + return createReadBuffer(path, file_stat, false, -1, compression_method, context); }; ColumnsDescription columns; @@ -628,7 +646,12 @@ public: } if (!read_buf) - read_buf = createReadBuffer(current_path, storage->use_table_fd, storage->getName(), storage->table_fd, storage->compression_method, context); + { + auto file_stat = getFileStat(current_path, storage->use_table_fd, storage->table_fd, storage->getName()); + if (context->getSettingsRef().engine_file_skip_empty_files && file_stat.st_size == 0) + continue; + read_buf = createReadBuffer(current_path, file_stat, storage->use_table_fd, storage->table_fd, storage->compression_method, context); + } const Settings & settings = context->getSettingsRef(); chassert(!storage->paths.empty()); diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index 2a2192d9cfe..9c4791020f2 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -573,6 +573,11 @@ StorageS3Source::ReaderHolder StorageS3Source::createReader() return {}; size_t object_size = info ? info->size : S3::getObjectSize(*client, bucket, current_key, version_id, request_settings); + + /// If object is empty and s3_skip_empty_files=1, skip it and go to the next key. + if (getContext()->getSettingsRef().s3_skip_empty_files && object_size == 0) + return createReader(); + auto compression_method = chooseCompressionMethod(current_key, compression_hint); InputFormatPtr input_format; @@ -1456,7 +1461,7 @@ ColumnsDescription StorageS3::getTableStructureFromDataImpl( ReadBufferIterator read_buffer_iterator = [&, first = true](ColumnsDescription & cached_columns) mutable -> std::unique_ptr { - auto [key, _] = (*file_iterator)(); + auto [key, info] = (*file_iterator)(); if (key.empty()) { @@ -1464,11 +1469,14 @@ ColumnsDescription StorageS3::getTableStructureFromDataImpl( throw Exception( ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Cannot extract table structure from {} format file, because there are no files with provided path " - "in S3. You must specify table structure manually", configuration.format); + "in S3 or all files are empty. You must specify table structure manually", configuration.format); return nullptr; } + if (ctx->getSettingsRef().s3_skip_empty_files && info->size == 0) + return read_buffer_iterator(cached_columns); + /// S3 file iterator could get new keys after new iteration, check them in schema cache. if (ctx->getSettingsRef().schema_inference_use_cache_for_s3 && read_keys.size() > prev_read_keys_size) { diff --git a/src/Storages/StorageURL.cpp b/src/Storages/StorageURL.cpp index da8f6a151b2..706ce481a24 100644 --- a/src/Storages/StorageURL.cpp +++ b/src/Storages/StorageURL.cpp @@ -49,6 +49,7 @@ namespace ErrorCodes extern const int NETWORK_ERROR; extern const int BAD_ARGUMENTS; extern const int LOGICAL_ERROR; + extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; } static constexpr auto bad_arguments_error_message = "Storage URL requires 1-4 arguments: " @@ -242,15 +243,16 @@ StorageURLSource::StorageURLSource( auto headers = getHeaders(headers_); /// Lazy initialization. We should not perform requests in constructor, because we need to do it in query pipeline. - initialize = [=, this](const FailoverOptions & uri_options) + initialize = [=, this]() { - if (uri_options.empty()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Got empty url list"); + const auto current_uri_options = (*uri_iterator)(); + if (current_uri_options.empty()) + return false; - auto first_option = uri_options.begin(); + auto first_option = current_uri_options.begin(); auto [actual_uri, buf_factory] = getFirstAvailableURIAndReadBuffer( first_option, - uri_options.end(), + current_uri_options.end(), context, params, http_method, @@ -259,7 +261,11 @@ StorageURLSource::StorageURLSource( credentials, headers, glob_url, - uri_options.size() == 1); + current_uri_options.size() == 1); + + /// If file is empty and engine_url_skip_empty_files=1, skip it and go to the next file. + if (context->getSettingsRef().engine_url_skip_empty_files && buf_factory->getFileSize() == 0) + return initialize(); curr_uri = actual_uri; @@ -292,6 +298,7 @@ StorageURLSource::StorageURLSource( pipeline = std::make_unique(QueryPipelineBuilder::getPipeline(std::move(builder))); reader = std::make_unique(*pipeline); + return true; }; } @@ -306,14 +313,8 @@ Chunk StorageURLSource::generate() break; } - if (!reader) - { - auto current_uri = (*uri_iterator)(); - if (current_uri.empty()) - return {}; - - initialize(current_uri); - } + if (!reader && !initialize()) + return {}; Chunk chunk; if (reader->pull(chunk)) @@ -592,10 +593,16 @@ ColumnsDescription IStorageURLBase::getTableStructureFromData( if (context->getSettingsRef().schema_inference_use_cache_for_url) columns_from_cache = tryGetColumnsFromCache(urls_to_check, headers, credentials, format, format_settings, context); - ReadBufferIterator read_buffer_iterator = [&, it = urls_to_check.cbegin()](ColumnsDescription &) mutable -> std::unique_ptr + ReadBufferIterator read_buffer_iterator = [&, it = urls_to_check.cbegin(), first = true](ColumnsDescription & columns) mutable -> std::unique_ptr { if (it == urls_to_check.cend()) + { + if (first) + throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "Cannot extract table structure from {} format file, because all files are empty. " + "You must specify table structure manually", format); return nullptr; + } auto [_, buf_factory] = StorageURLSource::getFirstAvailableURIAndReadBuffer( it, @@ -609,7 +616,13 @@ ColumnsDescription IStorageURLBase::getTableStructureFromData( headers, false, false); + ++it; + + if (context->getSettingsRef().engine_url_skip_empty_files && buf_factory->getFileSize() == 0) + return read_buffer_iterator(columns); + + first = false; return wrapReadBufferWithCompressionMethod( buf_factory->getReader(), compression_method, diff --git a/src/Storages/StorageURL.h b/src/Storages/StorageURL.h index d53b72105e4..4cd4b66e69a 100644 --- a/src/Storages/StorageURL.h +++ b/src/Storages/StorageURL.h @@ -197,7 +197,7 @@ public: bool delay_initialization); private: - using InitializeFunc = std::function; + using InitializeFunc = std::function; InitializeFunc initialize; String name; diff --git a/tests/integration/test_storage_hdfs/test.py b/tests/integration/test_storage_hdfs/test.py index edf5344e887..5ac1d3bea6f 100644 --- a/tests/integration/test_storage_hdfs/test.py +++ b/tests/integration/test_storage_hdfs/test.py @@ -816,6 +816,56 @@ def test_hdfsCluster_unset_skip_unavailable_shards(started_cluster): ) +def test_skip_empty_files(started_cluster): + node = started_cluster.instances["node1"] + + node.query( + f"insert into function hdfs('hdfs://hdfs1:9000/skip_empty_files1.parquet', TSVRaw) select * from numbers(0) settings hdfs_truncate_on_insert=1" + ) + + node.query( + f"insert into function hdfs('hdfs://hdfs1:9000/skip_empty_files2.parquet') select * from numbers(1) settings hdfs_truncate_on_insert=1" + ) + + node.query_and_get_error( + f"select * from hdfs('hdfs://hdfs1:9000/skip_empty_files1.parquet') settings hdfs_skip_empty_files=0" + ) + + node.query_and_get_error( + f"select * from hdfs('hdfs://hdfs1:9000/skip_empty_files1.parquet', auto, 'number UINt64') settings hdfs_skip_empty_files=0" + ) + + node.query_and_get_error( + f"select * from hdfs('hdfs://hdfs1:9000/skip_empty_files1.parquet') settings hdfs_skip_empty_files=1" + ) + + res = node.query( + f"select * from hdfs('hdfs://hdfs1:9000/skip_empty_files1.parquet', auto, 'number UInt64') settings hdfs_skip_empty_files=1" + ) + + assert len(res) == 0 + + node.query_and_get_error( + f"select * from hdfs('hdfs://hdfs1:9000/skip_empty_files*.parquet') settings hdfs_skip_empty_files=0" + ) + + node.query_and_get_error( + f"select * from hdfs('hdfs://hdfs1:9000/skip_empty_files*.parquet', auto, 'number UInt64') settings hdfs_skip_empty_files=0" + ) + + res = node.query( + f"select * from hdfs('hdfs://hdfs1:9000/skip_empty_files*.parquet') settings hdfs_skip_empty_files=1" + ) + + assert int(res) == 0 + + res = node.query( + f"select * from hdfs('hdfs://hdfs1:9000/skip_empty_files*.parquet', auto, 'number UInt64') settings hdfs_skip_empty_files=1" + ) + + assert int(res) == 0 + + if __name__ == "__main__": cluster.start() input("Cluster created, press any key to destroy...") diff --git a/tests/integration/test_storage_s3/test.py b/tests/integration/test_storage_s3/test.py index d9ac70f51ad..516c8ed152a 100644 --- a/tests/integration/test_storage_s3/test.py +++ b/tests/integration/test_storage_s3/test.py @@ -1713,3 +1713,58 @@ def test_s3_list_objects_failure(started_cluster): assert ei.value.returncode == 243 assert "Could not list objects" in ei.value.stderr + + +def test_skip_empty_files(started_cluster): + bucket = started_cluster.minio_bucket + instance = started_cluster.instances["dummy"] + + + instance.query( + f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/skip_empty_files1.parquet', TSVRaw) select * from numbers(0) settings s3_truncate_on_insert=1" + ) + + instance.query( + f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/skip_empty_files2.parquet') select * from numbers(1) settings s3_truncate_on_insert=1" + ) + def test(engine, setting): + instance.query_and_get_error( + f"select * from {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/skip_empty_files1.parquet') settings {setting}=0" + ) + + instance.query_and_get_error( + f"select * from {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/skip_empty_files1.parquet', auto, 'number UINt64') settings {setting}=0" + ) + + instance.query_and_get_error( + f"select * from {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/skip_empty_files1.parquet') settings {setting}=1" + ) + + res = instance.query( + f"select * from {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/skip_empty_files1.parquet', auto, 'number UInt64') settings {setting}=1" + ) + + assert len(res) == 0 + + instance.query_and_get_error( + f"select * from {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/skip_empty_files{{1,2}}.parquet') settings {setting}=0" + ) + + instance.query_and_get_error( + f"select * from {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/skip_empty_files{{1,2}}.parquet', auto, 'number UInt64') settings {setting}=0" + ) + + res = instance.query( + f"select * from {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/skip_empty_files{{1,2}}.parquet') settings {setting}=1" + ) + + assert int(res) == 0 + + res = instance.query( + f"select * from {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/skip_empty_files{{1,2}}.parquet', auto, 'number UInt64') settings {setting}=1" + ) + + assert int(res) == 0 + + test("s3", "s3_skip_empty_files") + test("url", "engine_url_skip_empty_files") diff --git a/tests/queries/0_stateless/02771_skip_empty_files.reference b/tests/queries/0_stateless/02771_skip_empty_files.reference new file mode 100644 index 00000000000..83f2e99acd0 --- /dev/null +++ b/tests/queries/0_stateless/02771_skip_empty_files.reference @@ -0,0 +1,7 @@ +1 +1 +1 +1 +1 +0 +0 diff --git a/tests/queries/0_stateless/02771_skip_empty_files.sh b/tests/queries/0_stateless/02771_skip_empty_files.sh new file mode 100755 index 00000000000..99f43d7868a --- /dev/null +++ b/tests/queries/0_stateless/02771_skip_empty_files.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +FILE_PREFIX=$CLICKHOUSE_TEST_UNIQUE_NAME +touch $FILE_PREFIX-1.parquet +$CLICKHOUSE_LOCAL -q "select * from numbers(1) format Parquet" > $FILE_PREFIX-2.parquet +$CLICKHOUSE_LOCAL -q "select * from file('$FILE_PREFIX-1.parquet') settings engine_file_skip_empty_files=0" 2>&1 | grep -c "CANNOT_EXTRACT_TABLE_STRUCTURE" +$CLICKHOUSE_LOCAL -q "select * from file('$FILE_PREFIX-1.parquet', auto, 'number UInt64') settings engine_file_skip_empty_files=0" 2>&1 | grep -c "Exception" +$CLICKHOUSE_LOCAL -q "select * from file('$FILE_PREFIX-1.parquet') settings engine_file_skip_empty_files=1" 2>&1 | grep -c "Exception" +$CLICKHOUSE_LOCAL -q "select * from file('$FILE_PREFIX-1.parquet', auto, 'number UInt64') settings engine_file_skip_empty_files=1" +$CLICKHOUSE_LOCAL -q "select * from file('$FILE_PREFIX-*.parquet') settings engine_file_skip_empty_files=0" 2>&1 | grep -c "Exception" +$CLICKHOUSE_LOCAL -q "select * from file('$FILE_PREFIX-*.parquet', auto, 'number UInt64') settings engine_file_skip_empty_files=0" 2>&1 | grep -c "Exception" +$CLICKHOUSE_LOCAL -q "select * from file('$FILE_PREFIX-*.parquet') settings engine_file_skip_empty_files=1" +$CLICKHOUSE_LOCAL -q "select * from file('$FILE_PREFIX-*.parquet', auto, 'number UInt64') settings engine_file_skip_empty_files=1" + + + + + + +rm $FILE_PREFIX-* From 38634cc5c5221a6ec646fc11dff34deda7c6b7d2 Mon Sep 17 00:00:00 2001 From: tpanetti Date: Wed, 24 May 2023 13:49:18 +0100 Subject: [PATCH 0174/1072] Convert Clickhouse Types to MySQL types in Compatibility mode This changes MySQL compatibility mode to display MySQL compatible types --- src/DataTypes/DataTypeAggregateFunction.h | 1 + src/DataTypes/DataTypeArray.h | 4 + src/DataTypes/DataTypeDate.h | 1 + src/DataTypes/DataTypeDate32.h | 1 + src/DataTypes/DataTypeDateTime.h | 1 + src/DataTypes/DataTypeDateTime64.h | 1 + src/DataTypes/DataTypeEnum.cpp | 24 ++++ src/DataTypes/DataTypeEnum.h | 3 + src/DataTypes/DataTypeFixedString.h | 1 + src/DataTypes/DataTypeFunction.h | 1 + src/DataTypes/DataTypeIPv4andIPv6.h | 4 + src/DataTypes/DataTypeInterval.h | 1 + src/DataTypes/DataTypeLowCardinality.h | 2 + src/DataTypes/DataTypeMap.h | 1 + src/DataTypes/DataTypeNothing.h | 2 + src/DataTypes/DataTypeNullable.h | 1 + src/DataTypes/DataTypeNumberBase.cpp | 17 +++ src/DataTypes/DataTypeNumberBase.h | 3 + src/DataTypes/DataTypeObject.h | 1 + src/DataTypes/DataTypeSet.h | 2 + src/DataTypes/DataTypeString.h | 3 + src/DataTypes/DataTypeTuple.h | 1 + src/DataTypes/DataTypeUUID.h | 2 + src/DataTypes/DataTypesDecimal.h | 3 + src/DataTypes/IDataType.h | 10 ++ src/Storages/System/StorageSystemColumns.cpp | 15 ++- .../02740_show_columns_mysql_compatibility.sh | 116 ++++++++++++++++++ 27 files changed, 221 insertions(+), 1 deletion(-) create mode 100755 tests/queries/0_stateless/02740_show_columns_mysql_compatibility.sh diff --git a/src/DataTypes/DataTypeAggregateFunction.h b/src/DataTypes/DataTypeAggregateFunction.h index 2d712d9c686..697be13652c 100644 --- a/src/DataTypes/DataTypeAggregateFunction.h +++ b/src/DataTypes/DataTypeAggregateFunction.h @@ -45,6 +45,7 @@ public: String doGetName() const override; String getNameWithoutVersion() const; const char * getFamilyName() const override { return "AggregateFunction"; } + const char * getMySQLName() const override { return "text"; } TypeIndex getTypeId() const override { return TypeIndex::AggregateFunction; } Array getParameters() const { return parameters; } diff --git a/src/DataTypes/DataTypeArray.h b/src/DataTypes/DataTypeArray.h index 033a657c845..35462df9a4e 100644 --- a/src/DataTypes/DataTypeArray.h +++ b/src/DataTypes/DataTypeArray.h @@ -30,6 +30,10 @@ public: { return "Array"; } + const char * getMySQLName() const override + { + return "string"; + } bool canBeInsideNullable() const override { diff --git a/src/DataTypes/DataTypeDate.h b/src/DataTypes/DataTypeDate.h index 2f17207cc07..33bcb6123ff 100644 --- a/src/DataTypes/DataTypeDate.h +++ b/src/DataTypes/DataTypeDate.h @@ -13,6 +13,7 @@ public: TypeIndex getTypeId() const override { return TypeIndex::Date; } const char * getFamilyName() const override { return family_name; } + const char * getMySQLName() const override { return "date"; } bool canBeUsedAsVersion() const override { return true; } bool canBeInsideNullable() const override { return true; } diff --git a/src/DataTypes/DataTypeDate32.h b/src/DataTypes/DataTypeDate32.h index 9160b62dc15..56315f46e8c 100644 --- a/src/DataTypes/DataTypeDate32.h +++ b/src/DataTypes/DataTypeDate32.h @@ -13,6 +13,7 @@ public: TypeIndex getTypeId() const override { return TypeIndex::Date32; } const char * getFamilyName() const override { return family_name; } + const char * getMySQLName() const override { return "date"; } Field getDefault() const override { diff --git a/src/DataTypes/DataTypeDateTime.h b/src/DataTypes/DataTypeDateTime.h index 91a09ff7cb9..c868f92c311 100644 --- a/src/DataTypes/DataTypeDateTime.h +++ b/src/DataTypes/DataTypeDateTime.h @@ -36,6 +36,7 @@ public: static constexpr auto family_name = "DateTime"; const char * getFamilyName() const override { return family_name; } + const char * getMySQLName() const override { return "datetime"; } String doGetName() const override; TypeIndex getTypeId() const override { return TypeIndex::DateTime; } diff --git a/src/DataTypes/DataTypeDateTime64.h b/src/DataTypes/DataTypeDateTime64.h index aaa99485040..8d317bb9430 100644 --- a/src/DataTypes/DataTypeDateTime64.h +++ b/src/DataTypes/DataTypeDateTime64.h @@ -28,6 +28,7 @@ public: DataTypeDateTime64(UInt32 scale_, const TimezoneMixin & time_zone_info); const char * getFamilyName() const override { return family_name; } + const char * getMySQLName() const override { return "datetime"; } std::string doGetName() const override; TypeIndex getTypeId() const override { return type_id; } diff --git a/src/DataTypes/DataTypeEnum.cpp b/src/DataTypes/DataTypeEnum.cpp index 3c3ac2ae4e2..bfed4d4d5a2 100644 --- a/src/DataTypes/DataTypeEnum.cpp +++ b/src/DataTypes/DataTypeEnum.cpp @@ -36,6 +36,29 @@ const char * DataTypeEnum::getFamilyName() const return EnumName::value; } +template +std::string DataTypeEnum::generateMySQLName(const Values & values) +{ + WriteBufferFromOwnString out; + + writeString("enum", out); + writeChar('(', out); + + auto first = true; + for (const auto & name_and_value : values) + { + if (!first) + writeString(", ", out); + + first = false; + + writeQuotedString(name_and_value.first, out); + } + + writeChar(')', out); + + return out.str(); +} template std::string DataTypeEnum::generateName(const Values & values) @@ -67,6 +90,7 @@ template DataTypeEnum::DataTypeEnum(const Values & values_) : EnumValues(values_) , type_name(generateName(this->getValues())) + , my_sql_type_name(generateMySQLName(this->getValues())) { } diff --git a/src/DataTypes/DataTypeEnum.h b/src/DataTypes/DataTypeEnum.h index 2f607fc2aa6..c6e523adf96 100644 --- a/src/DataTypes/DataTypeEnum.h +++ b/src/DataTypes/DataTypeEnum.h @@ -45,13 +45,16 @@ public: private: std::string type_name; + std::string my_sql_type_name; static std::string generateName(const Values & values); + static std::string generateMySQLName(const Values & values); public: explicit DataTypeEnum(const Values & values_); std::string doGetName() const override { return type_name; } const char * getFamilyName() const override; + const char * getMySQLName() const override { return my_sql_type_name.c_str(); } TypeIndex getTypeId() const override { return type_id; } diff --git a/src/DataTypes/DataTypeFixedString.h b/src/DataTypes/DataTypeFixedString.h index 8d114121c1a..eb09914ec9c 100644 --- a/src/DataTypes/DataTypeFixedString.h +++ b/src/DataTypes/DataTypeFixedString.h @@ -42,6 +42,7 @@ public: TypeIndex getTypeId() const override { return type_id; } const char * getFamilyName() const override { return "FixedString"; } + const char * getMySQLName() const override { return "text"; } size_t getN() const { diff --git a/src/DataTypes/DataTypeFunction.h b/src/DataTypes/DataTypeFunction.h index 888bcb6a775..f3423796126 100644 --- a/src/DataTypes/DataTypeFunction.h +++ b/src/DataTypes/DataTypeFunction.h @@ -24,6 +24,7 @@ public: std::string doGetName() const override; const char * getFamilyName() const override { return "Function"; } + const char * getMySQLName() const override { return "text"; } TypeIndex getTypeId() const override { return TypeIndex::Function; } const DataTypes & getArgumentTypes() const diff --git a/src/DataTypes/DataTypeIPv4andIPv6.h b/src/DataTypes/DataTypeIPv4andIPv6.h index ad70bdae933..8f7fe79793b 100644 --- a/src/DataTypes/DataTypeIPv4andIPv6.h +++ b/src/DataTypes/DataTypeIPv4andIPv6.h @@ -19,6 +19,8 @@ public: static constexpr auto type_id = TypeToTypeIndex; const char * getFamilyName() const override { return TypeName.data(); } + const char * getMySQLName() const override { return "text"; } + TypeIndex getTypeId() const override { return type_id; } Field getDefault() const override { return IPv4{}; } @@ -59,6 +61,8 @@ public: static constexpr auto type_id = TypeToTypeIndex; const char * getFamilyName() const override { return TypeName.data(); } + const char * getMySQLName() const override { return "text"; } + TypeIndex getTypeId() const override { return type_id; } Field getDefault() const override { return IPv6{}; } diff --git a/src/DataTypes/DataTypeInterval.h b/src/DataTypes/DataTypeInterval.h index 05abe1d9b24..69a56e8aadd 100644 --- a/src/DataTypes/DataTypeInterval.h +++ b/src/DataTypes/DataTypeInterval.h @@ -26,6 +26,7 @@ public: std::string doGetName() const override { return fmt::format("Interval{}", kind.toString()); } const char * getFamilyName() const override { return "Interval"; } + const char * getMySQLName() const override { return "text"; } TypeIndex getTypeId() const override { return TypeIndex::Interval; } bool equals(const IDataType & rhs) const override; diff --git a/src/DataTypes/DataTypeLowCardinality.h b/src/DataTypes/DataTypeLowCardinality.h index d301a0f5443..6fd4344311c 100644 --- a/src/DataTypes/DataTypeLowCardinality.h +++ b/src/DataTypes/DataTypeLowCardinality.h @@ -22,6 +22,8 @@ public: return "LowCardinality(" + dictionary_type->getName() + ")"; } const char * getFamilyName() const override { return "LowCardinality"; } + const char * getMySQLName() const override { return "text"; } + TypeIndex getTypeId() const override { return TypeIndex::LowCardinality; } MutableColumnPtr createColumn() const override; diff --git a/src/DataTypes/DataTypeMap.h b/src/DataTypes/DataTypeMap.h index 4712f6bbdef..526dc321f44 100644 --- a/src/DataTypes/DataTypeMap.h +++ b/src/DataTypes/DataTypeMap.h @@ -30,6 +30,7 @@ public: TypeIndex getTypeId() const override { return TypeIndex::Map; } std::string doGetName() const override; const char * getFamilyName() const override { return "Map"; } + const char * getMySQLName() const override { return "json"; } bool canBeInsideNullable() const override { return false; } diff --git a/src/DataTypes/DataTypeNothing.h b/src/DataTypes/DataTypeNothing.h index c7d12388de9..fdef6026603 100644 --- a/src/DataTypes/DataTypeNothing.h +++ b/src/DataTypes/DataTypeNothing.h @@ -16,6 +16,8 @@ public: static constexpr bool is_parametric = false; const char * getFamilyName() const override { return "Nothing"; } + const char * getMySQLName() const override { return "text"; } + TypeIndex getTypeId() const override { return TypeIndex::Nothing; } MutableColumnPtr createColumn() const override; diff --git a/src/DataTypes/DataTypeNullable.h b/src/DataTypes/DataTypeNullable.h index 06d46fb15ed..64b201d32b2 100644 --- a/src/DataTypes/DataTypeNullable.h +++ b/src/DataTypes/DataTypeNullable.h @@ -16,6 +16,7 @@ public: explicit DataTypeNullable(const DataTypePtr & nested_data_type_); std::string doGetName() const override { return "Nullable(" + nested_data_type->getName() + ")"; } const char * getFamilyName() const override { return "Nullable"; } + const char * getMySQLName() const override { return nested_data_type->getMySQLName(); } TypeIndex getTypeId() const override { return TypeIndex::Nullable; } MutableColumnPtr createColumn() const override; diff --git a/src/DataTypes/DataTypeNumberBase.cpp b/src/DataTypes/DataTypeNumberBase.cpp index f668a4c522e..cd5e73ac4a1 100644 --- a/src/DataTypes/DataTypeNumberBase.cpp +++ b/src/DataTypes/DataTypeNumberBase.cpp @@ -30,6 +30,23 @@ bool DataTypeNumberBase::isValueRepresentedByUnsignedInteger() const return is_integer && is_unsigned_v; } +template +const std::map DataTypeNumberBase::mysqlTypeMap = { + {"UInt8", "tinyint unsigned"}, + {"UInt16", "smallint unsigned"}, + {"UInt32", "mediumint unsigned"}, + {"UInt64", "bigint unsigned"}, + {"UInt128", "bigint unsigned"}, + {"UInt256", "bigint unsigned"}, + {"Int8", "tinyint"}, + {"Int16", "smallint"}, + {"Int32", "int"}, + {"Int64", "bigint"}, + {"Int128", "bigint"}, + {"Int256", "bigint"}, + {"Float32", "float"}, + {"Float64", "double"}, +}; /// Explicit template instantiations - to avoid code bloat in headers. template class DataTypeNumberBase; diff --git a/src/DataTypes/DataTypeNumberBase.h b/src/DataTypes/DataTypeNumberBase.h index 3a5b11c5124..b5c963cf245 100644 --- a/src/DataTypes/DataTypeNumberBase.h +++ b/src/DataTypes/DataTypeNumberBase.h @@ -20,11 +20,14 @@ public: static constexpr bool is_parametric = false; static constexpr auto family_name = TypeName; static constexpr auto type_id = TypeToTypeIndex; + // Create a map from the name of the type to the name of the type in MySQL. + static const std::map mysqlTypeMap; using FieldType = T; using ColumnType = ColumnVector; const char * getFamilyName() const override { return TypeName.data(); } + const char * getMySQLName() const override { return mysqlTypeMap.at(TypeName.data()).c_str(); } TypeIndex getTypeId() const override { return TypeToTypeIndex; } Field getDefault() const override; diff --git a/src/DataTypes/DataTypeObject.h b/src/DataTypes/DataTypeObject.h index 937a9091371..8a2c36abcd7 100644 --- a/src/DataTypes/DataTypeObject.h +++ b/src/DataTypes/DataTypeObject.h @@ -23,6 +23,7 @@ public: DataTypeObject(const String & schema_format_, bool is_nullable_); const char * getFamilyName() const override { return "Object"; } + const char * getMySQLName() const override { return "json"; } String doGetName() const override; TypeIndex getTypeId() const override { return TypeIndex::Object; } diff --git a/src/DataTypes/DataTypeSet.h b/src/DataTypes/DataTypeSet.h index 7ddfeb9fe30..bdad638b5d5 100644 --- a/src/DataTypes/DataTypeSet.h +++ b/src/DataTypes/DataTypeSet.h @@ -15,6 +15,8 @@ class DataTypeSet final : public IDataTypeDummy public: static constexpr bool is_parametric = true; const char * getFamilyName() const override { return "Set"; } + const char * getMySQLName() const override { return "text"; } + TypeIndex getTypeId() const override { return TypeIndex::Set; } bool equals(const IDataType & rhs) const override { return typeid(rhs) == typeid(*this); } bool isParametric() const override { return true; } diff --git a/src/DataTypes/DataTypeString.h b/src/DataTypes/DataTypeString.h index 5f3bde43a13..3ac739fe68c 100644 --- a/src/DataTypes/DataTypeString.h +++ b/src/DataTypes/DataTypeString.h @@ -21,6 +21,9 @@ public: return "String"; } + // FIXME: string can contain arbitrary bytes, not only UTF-8 sequences + const char * getMySQLName() const override { return "text"; } + TypeIndex getTypeId() const override { return type_id; } MutableColumnPtr createColumn() const override; diff --git a/src/DataTypes/DataTypeTuple.h b/src/DataTypes/DataTypeTuple.h index 152f21015f5..d264cc97f60 100644 --- a/src/DataTypes/DataTypeTuple.h +++ b/src/DataTypes/DataTypeTuple.h @@ -33,6 +33,7 @@ public: TypeIndex getTypeId() const override { return TypeIndex::Tuple; } std::string doGetName() const override; const char * getFamilyName() const override { return "Tuple"; } + const char * getMySQLName() const override { return "json"; } bool canBeInsideNullable() const override { return false; } bool supportsSparseSerialization() const override { return true; } diff --git a/src/DataTypes/DataTypeUUID.h b/src/DataTypes/DataTypeUUID.h index af9f1f35ca5..4d54db42b45 100644 --- a/src/DataTypes/DataTypeUUID.h +++ b/src/DataTypes/DataTypeUUID.h @@ -18,6 +18,8 @@ public: static constexpr auto type_id = TypeIndex::UUID; const char * getFamilyName() const override { return "UUID"; } + const char * getMySQLName() const override { return "char"; } + TypeIndex getTypeId() const override { return type_id; } Field getDefault() const override; diff --git a/src/DataTypes/DataTypesDecimal.h b/src/DataTypes/DataTypesDecimal.h index 583f7ea804a..5c9405cb060 100644 --- a/src/DataTypes/DataTypesDecimal.h +++ b/src/DataTypes/DataTypesDecimal.h @@ -37,8 +37,11 @@ public: using Base::Base; static constexpr auto family_name = "Decimal"; + static constexpr auto mysql_name = "decimal"; const char * getFamilyName() const override { return family_name; } + const char * getMySQLName() const override { return mysql_name; } + std::string doGetName() const override; TypeIndex getTypeId() const override { return TypeToTypeIndex; } bool canBePromoted() const override { return true; } diff --git a/src/DataTypes/IDataType.h b/src/DataTypes/IDataType.h index 7cc18fea00c..2bed18897ce 100644 --- a/src/DataTypes/IDataType.h +++ b/src/DataTypes/IDataType.h @@ -71,10 +71,19 @@ public: return doGetName(); } + /// MySQL equivalent Name of data type (examples: UInt64, Array(String)). + String getMySQLTypeName() const + { + if (custom_name) + return custom_name->getName(); + else + return doGetMySQLName(); + } DataTypePtr getPtr() const { return shared_from_this(); } /// Name of data type family (example: FixedString, Array). virtual const char * getFamilyName() const = 0; + virtual const char * getMySQLName() const = 0; /// Data type id. It's used for runtime type checks. virtual TypeIndex getTypeId() const = 0; @@ -126,6 +135,7 @@ public: protected: virtual String doGetName() const { return getFamilyName(); } + virtual String doGetMySQLName() const { return getMySQLName(); } virtual SerializationPtr doGetDefaultSerialization() const = 0; public: diff --git a/src/Storages/System/StorageSystemColumns.cpp b/src/Storages/System/StorageSystemColumns.cpp index 18e7d269795..f391a392dbb 100644 --- a/src/Storages/System/StorageSystemColumns.cpp +++ b/src/Storages/System/StorageSystemColumns.cpp @@ -74,6 +74,7 @@ public: : ISource(header_) , columns_mask(std::move(columns_mask_)), max_block_size(max_block_size_) , databases(std::move(databases_)), tables(std::move(tables_)), storages(std::move(storages_)) + , clientInfo(context->getClientInfo()) , total_tables(tables->size()), access(context->getAccess()) , query_id(context->getCurrentQueryId()), lock_acquire_timeout(context->getSettingsRef().lock_acquire_timeout) { @@ -129,6 +130,17 @@ protected: bool check_access_for_columns = check_access_for_tables && !access->isGranted(AccessType::SHOW_COLUMNS, database_name, table_name); + auto get_type_name = [this](const IDataType& type) -> std::string + { + if (clientInfo.interface == DB::ClientInfo::Interface::MYSQL) + { + return type.getMySQLTypeName(); + } + else + { + return type.getName(); + } + }; size_t position = 0; for (const auto & column : columns) { @@ -146,7 +158,7 @@ protected: if (columns_mask[src_index++]) res_columns[res_index++]->insert(column.name); if (columns_mask[src_index++]) - res_columns[res_index++]->insert(column.type->getName()); + res_columns[res_index++]->insert(get_type_name(*column.type)); if (columns_mask[src_index++]) res_columns[res_index++]->insert(position); @@ -281,6 +293,7 @@ private: ColumnPtr databases; ColumnPtr tables; Storages storages; + ClientInfo clientInfo; size_t db_table_num = 0; size_t total_tables; std::shared_ptr access; diff --git a/tests/queries/0_stateless/02740_show_columns_mysql_compatibility.sh b/tests/queries/0_stateless/02740_show_columns_mysql_compatibility.sh new file mode 100755 index 00000000000..7f828d35679 --- /dev/null +++ b/tests/queries/0_stateless/02740_show_columns_mysql_compatibility.sh @@ -0,0 +1,116 @@ +#!/bin/bash + +# This script tests the MySQL compatibility of the SHOW COLUMNS command in ClickHouse +USER="default" +PASSWORD="" +HOST="127.0.0.1" +PORT=9004 + +# First run the clickhouse test to create the ClickHouse Tables + +echo "Drop tables if they exist" +${CLICKHOUSE_LOCAL} --query "DROP TABLE IF EXISTS tab" +${CLICKHOUSE_LOCAL} --query "DROP TABLE IF EXISTS database_123456789abcde" +${CLICKHOUSE_LOCAL} --query "DROP TABLE IF EXISTS database_123456789abcde.tab" + +echo "Create tab table " +${CLICKHOUSE_LOCAL} --query " + CREATE TABLE tab + ( + uint64 UInt64, + int32 Nullable(Int32), + float32 Float32, + float64 Float64, + decimal_value Decimal(10, 2), + boolean_value UInt8, -- Use 0 for false, 1 for true + string_value String, + fixed_string_value FixedString(10), + date_value Date, + date32_value Date32, + datetime_value DateTime, + datetime64_value DateTime64(3), + json_value String, -- Store JSON as a string + uuid_value UUID, + enum_value Enum8('apple' = 1, 'banana' = 2, 'orange' = 3), + low_cardinality LowCardinality(String), + array_value Array(Int32), + map_value Map(String, Int32), + tuple_value Tuple(Int32, String), + nullable_value Nullable(Int32), + ipv4_value IPv4, + ipv6_value IPv6, + nested Nested + ( + nested_int Int32, + nested_string String + ) + ) ENGINE = MergeTree + ORDER BY uint64; + " + + +echo "Create pseudo-random database name" +${CLICKHOUSE_LOCAL} --query "CREATE DATABASE database_123456789abcde;" + +echo "Create tab duplicate table" +${CLICKHOUSE_LOCAL} --query " + CREATE TABLE database_123456789abcde.tab + ( + uint64 UInt64, + int32 Nullable(Int32), + float32 Float32, + float64 Float64, + decimal_value Decimal(10, 2), + boolean_value UInt8, -- Use 0 for false, 1 for true + string_value String, + fixed_string_value FixedString(10), + date_value Date, + date32_value Date32, + datetime_value DateTime, + datetime64_value DateTime64(3), + json_value String, -- Store JSON as a string + uuid_value UUID, + enum_value Enum8('apple' = 1, 'banana' = 2, 'orange' = 3), + low_cardinality LowCardinality(String), + array_value Array(Int32), + map_value Map(String, Int32), + tuple_value Tuple(Int32, String), + nullable_value Nullable(Int32), + ipv4_value IPv4, + ipv6_value IPv6, + nested Nested + ( + nested_int Int32, + nested_string String + ) + ) ENGINE = MergeTree + ORDER BY uint64; + " + +# Write sql to temp file +TEMP_FILE=$(mktemp) + +cat < $TEMP_FILE +SHOW COLUMNS FROM tab; +SHOW EXTENDED COLUMNS FROM tab; +SHOW FULL COLUMNS FROM tab; +SHOW COLUMNS FROM tab LIKE '%int%'; +SHOW COLUMNS FROM tab NOT LIKE '%int%'; +SHOW COLUMNS FROM tab ILIKE '%INT%'; +SHOW COLUMNS FROM tab NOT ILIKE '%INT%'; +SHOW COLUMNS FROM tab WHERE field LIKE '%int%'; +SHOW COLUMNS FROM tab LIMIT 1; +SHOW COLUMNS FROM tab; +SHOW COLUMNS FROM tab FROM database_123456789abcde; +SHOW COLUMNS FROM database_123456789abcde.tab; +DROP DATABASE database_123456789abcde; +DROP TABLE tab; +EOT + +# Now run the MySQL test script on the ClickHouse DB +echo "Run MySQL test" +mysql --user="$USER" --password="$PASSWORD" --host="$HOST" --port="$PORT" < $TEMP_FILE + +# Clean up the temp file +rm $TEMP_FILE + From bd5a1ae2b97b66361e5b958811a6055f8f5cd2ae Mon Sep 17 00:00:00 2001 From: tpanetti Date: Tue, 30 May 2023 13:32:33 -0700 Subject: [PATCH 0175/1072] Revert "Change SHOW COLUMNS query to display MySQL types in MySQL Compatibility mode" This reverts commit ddbad79c5e67518acebbacaad5be0cad3967ac67. --- .../InterpreterShowColumnsQuery.cpp | 76 +------ .../InterpreterShowColumnsQuery.h | 1 - ...show_columns_mysql_compatibility.reference | 213 ------------------ .../02726_show_columns_mysql_compatibility.sh | 115 ---------- 4 files changed, 3 insertions(+), 402 deletions(-) delete mode 100644 tests/queries/0_stateless/02726_show_columns_mysql_compatibility.reference delete mode 100755 tests/queries/0_stateless/02726_show_columns_mysql_compatibility.sh diff --git a/src/Interpreters/InterpreterShowColumnsQuery.cpp b/src/Interpreters/InterpreterShowColumnsQuery.cpp index 0ad93e37b58..c86d3c753c4 100644 --- a/src/Interpreters/InterpreterShowColumnsQuery.cpp +++ b/src/Interpreters/InterpreterShowColumnsQuery.cpp @@ -42,11 +42,9 @@ SELECT if (default_kind IN ('ALIAS', 'DEFAULT', 'MATERIALIZED'), default_expression, NULL) AS default, '' AS extra )"; - rewritten_query += getMySQLQuery(); - } - else { - rewritten_query += "SELECT name AS field, type AS type, startsWith(type, 'Nullable') AS null, trim(concatWithSeparator(' ', if(is_in_primary_key, 'PRI', ''), if (is_in_sorting_key, 'SOR', ''))) AS key, if(default_kind IN ('ALIAS', 'DEFAULT', 'MATERIALIZED'), default_expression, NULL) AS default, '' AS extra "; - } + // TODO Interpret query.extended. It is supposed to show internal/virtual columns. Need to fetch virtual column names, see + // IStorage::getVirtuals(). We can't easily do that via SQL. + if (query.full) { /// "Full" mode is mostly for MySQL compat @@ -90,74 +88,6 @@ WHERE return rewritten_query; } -String InterpreterShowColumnsQuery::getMySQLQuery() -{ - String mysql_specific_query; - - mysql_specific_query = R"(SELECT name AS field, - CASE - WHEN startsWith(type, 'Nullable') THEN - CASE - WHEN substring(type, 10, length(type) - 10) IN ('UInt8', 'Int8') THEN 'tinyint' - WHEN substring(type, 10, length(type) - 10) IN ('UInt16', 'Int16') THEN 'smallint' - WHEN substring(type, 10, length(type) - 10) IN ('UInt32', 'Int32') THEN 'int' - WHEN substring(type, 10, length(type) - 10) IN ('UInt64', 'Int64', 'UInt128', 'Int128', 'UInt256', 'Int256') THEN 'bigint' - WHEN substring(type, 10, length(type) - 10) = 'Float32' THEN 'float' - WHEN substring(type, 10, length(type) - 10) = 'Float64' THEN 'double' - WHEN substring(type, 10, length(type) - 10) LIKE 'Decimal%' THEN 'decimal' - WHEN substring(type, 10, length(type) - 10) = 'Boolean' THEN 'tinyint' - WHEN substring(type, 10, length(type) - 10) = 'String' THEN 'text' - WHEN substring(type, 10, length(type) - 10) LIKE 'FixedString%' THEN 'text' - WHEN substring(type, 10, length(type) - 10) LIKE 'Date%' THEN 'date' - WHEN substring(type, 10, length(type) - 10) LIKE 'DateTime%' THEN 'datetime' - WHEN substring(type, 10, length(type) - 10) = 'JSON' THEN 'json' - WHEN substring(type, 10, length(type) - 10) = 'UUID' THEN 'binary' - WHEN substring(type, 10, length(type) - 10) LIKE 'Enum%' THEN 'enum' - WHEN substring(type, 10, length(type) - 10) LIKE 'LowCardinality%' THEN 'text' - WHEN substring(type, 10, length(type) - 10) LIKE 'Array%' THEN 'json' - WHEN substring(type, 10, length(type) - 10) LIKE 'Map%' THEN 'json' - WHEN substring(type, 10, length(type) - 10) IN ('SimpleAggregateFunction', 'AggregateFunction') THEN 'text' - WHEN substring(type, 10, length(type) - 10) = 'Nested' THEN 'json' - WHEN substring(type, 10, length(type) - 10) LIKE 'Tuple%' THEN 'json' - WHEN substring(type, 10, length(type) - 10) LIKE 'IPv%' THEN 'text' - WHEN substring(type, 10, length(type) - 10) IN ('Expression', 'Set', 'Nothing', 'Interval') THEN 'text' - ELSE substring(type, 10, length(type) - 10) - END - ELSE - CASE - WHEN type IN ('UInt8', 'Int8') THEN 'tinyint' - WHEN type IN ('UInt16', 'Int16') THEN 'smallint' - WHEN type IN ('UInt32', 'Int32') THEN 'int' - WHEN type IN ('UInt64', 'Int64', 'UInt128', 'Int128', 'UInt256', 'Int256') THEN 'bigint' - WHEN type = 'Float32' THEN 'float' - WHEN type = 'Float64' THEN 'double' - WHEN type LIKE 'Decimal%' THEN 'decimal' - WHEN type = 'Boolean' THEN 'tinyint' - WHEN type = 'String' THEN 'text' - WHEN type LIKE 'FixedString%' THEN 'text' - WHEN type LIKE 'Date%' THEN 'date' - WHEN type LIKE 'DateTime%' THEN 'datetime' - WHEN type = 'JSON' THEN 'json' - WHEN type = 'UUID' THEN 'binary' - WHEN type LIKE 'Enum%' THEN 'enum' - WHEN type LIKE 'LowCardinality%' THEN 'text' - WHEN type LIKE 'Array%' THEN 'json' - WHEN type LIKE 'Map%' THEN 'json' - WHEN type IN ('SimpleAggregateFunction', 'AggregateFunction') THEN 'text' - WHEN type = 'Nested' THEN 'json' - WHEN type LIKE 'Tuple%' THEN 'json' - WHEN type LIKE 'IPv%' THEN 'text' - WHEN type IN ('Expression', 'Set', 'Nothing', 'Interval') THEN 'text' - ELSE type - END - END AS type, - startsWith(type, 'Nullable') AS null, - trim(concatWithSeparator(' ', if(is_in_primary_key, 'PRI', ''), if (is_in_sorting_key, 'SOR', ''))) AS key, - if(default_kind IN ('ALIAS', 'DEFAULT', 'MATERIALIZED'), default_expression, NULL) AS default, - '' AS extra )"; - - return mysql_specific_query.str(); -} BlockIO InterpreterShowColumnsQuery::execute() { diff --git a/src/Interpreters/InterpreterShowColumnsQuery.h b/src/Interpreters/InterpreterShowColumnsQuery.h index b843a163978..ee6dcabd97b 100644 --- a/src/Interpreters/InterpreterShowColumnsQuery.h +++ b/src/Interpreters/InterpreterShowColumnsQuery.h @@ -26,7 +26,6 @@ private: ASTPtr query_ptr; String getRewrittenQuery(); - String getMySQLQuery(); }; diff --git a/tests/queries/0_stateless/02726_show_columns_mysql_compatibility.reference b/tests/queries/0_stateless/02726_show_columns_mysql_compatibility.reference deleted file mode 100644 index c9ad94a34c4..00000000000 --- a/tests/queries/0_stateless/02726_show_columns_mysql_compatibility.reference +++ /dev/null @@ -1,213 +0,0 @@ -Drop tables if they exist -Create tab table -Create pseudo-random database name -Create tab duplicate table -Run MySQL test -field type null key default extra -array_value json 0 NULL -boolean_value tinyint 0 NULL -date32_value date 0 NULL -date_value date 0 NULL -datetime64_value date 0 NULL -datetime_value date 0 NULL -decimal_value decimal 0 NULL -enum_value enum 0 NULL -fixed_string_value text 0 NULL -float32 float 0 NULL -float64 double 0 NULL -int32 int 0 NULL -ipv4_value text 0 NULL -ipv6_value text 0 NULL -json_value text 0 NULL -low_cardinality text 0 NULL -map_value json 0 NULL -nested.nested_int json 0 NULL -nested.nested_string json 0 NULL -nullable_value int 0 NULL -string_value text 0 NULL -tuple_value json 0 NULL -uint64 bigint 0 PRI SOR NULL -uuid_value binary 0 NULL -field type null key default extra -array_value json 0 NULL -boolean_value tinyint 0 NULL -date32_value date 0 NULL -date_value date 0 NULL -datetime64_value date 0 NULL -datetime_value date 0 NULL -decimal_value decimal 0 NULL -enum_value enum 0 NULL -fixed_string_value text 0 NULL -float32 float 0 NULL -float64 double 0 NULL -int32 int 0 NULL -ipv4_value text 0 NULL -ipv6_value text 0 NULL -json_value text 0 NULL -low_cardinality text 0 NULL -map_value json 0 NULL -nested.nested_int json 0 NULL -nested.nested_string json 0 NULL -nullable_value int 0 NULL -string_value text 0 NULL -tuple_value json 0 NULL -uint64 bigint 0 PRI SOR NULL -uuid_value binary 0 NULL -field type null key default extra collation comment privileges -array_value json 0 NULL NULL -boolean_value tinyint 0 NULL NULL -date32_value date 0 NULL NULL -date_value date 0 NULL NULL -datetime64_value date 0 NULL NULL -datetime_value date 0 NULL NULL -decimal_value decimal 0 NULL NULL -enum_value enum 0 NULL NULL -fixed_string_value text 0 NULL NULL -float32 float 0 NULL NULL -float64 double 0 NULL NULL -int32 int 0 NULL NULL -ipv4_value text 0 NULL NULL -ipv6_value text 0 NULL NULL -json_value text 0 NULL NULL -low_cardinality text 0 NULL NULL -map_value json 0 NULL NULL -nested.nested_int json 0 NULL NULL -nested.nested_string json 0 NULL NULL -nullable_value int 0 NULL NULL -string_value text 0 NULL NULL -tuple_value json 0 NULL NULL -uint64 bigint 0 PRI SOR NULL NULL -uuid_value binary 0 NULL NULL -field type null key default extra -int32 int 0 NULL -nested.nested_int json 0 NULL -uint64 bigint 0 PRI SOR NULL -field type null key default extra -array_value json 0 NULL -boolean_value tinyint 0 NULL -date32_value date 0 NULL -date_value date 0 NULL -datetime64_value date 0 NULL -datetime_value date 0 NULL -decimal_value decimal 0 NULL -enum_value enum 0 NULL -fixed_string_value text 0 NULL -float32 float 0 NULL -float64 double 0 NULL -ipv4_value text 0 NULL -ipv6_value text 0 NULL -json_value text 0 NULL -low_cardinality text 0 NULL -map_value json 0 NULL -nested.nested_string json 0 NULL -nullable_value int 0 NULL -string_value text 0 NULL -tuple_value json 0 NULL -uuid_value binary 0 NULL -field type null key default extra -int32 int 0 NULL -nested.nested_int json 0 NULL -uint64 bigint 0 PRI SOR NULL -field type null key default extra -array_value json 0 NULL -boolean_value tinyint 0 NULL -date32_value date 0 NULL -date_value date 0 NULL -datetime64_value date 0 NULL -datetime_value date 0 NULL -decimal_value decimal 0 NULL -enum_value enum 0 NULL -fixed_string_value text 0 NULL -float32 float 0 NULL -float64 double 0 NULL -ipv4_value text 0 NULL -ipv6_value text 0 NULL -json_value text 0 NULL -low_cardinality text 0 NULL -map_value json 0 NULL -nested.nested_string json 0 NULL -nullable_value int 0 NULL -string_value text 0 NULL -tuple_value json 0 NULL -uuid_value binary 0 NULL -field type null key default extra -int32 int 0 NULL -nested.nested_int json 0 NULL -uint64 bigint 0 PRI SOR NULL -field type null key default extra -array_value json 0 NULL -field type null key default extra -array_value json 0 NULL -boolean_value tinyint 0 NULL -date32_value date 0 NULL -date_value date 0 NULL -datetime64_value date 0 NULL -datetime_value date 0 NULL -decimal_value decimal 0 NULL -enum_value enum 0 NULL -fixed_string_value text 0 NULL -float32 float 0 NULL -float64 double 0 NULL -int32 int 0 NULL -ipv4_value text 0 NULL -ipv6_value text 0 NULL -json_value text 0 NULL -low_cardinality text 0 NULL -map_value json 0 NULL -nested.nested_int json 0 NULL -nested.nested_string json 0 NULL -nullable_value int 0 NULL -string_value text 0 NULL -tuple_value json 0 NULL -uint64 bigint 0 PRI SOR NULL -uuid_value binary 0 NULL -field type null key default extra -array_value json 0 NULL -boolean_value tinyint 0 NULL -date32_value date 0 NULL -date_value date 0 NULL -datetime64_value date 0 NULL -datetime_value date 0 NULL -decimal_value decimal 0 NULL -enum_value enum 0 NULL -fixed_string_value text 0 NULL -float32 float 0 NULL -float64 double 0 NULL -int32 int 0 NULL -ipv4_value text 0 NULL -ipv6_value text 0 NULL -json_value text 0 NULL -low_cardinality text 0 NULL -map_value json 0 NULL -nested.nested_int json 0 NULL -nested.nested_string json 0 NULL -nullable_value int 0 NULL -string_value text 0 NULL -tuple_value json 0 NULL -uint64 bigint 0 PRI SOR NULL -uuid_value binary 0 NULL -field type null key default extra -array_value json 0 NULL -boolean_value tinyint 0 NULL -date32_value date 0 NULL -date_value date 0 NULL -datetime64_value date 0 NULL -datetime_value date 0 NULL -decimal_value decimal 0 NULL -enum_value enum 0 NULL -fixed_string_value text 0 NULL -float32 float 0 NULL -float64 double 0 NULL -int32 int 0 NULL -ipv4_value text 0 NULL -ipv6_value text 0 NULL -json_value text 0 NULL -low_cardinality text 0 NULL -map_value json 0 NULL -nested.nested_int json 0 NULL -nested.nested_string json 0 NULL -nullable_value int 0 NULL -string_value text 0 NULL -tuple_value json 0 NULL -uint64 bigint 0 PRI SOR NULL -uuid_value binary 0 NULL diff --git a/tests/queries/0_stateless/02726_show_columns_mysql_compatibility.sh b/tests/queries/0_stateless/02726_show_columns_mysql_compatibility.sh deleted file mode 100755 index 5324496edd3..00000000000 --- a/tests/queries/0_stateless/02726_show_columns_mysql_compatibility.sh +++ /dev/null @@ -1,115 +0,0 @@ -#!/bin/bash - -# This script tests the MySQL compatibility of the SHOW COLUMNS command in ClickHouse -USER="default" -PASSWORD="" -HOST="127.0.0.1" -PORT=9004 - -# First run the clickhouse test to create the ClickHouse Tables - -echo "Drop tables if they exist" -${CLICKHOUSE_LOCAL} --query "DROP TABLE IF EXISTS tab" -${CLICKHOUSE_LOCAL} --query "DROP TABLE IF EXISTS database_123456789abcde" -${CLICKHOUSE_LOCAL} --query "DROP TABLE IF EXISTS database_123456789abcde.tab" - -echo "Create tab table " -${CLICKHOUSE_LOCAL} --query " - CREATE TABLE tab - ( - uint64 UInt64, - int32 Nullable(Int32), - float32 Float32, - float64 Float64, - decimal_value Decimal(10, 2), - boolean_value UInt8, -- Use 0 for false, 1 for true - string_value String, - fixed_string_value FixedString(10), - date_value Date, - date32_value Date32, - datetime_value DateTime, - datetime64_value DateTime64(3), - json_value String, -- Store JSON as a string - uuid_value UUID, - enum_value Enum8('apple' = 1, 'banana' = 2, 'orange' = 3), - low_cardinality LowCardinality(String), - array_value Array(Int32), - map_value Map(String, Int32), - tuple_value Tuple(Int32, String), - nullable_value Nullable(Int32), - ipv4_value IPv4, - ipv6_value IPv6, - nested Nested - ( - nested_int Int32, - nested_string String - ) - ) ENGINE = MergeTree - ORDER BY uint64; - " - - -echo "Create pseudo-random database name" -${CLICKHOUSE_LOCAL} --query "CREATE DATABASE database_123456789abcde;" - -echo "Create tab duplicate table" -${CLICKHOUSE_LOCAL} --query " - CREATE TABLE database_123456789abcde.tab - ( - uint64 UInt64, - int32 Nullable(Int32), - float32 Float32, - float64 Float64, - decimal_value Decimal(10, 2), - boolean_value UInt8, -- Use 0 for false, 1 for true - string_value String, - fixed_string_value FixedString(10), - date_value Date, - date32_value Date32, - datetime_value DateTime, - datetime64_value DateTime64(3), - json_value String, -- Store JSON as a string - uuid_value UUID, - enum_value Enum8('apple' = 1, 'banana' = 2, 'orange' = 3), - low_cardinality LowCardinality(String), - array_value Array(Int32), - map_value Map(String, Int32), - tuple_value Tuple(Int32, String), - nullable_value Nullable(Int32), - ipv4_value IPv4, - ipv6_value IPv6, - nested Nested - ( - nested_int Int32, - nested_string String - ) - ) ENGINE = MergeTree - ORDER BY uint64; - " - -# Write sql to temp file -TEMP_FILE=$(mktemp) - -cat < $TEMP_FILE -SHOW COLUMNS FROM tab; -SHOW EXTENDED COLUMNS FROM tab; -SHOW FULL COLUMNS FROM tab; -SHOW COLUMNS FROM tab LIKE '%int%'; -SHOW COLUMNS FROM tab NOT LIKE '%int%'; -SHOW COLUMNS FROM tab ILIKE '%INT%'; -SHOW COLUMNS FROM tab NOT ILIKE '%INT%'; -SHOW COLUMNS FROM tab WHERE field LIKE '%int%'; -SHOW COLUMNS FROM tab LIMIT 1; -SHOW COLUMNS FROM tab; -SHOW COLUMNS FROM tab FROM database_123456789abcde; -SHOW COLUMNS FROM database_123456789abcde.tab; -DROP DATABASE database_123456789abcde; -DROP TABLE tab; -EOT - -# Now run the MySQL test script on the ClickHouse DB -echo "Run MySQL test" -mysql --user="$USER" --password="$PASSWORD" --host="$HOST" --port="$PORT" < $TEMP_FILE - -# Clean up the temp file -rm $TEMP_FILE From dbd3766f5f662e3338c30fd5408e4818598b9660 Mon Sep 17 00:00:00 2001 From: pufit Date: Sat, 27 May 2023 01:33:07 -0400 Subject: [PATCH 0176/1072] Specify roles in users.xml --- src/Access/UsersConfigAccessStorage.cpp | 159 +++++++++++++----- src/Parsers/Access/ParserGrantQuery.h | 2 +- .../configs/another_user.xml | 25 +++ .../test_user_grants_from_config/test.py | 18 +- 4 files changed, 163 insertions(+), 41 deletions(-) diff --git a/src/Access/UsersConfigAccessStorage.cpp b/src/Access/UsersConfigAccessStorage.cpp index df0e4584709..187258d0fcd 100644 --- a/src/Access/UsersConfigAccessStorage.cpp +++ b/src/Access/UsersConfigAccessStorage.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -13,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -52,11 +54,64 @@ namespace UUID generateID(const IAccessEntity & entity) { return generateID(entity.getType(), entity.getName()); } + template + void parseGrant(T & entity, const String & string_query, const std::unordered_set & allowed_role_ids) + { + ParserGrantQuery parser; + parser.setParseWithoutGrantees(); + + String error_message; + const char * pos = string_query.data(); + auto ast = tryParseQuery(parser, pos, pos + string_query.size(), error_message, false, "", false, 0, 0); + + if (!ast) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Failed to parse grant query. Error: {}", error_message); + + auto & query = ast->as(); + + if (query.roles && query.is_revoke) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Roles can't be revoked in config file"); + + if (!query.cluster.empty()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Can't grant on cluster using config file"); + + if (query.grantees) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "You can't specify grantees in query using config file"); + + for (auto & element : query.access_rights_elements) + { + if (query.is_revoke) + entity.access.revoke(element); + else + entity.access.grant(element); + } + + if (query.roles) + { + std::vector roles_to_grant; + roles_to_grant.reserve(query.roles->size()); + + for (const auto & role_name : query.roles->names) + { + auto role_id = generateID(AccessEntityType::ROLE, role_name); + if (!allowed_role_ids.contains(role_id)) + throw Exception(ErrorCodes::THERE_IS_NO_PROFILE, "Role {} was not found", role_name); + + roles_to_grant.push_back(role_id); + } + + if (query.admin_option) + entity.granted_roles.grantWithAdminOption(roles_to_grant); + else + entity.granted_roles.grant(roles_to_grant); + } + } UserPtr parseUser( const Poco::Util::AbstractConfiguration & config, const String & user_name, const std::unordered_set & allowed_profile_ids, + const std::unordered_set & allowed_role_ids, bool allow_no_password, bool allow_plaintext_password) { @@ -241,37 +296,8 @@ namespace if (grant_queries) { - ParserGrantQuery parser; - parser.parseWithoutGrantees(); - for (const auto & string_query : *grant_queries) - { - String error_message; - const char * pos = string_query.data(); - auto ast = tryParseQuery(parser, pos, pos + string_query.size(), error_message, false, "", false, 0, 0); - - if (!ast) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Failed to parse grant query. Error: {}", error_message); - - auto & query = ast->as(); - - if (query.roles) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Roles can't be granted in config file"); - - if (!query.cluster.empty()) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Can't grant on cluster using config file"); - - if (query.grantees) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "You can't specify grantees in query using config file"); - - for (auto & element : query.access_rights_elements) - { - if (query.is_revoke) - user->access.revoke(element); - else - user->access.grant(element); - } - } + parseGrant(*user, string_query, allowed_role_ids); } else { @@ -321,6 +347,7 @@ namespace std::vector parseUsers( const Poco::Util::AbstractConfiguration & config, const std::unordered_set & allowed_profile_ids, + const std::unordered_set & allowed_role_ids, bool allow_no_password, bool allow_plaintext_password) { @@ -333,7 +360,7 @@ namespace { try { - users.push_back(parseUser(config, user_name, allowed_profile_ids, allow_no_password, allow_plaintext_password)); + users.push_back(parseUser(config, user_name, allowed_profile_ids, allowed_role_ids, allow_no_password, allow_plaintext_password)); } catch (Exception & e) { @@ -345,6 +372,55 @@ namespace return users; } + RolePtr parseRole( + const Poco::Util::AbstractConfiguration & config, + const String & role_name, + const std::unordered_set & allowed_role_ids) + { + auto role = std::make_shared(); + role->setName(role_name); + String role_config = "roles." + role_name; + + const auto grants_config = role_config + ".grants"; + if (config.has(grants_config)) + { + Poco::Util::AbstractConfiguration::Keys keys; + config.keys(grants_config, keys); + for (const auto & key : keys) + { + const auto query = config.getString(grants_config + "." + key); + parseGrant(*role, query, allowed_role_ids); + } + } + + return role; + } + + std::vector parseRoles( + const Poco::Util::AbstractConfiguration & config, + const std::unordered_set & allowed_role_ids) + { + Poco::Util::AbstractConfiguration::Keys role_names; + config.keys("roles", role_names); + + std::vector roles; + roles.reserve(role_names.size()); + for (const auto & role_name : role_names) + { + try + { + roles.push_back(parseRole(config, role_name, allowed_role_ids)); + } + catch (Exception & e) + { + e.addMessage(fmt::format("while parsing roles '{}' in users configuration file", role_name)); + throw; + } + } + + return roles; + } + QuotaPtr parseQuota(const Poco::Util::AbstractConfiguration & config, const String & quota_name, const std::vector & user_ids) { @@ -635,14 +711,16 @@ namespace return profiles; } - - std::unordered_set getAllowedSettingsProfileIDs(const Poco::Util::AbstractConfiguration & config) + std::unordered_set getAllowedIDs( + const Poco::Util::AbstractConfiguration & config, + const String & configuration_key, + const AccessEntityType type) { - Poco::Util::AbstractConfiguration::Keys profile_names; - config.keys("profiles", profile_names); + Poco::Util::AbstractConfiguration::Keys keys; + config.keys(configuration_key, keys); std::unordered_set ids; - for (const auto & profile_name : profile_names) - ids.emplace(generateID(AccessEntityType::SETTINGS_PROFILE, profile_name)); + for (const auto & key : keys) + ids.emplace(generateID(type, key)); return ids; } } @@ -693,12 +771,13 @@ void UsersConfigAccessStorage::parseFromConfig(const Poco::Util::AbstractConfigu { try { - auto allowed_profile_ids = getAllowedSettingsProfileIDs(config); + auto allowed_profile_ids = getAllowedIDs(config, "profiles", AccessEntityType::SETTINGS_PROFILE); + auto allowed_role_ids = getAllowedIDs(config, "roles", AccessEntityType::ROLE); bool no_password_allowed = access_control.isNoPasswordAllowed(); bool plaintext_password_allowed = access_control.isPlaintextPasswordAllowed(); std::vector> all_entities; - for (const auto & entity : parseUsers(config, allowed_profile_ids, no_password_allowed, plaintext_password_allowed)) + for (const auto & entity : parseUsers(config, allowed_profile_ids, allowed_role_ids, no_password_allowed, plaintext_password_allowed)) all_entities.emplace_back(generateID(*entity), entity); for (const auto & entity : parseQuotas(config)) all_entities.emplace_back(generateID(*entity), entity); @@ -706,6 +785,8 @@ void UsersConfigAccessStorage::parseFromConfig(const Poco::Util::AbstractConfigu all_entities.emplace_back(generateID(*entity), entity); for (const auto & entity : parseSettingsProfiles(config, allowed_profile_ids, access_control)) all_entities.emplace_back(generateID(*entity), entity); + for (const auto & entity : parseRoles(config, allowed_role_ids)) + all_entities.emplace_back(generateID(*entity), entity); memory_storage.setAll(all_entities); } catch (Exception & e) diff --git a/src/Parsers/Access/ParserGrantQuery.h b/src/Parsers/Access/ParserGrantQuery.h index 58c2be433d5..0ecfef916f5 100644 --- a/src/Parsers/Access/ParserGrantQuery.h +++ b/src/Parsers/Access/ParserGrantQuery.h @@ -14,7 +14,7 @@ class ParserGrantQuery : public IParserBase public: ParserGrantQuery & useAttachMode(bool attach_mode_ = true) { attach_mode = attach_mode_; return *this; } - ParserGrantQuery & parseWithoutGrantees(bool allow_no_grantees_ = true) { allow_no_grantees = allow_no_grantees_; return *this; } + ParserGrantQuery & setParseWithoutGrantees(bool allow_no_grantees_ = true) { allow_no_grantees = allow_no_grantees_; return *this; } protected: const char * getName() const override { return "GRANT or REVOKE query"; } diff --git a/tests/integration/test_user_grants_from_config/configs/another_user.xml b/tests/integration/test_user_grants_from_config/configs/another_user.xml index 16c026e81d0..0b0b2473142 100644 --- a/tests/integration/test_user_grants_from_config/configs/another_user.xml +++ b/tests/integration/test_user_grants_from_config/configs/another_user.xml @@ -14,5 +14,30 @@ REVOKE CREATE ON system.* + + + + ::/0 + + default + default + + GRANT admin_role + + + + + + GRANT SHOW ON *.* + REVOKE SHOW ON system.* + GRANT CREATE ON *.* WITH GRANT OPTION + + + + + GRANT ALL ON *.* WITH GRANT OPTION + + + diff --git a/tests/integration/test_user_grants_from_config/test.py b/tests/integration/test_user_grants_from_config/test.py index a4d5c0c904b..d2bd0b0facd 100644 --- a/tests/integration/test_user_grants_from_config/test.py +++ b/tests/integration/test_user_grants_from_config/test.py @@ -42,7 +42,7 @@ def test_allow_read_from_system_tables(): ) -def test_grants_from_config(): +def test_user_grants_from_config(): assert node.query("SHOW GRANTS FOR another") == TSV( [ "GRANT SHOW ON *.* TO another", @@ -51,3 +51,19 @@ def test_grants_from_config(): "REVOKE CREATE DATABASE, CREATE TABLE, CREATE VIEW, CREATE DICTIONARY ON system.* FROM another", ] ) + + assert node.query("SHOW GRANTS FOR admin_user") == TSV( + [ + "GRANT admin_role TO admin_user", + ] + ) + + +def test_role_grants_from_config(): + assert node.query("SHOW GRANTS FOR test_role") == TSV( + [ + "GRANT SHOW ON *.* TO test_role", + "GRANT CREATE ON *.* TO test_role WITH GRANT OPTION", + "REVOKE SHOW ON system.* FROM test_role", + ] + ) From 65586c50f500dd5daa51de60087c54acfdf5f914 Mon Sep 17 00:00:00 2001 From: Manas Alekar Date: Wed, 31 May 2023 00:21:13 -0700 Subject: [PATCH 0177/1072] Minor improvements in CGroup awareness. 1. Support CGroup2 in getMemoryAmountOrZero(). 2. Report CFS period and quota in asynchronous metric log. --- base/base/getMemoryAmount.cpp | 22 +++++++++--- src/Common/AsynchronousMetrics.cpp | 56 ++++++++++++++++++++++++++++-- src/Common/AsynchronousMetrics.h | 3 ++ 3 files changed, 75 insertions(+), 6 deletions(-) diff --git a/base/base/getMemoryAmount.cpp b/base/base/getMemoryAmount.cpp index 9e1d2ac3279..6a5470a0549 100644 --- a/base/base/getMemoryAmount.cpp +++ b/base/base/getMemoryAmount.cpp @@ -28,14 +28,28 @@ uint64_t getMemoryAmountOrZero() #if defined(OS_LINUX) // Try to lookup at the Cgroup limit - std::ifstream cgroup_limit("/sys/fs/cgroup/memory/memory.limit_in_bytes"); - if (cgroup_limit.is_open()) + + // v2 + std::ifstream cgroupv2_limit("/sys/fs/cgroup/memory.max"); + if (cgroupv2_limit.is_open()) { - uint64_t memory_limit = 0; // in case of read error - cgroup_limit >> memory_limit; + uint64_t memory_limit = 0; + cgroupv2_limit >> memory_limit; if (memory_limit > 0 && memory_limit < memory_amount) memory_amount = memory_limit; } + else + { + // v1 + std::ifstream cgroup_limit("/sys/fs/cgroup/memory/memory.limit_in_bytes"); + if (cgroup_limit.is_open()) + { + uint64_t memory_limit = 0; // in case of read error + cgroup_limit >> memory_limit; + if (memory_limit > 0 && memory_limit < memory_amount) + memory_amount = memory_limit; + } + } #endif return memory_amount; diff --git a/src/Common/AsynchronousMetrics.cpp b/src/Common/AsynchronousMetrics.cpp index ac2180103c5..e1e99a3f7c7 100644 --- a/src/Common/AsynchronousMetrics.cpp +++ b/src/Common/AsynchronousMetrics.cpp @@ -69,13 +69,23 @@ AsynchronousMetrics::AsynchronousMetrics( /// CGroups v2 openFileIfExists("/sys/fs/cgroup/memory.max", cgroupmem_limit_in_bytes); - openFileIfExists("/sys/fs/cgroup/memory.current", cgroupmem_usage_in_bytes); + if (cgroupmem_limit_in_bytes) + { + openFileIfExists("/sys/fs/cgroup/memory.current", cgroupmem_usage_in_bytes); + } + openFileIfExists("/sys/fs/cgroup/cpu.max", cgroupcpu_max); /// CGroups v1 if (!cgroupmem_limit_in_bytes) + { openFileIfExists("/sys/fs/cgroup/memory/memory.limit_in_bytes", cgroupmem_limit_in_bytes); - if (!cgroupmem_usage_in_bytes) openFileIfExists("/sys/fs/cgroup/memory/memory.usage_in_bytes", cgroupmem_usage_in_bytes); + } + if (!cgroupcpu_max) + { + openFileIfExists("/sys/fs/cgroup/cpu/cpu.cfs_period_us", cgroupcpu_cfs_period); + openFileIfExists("/sys/fs/cgroup/cpu/cpu.cfs_quota_us", cgroupcpu_cfs_quota); + } openSensors(); openBlockDevices(); @@ -926,6 +936,48 @@ void AsynchronousMetrics::update(TimePoint update_time) tryLogCurrentException(__PRETTY_FUNCTION__); } } + + if (cgroupcpu_max) + { + try { + cgroupcpu_max->rewind(); + + uint64_t quota = 0; + uint64_t period = 0; + + readText(quota, *cgroupcpu_max); + skipWhitespaceIfAny(*cgroupcpu_max); + readText(period, *cgroupcpu_max); + + new_values["CGroupCpuCfsPeriod"] = { period, "The CFS period of CPU cgroup."}; + new_values["CGroupCpuCfsQuota"] = { quota, "The CFS quota of CPU cgroup."}; + } + catch (...) + { + tryLogCurrentException(__PRETTY_FUNCTION__); + } + } + else if (cgroupcpu_cfs_quota && cgroupcpu_cfs_period) + { + try { + cgroupcpu_cfs_quota->rewind(); + cgroupcpu_cfs_period->rewind(); + + uint64_t quota = 0; + uint64_t period = 0; + + tryReadText(quota, *cgroupcpu_cfs_quota); + tryReadText(period, *cgroupcpu_cfs_period); + + new_values["CGroupCpuCfsPeriod"] = { period, "The CFS period of CPU cgroup."}; + new_values["CGroupCpuCfsQuota"] = { quota, "The CFS quota of CPU cgroup."}; + } + catch (...) + { + tryLogCurrentException(__PRETTY_FUNCTION__); + } + } + if (meminfo) { try diff --git a/src/Common/AsynchronousMetrics.h b/src/Common/AsynchronousMetrics.h index d104b872f52..e3b5142553b 100644 --- a/src/Common/AsynchronousMetrics.h +++ b/src/Common/AsynchronousMetrics.h @@ -110,6 +110,9 @@ private: std::optional cgroupmem_limit_in_bytes; std::optional cgroupmem_usage_in_bytes; + std::optional cgroupcpu_cfs_period; + std::optional cgroupcpu_cfs_quota; + std::optional cgroupcpu_max; std::vector> thermal; From 8ce56dfe4616dcaf2638ffc8f1ef7c35718fea06 Mon Sep 17 00:00:00 2001 From: lihaibo42 Date: Wed, 31 May 2023 17:53:00 +0800 Subject: [PATCH 0178/1072] Link boost::context library to clickhouse_common_io --- src/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 622e18d4ff7..e1359a5a8aa 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -346,6 +346,7 @@ target_link_libraries(clickhouse_common_io PUBLIC boost::program_options boost::system + boost::context ch_contrib::cityhash ch_contrib::re2 ch_contrib::re2_st From e6e420da5517e61f2940cc9b349a62ed192e9822 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Wed, 31 May 2023 13:00:55 +0200 Subject: [PATCH 0179/1072] Add no-fasttest tag --- tests/queries/0_stateless/02771_skip_empty_files.sh | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tests/queries/0_stateless/02771_skip_empty_files.sh b/tests/queries/0_stateless/02771_skip_empty_files.sh index 99f43d7868a..2d1dc205dcd 100755 --- a/tests/queries/0_stateless/02771_skip_empty_files.sh +++ b/tests/queries/0_stateless/02771_skip_empty_files.sh @@ -1,4 +1,5 @@ #!/usr/bin/env bash +# Tags: no-fasttest CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh @@ -16,9 +17,4 @@ $CLICKHOUSE_LOCAL -q "select * from file('$FILE_PREFIX-*.parquet', auto, 'number $CLICKHOUSE_LOCAL -q "select * from file('$FILE_PREFIX-*.parquet') settings engine_file_skip_empty_files=1" $CLICKHOUSE_LOCAL -q "select * from file('$FILE_PREFIX-*.parquet', auto, 'number UInt64') settings engine_file_skip_empty_files=1" - - - - - rm $FILE_PREFIX-* From 444ce60aeb4cda9ac2e5c61bbc0ae03cc76cd2b2 Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Wed, 31 May 2023 11:50:25 +0000 Subject: [PATCH 0180/1072] Add tests with explicit cast --- .../01746_convert_type_with_default.reference | 47 +++++++++++++----- .../01746_convert_type_with_default.sql | 48 ++++++++++++++++--- 2 files changed, 77 insertions(+), 18 deletions(-) diff --git a/tests/queries/0_stateless/01746_convert_type_with_default.reference b/tests/queries/0_stateless/01746_convert_type_with_default.reference index 235a88157c8..892a12434b9 100644 --- a/tests/queries/0_stateless/01746_convert_type_with_default.reference +++ b/tests/queries/0_stateless/01746_convert_type_with_default.reference @@ -20,17 +20,40 @@ 2 -1 -2 -1970-01-01 -2023-05-30 -2023-05-30 -2023-05-30 -1970-01-01 -2023-05-30 -2023-05-30 -1970-01-01 -2023-05-30 14:38:20 -2023-05-30 14:38:20 -2023-05-30 14:38:20 -2023-05-30 14:38:20 61f0c404-5cb3-11e7-907b-a6006ad3dba0 59f0c404-5cb3-11e7-907b-a6006ad3dba0 +1970-01-01 +2023-05-30 +2023-05-30 +2023-05-30 +1970-01-01 +1970-01-20 +1970-01-20 +1970-01-20 +1970-01-20 +1970-01-20 +1970-01-20 +1970-01-20 +1970-01-20 +1970-01-20 +1970-01-20 +1970-01-20 +1970-01-20 +2023-05-30 +1970-01-01 +2023-05-30 14:38:20 +2023-05-30 14:38:20 +2023-05-30 14:38:20 +2023-05-30 14:38:20 +1970-01-01 00:00:19 +1970-01-01 00:00:19 +1970-01-01 00:00:19 +1970-01-01 00:00:19 +1970-01-01 00:00:19 +1970-01-01 00:00:19 +1970-01-01 00:00:19 +1970-01-01 00:00:19 +1970-01-01 00:00:19 +1970-01-01 00:00:19 +1970-01-01 00:00:19 +1970-01-01 00:00:19 diff --git a/tests/queries/0_stateless/01746_convert_type_with_default.sql b/tests/queries/0_stateless/01746_convert_type_with_default.sql index 40e4798721b..9fdd92491a7 100644 --- a/tests/queries/0_stateless/01746_convert_type_with_default.sql +++ b/tests/queries/0_stateless/01746_convert_type_with_default.sql @@ -26,19 +26,55 @@ select toUInt256OrDefault('1xx', cast(2 as UInt256)); select toInt256OrDefault('-1', cast(-2 as Int256)); select toInt256OrDefault('-1xx', cast(-2 as Int256)); -select toDateOrDefault('2020-0x-02'); +SELECT toUUIDOrDefault('61f0c404-5cb3-11e7-907b-a6006ad3dba0', cast('59f0c404-5cb3-11e7-907b-a6006ad3dba0' as UUID)); +SELECT toUUIDOrDefault('-----61f0c404-5cb3-11e7-907b-a6006ad3dba0', cast('59f0c404-5cb3-11e7-907b-a6006ad3dba0' as UUID)); + +select toDateOrDefault('1xxx'); select toDateOrDefault('2023-05-30'); select toDateOrDefault('2023-05-30', '2000-01-01'::Date); -select toDateOrDefault('2020-0x-02', '2023-05-30'::Date); +select toDateOrDefault('1xx', '2023-05-30'::Date); select toDateOrDefault(-1); -select toDateOrDefault(19507); + +select toDateOrDefault(cast(19 as Int8)); +select toDateOrDefault(cast(19 as UInt8)); + +select toDateOrDefault(cast(19 as Int16)); +select toDateOrDefault(cast(19 as UInt16)); + +select toDateOrDefault(cast(19 as Int32)); +select toDateOrDefault(cast(19 as UInt32)); + +select toDateOrDefault(cast(19 as Int64)); +select toDateOrDefault(cast(19 as UInt64)); + +select toDateOrDefault(cast(19 as Int128)); +select toDateOrDefault(cast(19 as UInt128)); + +select toDateOrDefault(cast(19 as Int256)); +select toDateOrDefault(cast(19 as UInt256)); + select toDateOrDefault(19507, '2000-01-01'::Date); select toDateOrDefault(-1, '2000-01-01'::Date); select toDateTimeOrDefault('2023-05-30 14:38:20'); select toDateTimeOrDefault('2023-05-30 14:38:20', 'UTC'); -select toString(toDateTimeOrDefault('s2023', 'Asia/Istanbul', '2023-05-30 14:38:20'::DateTime('Asia/Istanbul')), 'Asia/Istanbul'); +select toString(toDateTimeOrDefault('1xxx', 'Asia/Istanbul', '2023-05-30 14:38:20'::DateTime('Asia/Istanbul')), 'Asia/Istanbul'); select toDateTimeOrDefault(1685457500); -SELECT toUUIDOrDefault('61f0c404-5cb3-11e7-907b-a6006ad3dba0', cast('59f0c404-5cb3-11e7-907b-a6006ad3dba0' as UUID)); -SELECT toUUIDOrDefault('-----61f0c404-5cb3-11e7-907b-a6006ad3dba0', cast('59f0c404-5cb3-11e7-907b-a6006ad3dba0' as UUID)); \ No newline at end of file +select toDateTimeOrDefault(cast(19 as Int8)); +select toDateTimeOrDefault(cast(19 as UInt8)); + +select toDateTimeOrDefault(cast(19 as Int16)); +select toDateTimeOrDefault(cast(19 as UInt16)); + +select toDateTimeOrDefault(cast(19 as Int32)); +select toDateTimeOrDefault(cast(19 as UInt32)); + +select toDateTimeOrDefault(cast(19 as Int64)); +select toDateTimeOrDefault(cast(19 as UInt64)); + +select toDateTimeOrDefault(cast(19 as Int128)); +select toDateTimeOrDefault(cast(19 as UInt128)); + +select toDateTimeOrDefault(cast(19 as Int256)); +select toDateTimeOrDefault(cast(19 as UInt256)); \ No newline at end of file From 74dc37cf610746a8814e3cc3195c44f2f3926650 Mon Sep 17 00:00:00 2001 From: kssenii Date: Wed, 31 May 2023 14:15:28 +0200 Subject: [PATCH 0181/1072] Fix assertion --- src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp b/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp index e244f61ae2e..dc4a2599d1d 100644 --- a/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp +++ b/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp @@ -925,16 +925,16 @@ bool CachedOnDiskReadBufferFromFile::nextImplStep() LOG_TEST( log, - "Read {} bytes, read type {}, position: {}, offset: {}, segment end: {}", - size, toString(read_type), implementation_buffer->getPosition(), - implementation_buffer->getFileOffsetOfBufferEnd(), file_segment.range().right); + "Read {} bytes, read type {}, file offset: {}, impl offset: {}/{}, segment: {}", + size, toString(read_type), file_offset_of_buffer_end, + implementation_buffer->getFileOffsetOfBufferEnd(), read_until_position, file_segment.range().toString()); if (read_type == ReadType::CACHED) { ProfileEvents::increment(ProfileEvents::CachedReadBufferReadFromCacheBytes, size); ProfileEvents::increment(ProfileEvents::CachedReadBufferReadFromCacheMicroseconds, elapsed); - chassert(file_offset_of_buffer_end + size <= file_segment.range().size()); + chassert(file_offset_of_buffer_end + size - 1 <= file_segment.range().right); } else { From 7d077f6130c61b529b13403b8a744220002f2ef0 Mon Sep 17 00:00:00 2001 From: kssenii Date: Wed, 31 May 2023 14:30:27 +0200 Subject: [PATCH 0182/1072] Add one more assertion --- src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp b/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp index dc4a2599d1d..5cb9d3bbf6f 100644 --- a/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp +++ b/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp @@ -934,7 +934,9 @@ bool CachedOnDiskReadBufferFromFile::nextImplStep() ProfileEvents::increment(ProfileEvents::CachedReadBufferReadFromCacheBytes, size); ProfileEvents::increment(ProfileEvents::CachedReadBufferReadFromCacheMicroseconds, elapsed); - chassert(file_offset_of_buffer_end + size - 1 <= file_segment.range().right); + [[maybe_unused]] size_t new_file_offset = file_offset_of_buffer_end + size; + chassert(new_file_offset - 1 <= file_segment.range().right); + chassert(new_file_offset <= file_segment.getCurrentWriteOffset(true)); } else { From a59effcc88314cfd8e4c7972c67d2dffde2114bb Mon Sep 17 00:00:00 2001 From: kssenii Date: Wed, 31 May 2023 14:40:13 +0200 Subject: [PATCH 0183/1072] Minor improvements --- .../IO/CachedOnDiskReadBufferFromFile.cpp | 52 +++++++++---------- 1 file changed, 25 insertions(+), 27 deletions(-) diff --git a/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp b/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp index 5cb9d3bbf6f..202914a0774 100644 --- a/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp +++ b/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp @@ -51,8 +51,8 @@ CachedOnDiskReadBufferFromFile::CachedOnDiskReadBufferFromFile( std::optional read_until_position_, std::shared_ptr cache_log_) : ReadBufferFromFileBase(use_external_buffer_ ? 0 : settings_.remote_fs_buffer_size, nullptr, 0, file_size_) -#ifndef NDEBUG - , log(&Poco::Logger::get("CachedOnDiskReadBufferFromFile(" + source_file_path_ + ")")) +#ifdef ABORT_ON_LOGICAL_ERROR + , log(&Poco::Logger::get(fmt::format("CachedOnDiskReadBufferFromFile({})", cache_key_))) #else , log(&Poco::Logger::get("CachedOnDiskReadBufferFromFile")) #endif @@ -75,6 +75,9 @@ CachedOnDiskReadBufferFromFile::CachedOnDiskReadBufferFromFile( void CachedOnDiskReadBufferFromFile::appendFilesystemCacheLog( const FileSegment::Range & file_segment_range, CachedOnDiskReadBufferFromFile::ReadType type) { + if (!cache_log) + return; + FilesystemCacheLogElement elem { .event_time = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()), @@ -104,8 +107,7 @@ void CachedOnDiskReadBufferFromFile::appendFilesystemCacheLog( break; } - if (cache_log) - cache_log->add(elem); + cache_log->add(elem); } void CachedOnDiskReadBufferFromFile::initialize(size_t offset, size_t size) @@ -411,7 +413,7 @@ CachedOnDiskReadBufferFromFile::getImplementationBuffer(FileSegment & file_segme { case ReadType::CACHED: { -#ifndef NDEBUG +#ifdef ABORT_ON_LOGICAL_ERROR size_t file_size = getFileSizeFromReadBuffer(*read_buffer_for_file_segment); if (file_size == 0 || range.left + file_size <= file_offset_of_buffer_end) throw Exception( @@ -456,7 +458,7 @@ CachedOnDiskReadBufferFromFile::getImplementationBuffer(FileSegment & file_segme { read_buffer_for_file_segment->seek(file_offset_of_buffer_end, SEEK_SET); - assert(read_buffer_for_file_segment->getFileOffsetOfBufferEnd() == file_offset_of_buffer_end); + chassert(read_buffer_for_file_segment->getFileOffsetOfBufferEnd() == file_offset_of_buffer_end); } const auto current_write_offset = file_segment.getCurrentWriteOffset(false); @@ -887,28 +889,24 @@ bool CachedOnDiskReadBufferFromFile::nextImplStep() if (!result) { - auto debug_check = [&]() +#ifdef ABORT_ON_LOGICAL_ERROR + if (read_type == ReadType::CACHED) { - if (read_type == ReadType::CACHED) + size_t cache_file_size = getFileSizeFromReadBuffer(*implementation_buffer); + if (cache_file_size == 0) { - size_t cache_file_size = getFileSizeFromReadBuffer(*implementation_buffer); - if (cache_file_size == 0) - { - throw Exception( - ErrorCodes::LOGICAL_ERROR, - "Attempt to read from an empty cache file: {} (just before actual read)", - cache_file_size); - } + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Attempt to read from an empty cache file: {} (just before actual read)", + cache_file_size); } - else - { - chassert(file_offset_of_buffer_end == static_cast(implementation_buffer->getFileOffsetOfBufferEnd())); - } - chassert(!implementation_buffer->hasPendingData()); - return true; - }; - - chassert(debug_check()); + } + else + { + chassert(file_offset_of_buffer_end == static_cast(implementation_buffer->getFileOffsetOfBufferEnd())); + } + chassert(!implementation_buffer->hasPendingData()); +#endif Stopwatch watch(CLOCK_MONOTONIC); @@ -1093,8 +1091,8 @@ off_t CachedOnDiskReadBufferFromFile::seek(off_t offset, int whence) if (file_offset_of_buffer_end - working_buffer.size() <= new_pos && new_pos <= file_offset_of_buffer_end) { pos = working_buffer.end() - file_offset_of_buffer_end + new_pos; - assert(pos >= working_buffer.begin()); - assert(pos <= working_buffer.end()); + chassert(pos >= working_buffer.begin()); + chassert(pos <= working_buffer.end()); return new_pos; } } From dbae50b6db6f885daef4d97f00d3ef81b6104741 Mon Sep 17 00:00:00 2001 From: kssenii Date: Wed, 31 May 2023 14:50:59 +0200 Subject: [PATCH 0184/1072] Better logging --- .../IO/CachedOnDiskReadBufferFromFile.cpp | 21 +++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp b/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp index 202914a0774..a60f5dffa96 100644 --- a/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp +++ b/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp @@ -365,8 +365,8 @@ CachedOnDiskReadBufferFromFile::getReadBufferForFileSegment(FileSegment & file_s else { LOG_TRACE( - log, - "Bypassing cache because file segment state is `PARTIALLY_DOWNLOADED_NO_CONTINUATION` and downloaded part already used"); + log, "Bypassing cache because file segment state is " + "`PARTIALLY_DOWNLOADED_NO_CONTINUATION` and downloaded part already used"); read_type = ReadType::REMOTE_FS_READ_BYPASS_CACHE; return getRemoteReadBuffer(file_segment, read_type); } @@ -466,8 +466,8 @@ CachedOnDiskReadBufferFromFile::getImplementationBuffer(FileSegment & file_segme { throw Exception( ErrorCodes::LOGICAL_ERROR, - "Buffer's offsets mismatch. Cached buffer offset: {}, current_write_offset: {}, implementation buffer position: {}, " - "implementation buffer end position: {}, file segment info: {}", + "Buffer's offsets mismatch. Cached buffer offset: {}, current_write_offset: {}, " + "implementation buffer position: {}, implementation buffer end position: {}, file segment info: {}", file_offset_of_buffer_end, current_write_offset, read_buffer_for_file_segment->getPosition(), @@ -932,9 +932,18 @@ bool CachedOnDiskReadBufferFromFile::nextImplStep() ProfileEvents::increment(ProfileEvents::CachedReadBufferReadFromCacheBytes, size); ProfileEvents::increment(ProfileEvents::CachedReadBufferReadFromCacheMicroseconds, elapsed); - [[maybe_unused]] size_t new_file_offset = file_offset_of_buffer_end + size; +#ifdef ABORT_ON_LOGICAL_ERROR + const size_t new_file_offset = file_offset_of_buffer_end + size; chassert(new_file_offset - 1 <= file_segment.range().right); - chassert(new_file_offset <= file_segment.getCurrentWriteOffset(true)); + const size_t file_segment_write_offset = file_segment.getCurrentWriteOffset(true); + if (new_file_offset > file_segment_write_offset) + { + LOG_TRACE( + log, "Read {} bytes, file offset: {}, segment: {}, segment write offset: {}", + size, file_offset_of_buffer_end, file_segment.range().toString(), file_segment_write_offset); + chassert(false); + } +#endif } else { From a96ee7411b0fdd28d9a77d127f74848b889a73f6 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Wed, 31 May 2023 14:28:36 +0000 Subject: [PATCH 0185/1072] Bump From 3936c4dc52f24b477cfb5bf4821bb17626bd69df Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Wed, 31 May 2023 16:41:26 +0000 Subject: [PATCH 0186/1072] Try to fix fast tests (add timezone) --- .../01746_convert_type_with_default.sql | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/tests/queries/0_stateless/01746_convert_type_with_default.sql b/tests/queries/0_stateless/01746_convert_type_with_default.sql index 9fdd92491a7..75e1510f330 100644 --- a/tests/queries/0_stateless/01746_convert_type_with_default.sql +++ b/tests/queries/0_stateless/01746_convert_type_with_default.sql @@ -58,23 +58,23 @@ select toDateOrDefault(-1, '2000-01-01'::Date); select toDateTimeOrDefault('2023-05-30 14:38:20'); select toDateTimeOrDefault('2023-05-30 14:38:20', 'UTC'); -select toString(toDateTimeOrDefault('1xxx', 'Asia/Istanbul', '2023-05-30 14:38:20'::DateTime('Asia/Istanbul')), 'Asia/Istanbul'); -select toDateTimeOrDefault(1685457500); +select toDateTimeOrDefault('1xxx', 'UTC', '2023-05-30 14:38:20'::DateTime('UTC')); +select toDateTimeOrDefault(1685457500, 'UTC'); -select toDateTimeOrDefault(cast(19 as Int8)); -select toDateTimeOrDefault(cast(19 as UInt8)); +select toDateTimeOrDefault(cast(19 as Int8), 'UTC'); +select toDateTimeOrDefault(cast(19 as UInt8), 'UTC'); -select toDateTimeOrDefault(cast(19 as Int16)); -select toDateTimeOrDefault(cast(19 as UInt16)); +select toDateTimeOrDefault(cast(19 as Int16), 'UTC'); +select toDateTimeOrDefault(cast(19 as UInt16), 'UTC'); -select toDateTimeOrDefault(cast(19 as Int32)); -select toDateTimeOrDefault(cast(19 as UInt32)); +select toDateTimeOrDefault(cast(19 as Int32), 'UTC'); +select toDateTimeOrDefault(cast(19 as UInt32), 'UTC'); -select toDateTimeOrDefault(cast(19 as Int64)); -select toDateTimeOrDefault(cast(19 as UInt64)); +select toDateTimeOrDefault(cast(19 as Int64), 'UTC'); +select toDateTimeOrDefault(cast(19 as UInt64), 'UTC'); -select toDateTimeOrDefault(cast(19 as Int128)); -select toDateTimeOrDefault(cast(19 as UInt128)); +select toDateTimeOrDefault(cast(19 as Int128), 'UTC'); +select toDateTimeOrDefault(cast(19 as UInt128), 'UTC'); -select toDateTimeOrDefault(cast(19 as Int256)); -select toDateTimeOrDefault(cast(19 as UInt256)); \ No newline at end of file +select toDateTimeOrDefault(cast(19 as Int256), 'UTC'); +select toDateTimeOrDefault(cast(19 as UInt256), 'UTC'); \ No newline at end of file From 8c1f579c481747c1257c53adf674493aae35b2c1 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Wed, 31 May 2023 16:41:30 +0000 Subject: [PATCH 0187/1072] Add google-protobuf submodule --- .gitmodules | 3 +++ contrib/google-protobuf | 1 + 2 files changed, 4 insertions(+) create mode 160000 contrib/google-protobuf diff --git a/.gitmodules b/.gitmodules index f0984fec4db..30777a42a9d 100644 --- a/.gitmodules +++ b/.gitmodules @@ -344,3 +344,6 @@ [submodule "contrib/isa-l"] path = contrib/isa-l url = https://github.com/ClickHouse/isa-l.git +[submodule "contrib/google-protobuf"] + path = contrib/google-protobuf + url = https://github.com/ClickHouse/google-protobuf.git diff --git a/contrib/google-protobuf b/contrib/google-protobuf new file mode 160000 index 00000000000..315ffb5be89 --- /dev/null +++ b/contrib/google-protobuf @@ -0,0 +1 @@ +Subproject commit 315ffb5be89460f2857387d20aefc59b76b8bdc3 From 8c3e256caedbbf1dae3bd52cdddbd1b1a315e8ee Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Wed, 31 May 2023 16:42:56 +0000 Subject: [PATCH 0188/1072] Switch protobuf to v3.18.x --- contrib/google-protobuf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/google-protobuf b/contrib/google-protobuf index 315ffb5be89..3b3d8fe1913 160000 --- a/contrib/google-protobuf +++ b/contrib/google-protobuf @@ -1 +1 @@ -Subproject commit 315ffb5be89460f2857387d20aefc59b76b8bdc3 +Subproject commit 3b3d8fe191314ea903ea6b072f0e73ef18e15faa From 7d8c1ff3cca5a59749b839e7fe23dc1e3bd9cac8 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Wed, 31 May 2023 16:43:36 +0000 Subject: [PATCH 0189/1072] Move protobuf entry in .gitmodules --- .gitmodules | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.gitmodules b/.gitmodules index 30777a42a9d..d28f205b65c 100644 --- a/.gitmodules +++ b/.gitmodules @@ -39,6 +39,9 @@ path = contrib/protobuf url = https://github.com/ClickHouse/protobuf branch = v3.13.0.1 +[submodule "contrib/google-protobuf"] + path = contrib/google-protobuf + url = https://github.com/ClickHouse/google-protobuf.git [submodule "contrib/boost"] path = contrib/boost url = https://github.com/ClickHouse/boost @@ -344,6 +347,3 @@ [submodule "contrib/isa-l"] path = contrib/isa-l url = https://github.com/ClickHouse/isa-l.git -[submodule "contrib/google-protobuf"] - path = contrib/google-protobuf - url = https://github.com/ClickHouse/google-protobuf.git From ab6fe946bd4554d08f70808ee4c13699e9862069 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Wed, 31 May 2023 16:47:26 +0000 Subject: [PATCH 0190/1072] Switch build to google-protobuf-cmake --- contrib/CMakeLists.txt | 4 +- contrib/google-protobuf-cmake/CMakeLists.txt | 329 ++++++++++++++++++ .../protobuf_generate.cmake | 198 +++++++++++ 3 files changed, 529 insertions(+), 2 deletions(-) create mode 100644 contrib/google-protobuf-cmake/CMakeLists.txt create mode 100644 contrib/google-protobuf-cmake/protobuf_generate.cmake diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt index 020fe1e1c5a..4a4ff9982ea 100644 --- a/contrib/CMakeLists.txt +++ b/contrib/CMakeLists.txt @@ -88,7 +88,7 @@ add_contrib (thrift-cmake thrift) # parquet/arrow/orc add_contrib (arrow-cmake arrow) # requires: snappy, thrift, double-conversion add_contrib (avro-cmake avro) # requires: snappy -add_contrib (protobuf-cmake protobuf) +add_contrib (google-protobuf-cmake google-protobuf) add_contrib (openldap-cmake openldap) add_contrib (grpc-cmake grpc) add_contrib (msgpack-c-cmake msgpack-c) @@ -156,7 +156,7 @@ add_contrib (libgsasl-cmake libgsasl) # requires krb5 add_contrib (librdkafka-cmake librdkafka) # requires: libgsasl add_contrib (nats-io-cmake nats-io) add_contrib (isa-l-cmake isa-l) -add_contrib (libhdfs3-cmake libhdfs3) # requires: protobuf, krb5, isa-l +add_contrib (libhdfs3-cmake libhdfs3) # requires: google-protobuf, krb5, isa-l add_contrib (hive-metastore-cmake hive-metastore) # requires: thrift/avro/arrow/libhdfs3 add_contrib (cppkafka-cmake cppkafka) add_contrib (libpqxx-cmake libpqxx) diff --git a/contrib/google-protobuf-cmake/CMakeLists.txt b/contrib/google-protobuf-cmake/CMakeLists.txt new file mode 100644 index 00000000000..e2d38acb51d --- /dev/null +++ b/contrib/google-protobuf-cmake/CMakeLists.txt @@ -0,0 +1,329 @@ +option(ENABLE_PROTOBUF "Enable protobuf" ${ENABLE_LIBRARIES}) + +if(NOT ENABLE_PROTOBUF) + message(STATUS "Not using protobuf") + return() +endif() + +set(Protobuf_INCLUDE_DIR "${ClickHouse_SOURCE_DIR}/contrib/google-protobuf/src") +if(OS_FREEBSD AND SANITIZE STREQUAL "address") + # ../contrib/protobuf/src/google/protobuf/arena_impl.h:45:10: fatal error: 'sanitizer/asan_interface.h' file not found + # #include + if(LLVM_INCLUDE_DIRS) + set(Protobuf_INCLUDE_DIR "${Protobuf_INCLUDE_DIR}" ${LLVM_INCLUDE_DIRS}) + else() + message(${RECONFIGURE_MESSAGE_LEVEL} "Can't use protobuf on FreeBSD with address sanitizer without LLVM") + return() + endif() +endif() + +set(protobuf_source_dir "${ClickHouse_SOURCE_DIR}/contrib/google-protobuf") +set(protobuf_binary_dir "${ClickHouse_BINARY_DIR}/contrib/google-protobuf") + + +add_definitions(-DGOOGLE_PROTOBUF_CMAKE_BUILD) + +add_definitions(-DHAVE_PTHREAD) +add_definitions(-DHAVE_ZLIB) + +include_directories( + ${protobuf_binary_dir} + ${protobuf_source_dir}/src) + +set(libprotobuf_lite_files + ${protobuf_source_dir}/src/google/protobuf/any_lite.cc + ${protobuf_source_dir}/src/google/protobuf/arena.cc + ${protobuf_source_dir}/src/google/protobuf/arenastring.cc + ${protobuf_source_dir}/src/google/protobuf/extension_set.cc + ${protobuf_source_dir}/src/google/protobuf/field_access_listener.cc + ${protobuf_source_dir}/src/google/protobuf/generated_enum_util.cc + ${protobuf_source_dir}/src/google/protobuf/generated_message_table_driven_lite.cc + ${protobuf_source_dir}/src/google/protobuf/generated_message_util.cc + ${protobuf_source_dir}/src/google/protobuf/implicit_weak_message.cc + ${protobuf_source_dir}/src/google/protobuf/io/coded_stream.cc + ${protobuf_source_dir}/src/google/protobuf/io/io_win32.cc + ${protobuf_source_dir}/src/google/protobuf/io/strtod.cc + ${protobuf_source_dir}/src/google/protobuf/io/zero_copy_stream.cc + ${protobuf_source_dir}/src/google/protobuf/io/zero_copy_stream_impl.cc + ${protobuf_source_dir}/src/google/protobuf/io/zero_copy_stream_impl_lite.cc + ${protobuf_source_dir}/src/google/protobuf/map.cc + ${protobuf_source_dir}/src/google/protobuf/message_lite.cc + ${protobuf_source_dir}/src/google/protobuf/parse_context.cc + ${protobuf_source_dir}/src/google/protobuf/repeated_field.cc + ${protobuf_source_dir}/src/google/protobuf/stubs/bytestream.cc + ${protobuf_source_dir}/src/google/protobuf/stubs/common.cc + ${protobuf_source_dir}/src/google/protobuf/stubs/int128.cc + ${protobuf_source_dir}/src/google/protobuf/stubs/status.cc + ${protobuf_source_dir}/src/google/protobuf/stubs/statusor.cc + ${protobuf_source_dir}/src/google/protobuf/stubs/stringpiece.cc + ${protobuf_source_dir}/src/google/protobuf/stubs/stringprintf.cc + ${protobuf_source_dir}/src/google/protobuf/stubs/structurally_valid.cc + ${protobuf_source_dir}/src/google/protobuf/stubs/strutil.cc + ${protobuf_source_dir}/src/google/protobuf/stubs/time.cc + ${protobuf_source_dir}/src/google/protobuf/wire_format_lite.cc +) + +add_library(_libprotobuf-lite ${libprotobuf_lite_files}) +target_link_libraries(_libprotobuf-lite pthread) +if(${CMAKE_SYSTEM_NAME} STREQUAL "Android") + target_link_libraries(_libprotobuf-lite log) +endif() +target_include_directories(_libprotobuf-lite SYSTEM PUBLIC ${protobuf_source_dir}/src) +add_library(protobuf::libprotobuf-lite ALIAS _libprotobuf-lite) + + +set(libprotobuf_files + ${protobuf_source_dir}/src/google/protobuf/any.cc + ${protobuf_source_dir}/src/google/protobuf/any.pb.cc + ${protobuf_source_dir}/src/google/protobuf/api.pb.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/importer.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/parser.cc + ${protobuf_source_dir}/src/google/protobuf/descriptor.cc + ${protobuf_source_dir}/src/google/protobuf/descriptor.pb.cc + ${protobuf_source_dir}/src/google/protobuf/descriptor_database.cc + ${protobuf_source_dir}/src/google/protobuf/duration.pb.cc + ${protobuf_source_dir}/src/google/protobuf/dynamic_message.cc + ${protobuf_source_dir}/src/google/protobuf/empty.pb.cc + ${protobuf_source_dir}/src/google/protobuf/extension_set_heavy.cc + ${protobuf_source_dir}/src/google/protobuf/field_mask.pb.cc + ${protobuf_source_dir}/src/google/protobuf/generated_message_reflection.cc + ${protobuf_source_dir}/src/google/protobuf/generated_message_table_driven.cc + ${protobuf_source_dir}/src/google/protobuf/io/gzip_stream.cc + ${protobuf_source_dir}/src/google/protobuf/io/printer.cc + ${protobuf_source_dir}/src/google/protobuf/io/tokenizer.cc + ${protobuf_source_dir}/src/google/protobuf/map_field.cc + ${protobuf_source_dir}/src/google/protobuf/message.cc + ${protobuf_source_dir}/src/google/protobuf/reflection_ops.cc + ${protobuf_source_dir}/src/google/protobuf/service.cc + ${protobuf_source_dir}/src/google/protobuf/source_context.pb.cc + ${protobuf_source_dir}/src/google/protobuf/struct.pb.cc + ${protobuf_source_dir}/src/google/protobuf/stubs/substitute.cc + ${protobuf_source_dir}/src/google/protobuf/text_format.cc + ${protobuf_source_dir}/src/google/protobuf/timestamp.pb.cc + ${protobuf_source_dir}/src/google/protobuf/type.pb.cc + ${protobuf_source_dir}/src/google/protobuf/unknown_field_set.cc + ${protobuf_source_dir}/src/google/protobuf/util/delimited_message_util.cc + ${protobuf_source_dir}/src/google/protobuf/util/field_comparator.cc + ${protobuf_source_dir}/src/google/protobuf/util/field_mask_util.cc + ${protobuf_source_dir}/src/google/protobuf/util/internal/datapiece.cc + ${protobuf_source_dir}/src/google/protobuf/util/internal/default_value_objectwriter.cc + ${protobuf_source_dir}/src/google/protobuf/util/internal/error_listener.cc + ${protobuf_source_dir}/src/google/protobuf/util/internal/field_mask_utility.cc + ${protobuf_source_dir}/src/google/protobuf/util/internal/json_escaping.cc + ${protobuf_source_dir}/src/google/protobuf/util/internal/json_objectwriter.cc + ${protobuf_source_dir}/src/google/protobuf/util/internal/json_stream_parser.cc + ${protobuf_source_dir}/src/google/protobuf/util/internal/object_writer.cc + ${protobuf_source_dir}/src/google/protobuf/util/internal/proto_writer.cc + ${protobuf_source_dir}/src/google/protobuf/util/internal/protostream_objectsource.cc + ${protobuf_source_dir}/src/google/protobuf/util/internal/protostream_objectwriter.cc + ${protobuf_source_dir}/src/google/protobuf/util/internal/type_info.cc + ${protobuf_source_dir}/src/google/protobuf/util/internal/type_info_test_helper.cc + ${protobuf_source_dir}/src/google/protobuf/util/internal/utility.cc + ${protobuf_source_dir}/src/google/protobuf/util/json_util.cc + ${protobuf_source_dir}/src/google/protobuf/util/message_differencer.cc + ${protobuf_source_dir}/src/google/protobuf/util/time_util.cc + ${protobuf_source_dir}/src/google/protobuf/util/type_resolver_util.cc + ${protobuf_source_dir}/src/google/protobuf/wire_format.cc + ${protobuf_source_dir}/src/google/protobuf/wrappers.pb.cc +) + +add_library(_libprotobuf ${libprotobuf_lite_files} ${libprotobuf_files}) +if (ENABLE_FUZZING) + target_compile_options(_libprotobuf PRIVATE "-fsanitize-recover=all") +endif() +target_link_libraries(_libprotobuf pthread) +target_link_libraries(_libprotobuf ch_contrib::zlib) +if(${CMAKE_SYSTEM_NAME} STREQUAL "Android") + target_link_libraries(_libprotobuf log) +endif() +target_include_directories(_libprotobuf SYSTEM PUBLIC ${protobuf_source_dir}/src) +add_library(protobuf::libprotobuf ALIAS _libprotobuf) + + +set(libprotoc_files + ${protobuf_source_dir}/src/google/protobuf/compiler/code_generator.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/command_line_interface.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/cpp/cpp_enum.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/cpp/cpp_enum_field.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/cpp/cpp_extension.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/cpp/cpp_field.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/cpp/cpp_file.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/cpp/cpp_generator.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/cpp/cpp_helpers.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/cpp/cpp_map_field.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/cpp/cpp_message.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/cpp/cpp_message_field.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/cpp/cpp_padding_optimizer.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/cpp/cpp_parse_function_generator.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/cpp/cpp_primitive_field.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/cpp/cpp_service.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/cpp/cpp_string_field.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/csharp/csharp_doc_comment.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/csharp/csharp_enum.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/csharp/csharp_enum_field.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/csharp/csharp_field_base.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/csharp/csharp_generator.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/csharp/csharp_helpers.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/csharp/csharp_map_field.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/csharp/csharp_message.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/csharp/csharp_message_field.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/csharp/csharp_primitive_field.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/csharp/csharp_reflection_class.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/csharp/csharp_repeated_enum_field.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/csharp/csharp_repeated_message_field.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/csharp/csharp_repeated_primitive_field.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/csharp/csharp_source_generator_base.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/csharp/csharp_wrapper_field.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/java/java_context.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/java/java_doc_comment.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/java/java_enum.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/java/java_enum_field.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/java/java_enum_field_lite.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/java/java_enum_lite.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/java/java_extension.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/java/java_extension_lite.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/java/java_field.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/java/java_file.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/java/java_generator.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/java/java_generator_factory.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/java/java_helpers.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/java/java_kotlin_generator.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/java/java_map_field.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/java/java_map_field_lite.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/java/java_message.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/java/java_message_builder.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/java/java_message_builder_lite.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/java/java_message_field.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/java/java_message_field_lite.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/java/java_message_lite.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/java/java_name_resolver.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/java/java_primitive_field.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/java/java_primitive_field_lite.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/java/java_service.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/java/java_shared_code_generator.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/java/java_string_field.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/java/java_string_field_lite.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/js/js_generator.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/js/well_known_types_embed.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/objectivec/objectivec_enum.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/objectivec/objectivec_enum_field.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/objectivec/objectivec_extension.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/objectivec/objectivec_field.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/objectivec/objectivec_file.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/objectivec/objectivec_generator.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/objectivec/objectivec_helpers.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/objectivec/objectivec_map_field.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/objectivec/objectivec_message.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/objectivec/objectivec_message_field.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/objectivec/objectivec_oneof.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/objectivec/objectivec_primitive_field.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/php/php_generator.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/plugin.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/plugin.pb.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/python/python_generator.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/ruby/ruby_generator.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/subprocess.cc + ${protobuf_source_dir}/src/google/protobuf/compiler/zip_writer.cc +) + +add_library(_libprotoc ${libprotoc_files}) +target_link_libraries(_libprotoc _libprotobuf) +add_library(protobuf::libprotoc ALIAS _libprotoc) + +set(protoc_files ${protobuf_source_dir}/src/google/protobuf/compiler/main.cc) + +if (CMAKE_HOST_SYSTEM_NAME STREQUAL CMAKE_SYSTEM_NAME + AND CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL CMAKE_SYSTEM_PROCESSOR) + + add_executable(protoc ${protoc_files}) + target_link_libraries(protoc _libprotoc _libprotobuf pthread) + add_executable(protobuf::protoc ALIAS protoc) + + if (ENABLE_FUZZING) + # `protoc` will be built with sanitizer and it could fail during ClickHouse build + # It easily reproduces in oss-fuzz building pipeline + # To avoid this we can try to build `protoc` without any sanitizer with option `-fno-sanitize=all`, but + # it this case we will face with linker errors, because libcxx still will be built with sanitizer + # So, we can simply suppress all of these failures with a combination this flag and an environment variable + # export MSAN_OPTIONS=exit_code=0 + target_compile_options(protoc PRIVATE "-fsanitize-recover=all") + endif() +else () + # Build 'protoc' for host arch + set (PROTOC_BUILD_DIR "${protobuf_binary_dir}/build") + + if (NOT EXISTS "${PROTOC_BUILD_DIR}/protoc") + + # This is quite ugly but I cannot make dependencies work propery. + + execute_process( + COMMAND mkdir -p ${PROTOC_BUILD_DIR} + COMMAND_ECHO STDOUT) + + execute_process( + COMMAND ${CMAKE_COMMAND} + "-G${CMAKE_GENERATOR}" + "-DCMAKE_MAKE_PROGRAM=${CMAKE_MAKE_PROGRAM}" + "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}" + "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" + "-Dprotobuf_BUILD_TESTS=0" + "-Dprotobuf_BUILD_CONFORMANCE=0" + "-Dprotobuf_BUILD_EXAMPLES=0" + "-Dprotobuf_BUILD_PROTOC_BINARIES=1" + "${protobuf_source_dir}/cmake" + WORKING_DIRECTORY "${PROTOC_BUILD_DIR}" + COMMAND_ECHO STDOUT) + + execute_process( + COMMAND ${CMAKE_COMMAND} --build "${PROTOC_BUILD_DIR}" + COMMAND_ECHO STDOUT) + endif () + +# add_custom_command ( +# OUTPUT ${PROTOC_BUILD_DIR} +# COMMAND mkdir -p ${PROTOC_BUILD_DIR}) +# +# add_custom_command ( +# OUTPUT "${PROTOC_BUILD_DIR}/CMakeCache.txt" +# +# COMMAND ${CMAKE_COMMAND} +# -G"${CMAKE_GENERATOR}" +# -DCMAKE_MAKE_PROGRAM="${CMAKE_MAKE_PROGRAM}" +# -DCMAKE_C_COMPILER="${CMAKE_C_COMPILER}" +# -DCMAKE_CXX_COMPILER="${CMAKE_CXX_COMPILER}" +# -Dprotobuf_BUILD_TESTS=0 +# -Dprotobuf_BUILD_CONFORMANCE=0 +# -Dprotobuf_BUILD_EXAMPLES=0 +# -Dprotobuf_BUILD_PROTOC_BINARIES=1 +# "${protobuf_source_dir}/cmake" +# +# DEPENDS "${PROTOC_BUILD_DIR}" +# WORKING_DIRECTORY "${PROTOC_BUILD_DIR}" +# COMMENT "Configuring 'protoc' for host architecture." +# USES_TERMINAL) +# +# add_custom_command ( +# OUTPUT "${PROTOC_BUILD_DIR}/protoc" +# COMMAND ${CMAKE_COMMAND} --build "${PROTOC_BUILD_DIR}" +# DEPENDS "${PROTOC_BUILD_DIR}/CMakeCache.txt" +# COMMENT "Building 'protoc' for host architecture." +# USES_TERMINAL) +# +# add_custom_target (protoc-host DEPENDS "${PROTOC_BUILD_DIR}/protoc") + + add_executable(protoc IMPORTED GLOBAL) + set_target_properties (protoc PROPERTIES IMPORTED_LOCATION "${PROTOC_BUILD_DIR}/protoc") + add_dependencies(protoc "${PROTOC_BUILD_DIR}/protoc") +endif () + +include("${ClickHouse_SOURCE_DIR}/contrib/google-protobuf-cmake/protobuf_generate.cmake") + +add_library(_protobuf INTERFACE) +target_link_libraries(_protobuf INTERFACE _libprotobuf) +target_include_directories(_protobuf INTERFACE "${Protobuf_INCLUDE_DIR}") +add_library(ch_contrib::protobuf ALIAS _protobuf) + +add_library(_protoc INTERFACE) +target_link_libraries(_protoc INTERFACE _libprotoc _libprotobuf) +target_include_directories(_protoc INTERFACE "${Protobuf_INCLUDE_DIR}") +add_library(ch_contrib::protoc ALIAS _protoc) diff --git a/contrib/google-protobuf-cmake/protobuf_generate.cmake b/contrib/google-protobuf-cmake/protobuf_generate.cmake new file mode 100644 index 00000000000..3e30b4e40fd --- /dev/null +++ b/contrib/google-protobuf-cmake/protobuf_generate.cmake @@ -0,0 +1,198 @@ +# The code in this file was copied from https://github.com/Kitware/CMake/blob/master/Modules/FindProtobuf.cmake + +#[[ +Add custom commands to process ``.proto`` files to C++:: + +protobuf_generate_cpp ( + [DESCRIPTORS ] [EXPORT_MACRO ] [...]) + +``SRCS`` + Variable to define with autogenerated source files +``HDRS`` + Variable to define with autogenerated header files +``DESCRIPTORS`` + Variable to define with autogenerated descriptor files, if requested. +``EXPORT_MACRO`` + is a macro which should expand to ``__declspec(dllexport)`` or + ``__declspec(dllimport)`` depending on what is being compiled. +``ARGN`` + ``.proto`` files +#]] + +function(PROTOBUF_GENERATE_CPP SRCS HDRS) + cmake_parse_arguments(protobuf_generate_cpp "" "EXPORT_MACRO;DESCRIPTORS" "" ${ARGN}) + + set(_proto_files "${protobuf_generate_cpp_UNPARSED_ARGUMENTS}") + if(NOT _proto_files) + message(SEND_ERROR "Error: PROTOBUF_GENERATE_CPP() called without any proto files") + return() + endif() + + if(PROTOBUF_GENERATE_CPP_APPEND_PATH) + set(_append_arg APPEND_PATH) + endif() + + if(protobuf_generate_cpp_DESCRIPTORS) + set(_descriptors DESCRIPTORS) + endif() + + if(DEFINED PROTOBUF_IMPORT_DIRS AND NOT DEFINED Protobuf_IMPORT_DIRS) + set(Protobuf_IMPORT_DIRS "${PROTOBUF_IMPORT_DIRS}") + endif() + + if(DEFINED Protobuf_IMPORT_DIRS) + set(_import_arg IMPORT_DIRS ${Protobuf_IMPORT_DIRS}) + endif() + + set(_outvar) + protobuf_generate(${_append_arg} ${_descriptors} LANGUAGE cpp EXPORT_MACRO ${protobuf_generate_cpp_EXPORT_MACRO} OUT_VAR _outvar ${_import_arg} PROTOS ${_proto_files}) + + set(${SRCS}) + set(${HDRS}) + if(protobuf_generate_cpp_DESCRIPTORS) + set(${protobuf_generate_cpp_DESCRIPTORS}) + endif() + + foreach(_file ${_outvar}) + if(_file MATCHES "cc$") + list(APPEND ${SRCS} ${_file}) + elseif(_file MATCHES "desc$") + list(APPEND ${protobuf_generate_cpp_DESCRIPTORS} ${_file}) + else() + list(APPEND ${HDRS} ${_file}) + endif() + endforeach() + set(${SRCS} ${${SRCS}} PARENT_SCOPE) + set(${HDRS} ${${HDRS}} PARENT_SCOPE) + if(protobuf_generate_cpp_DESCRIPTORS) + set(${protobuf_generate_cpp_DESCRIPTORS} "${${protobuf_generate_cpp_DESCRIPTORS}}" PARENT_SCOPE) + endif() +endfunction() + +# By default have PROTOBUF_GENERATE_CPP macro pass -I to protoc +# for each directory where a proto file is referenced. +if(NOT DEFINED PROTOBUF_GENERATE_CPP_APPEND_PATH) + set(PROTOBUF_GENERATE_CPP_APPEND_PATH TRUE) +endif() + +function(protobuf_generate) + set(_options APPEND_PATH DESCRIPTORS) + set(_singleargs LANGUAGE OUT_VAR EXPORT_MACRO PROTOC_OUT_DIR) + if(COMMAND target_sources) + list(APPEND _singleargs TARGET) + endif() + set(_multiargs PROTOS IMPORT_DIRS GENERATE_EXTENSIONS) + + cmake_parse_arguments(protobuf_generate "${_options}" "${_singleargs}" "${_multiargs}" "${ARGN}") + + if(NOT protobuf_generate_PROTOS AND NOT protobuf_generate_TARGET) + message(SEND_ERROR "Error: protobuf_generate called without any targets or source files") + return() + endif() + + if(NOT protobuf_generate_OUT_VAR AND NOT protobuf_generate_TARGET) + message(SEND_ERROR "Error: protobuf_generate called without a target or output variable") + return() + endif() + + if(NOT protobuf_generate_LANGUAGE) + set(protobuf_generate_LANGUAGE cpp) + endif() + string(TOLOWER ${protobuf_generate_LANGUAGE} protobuf_generate_LANGUAGE) + + if(NOT protobuf_generate_PROTOC_OUT_DIR) + set(protobuf_generate_PROTOC_OUT_DIR ${CMAKE_CURRENT_BINARY_DIR}) + endif() + + if(protobuf_generate_EXPORT_MACRO AND protobuf_generate_LANGUAGE STREQUAL cpp) + set(_dll_export_decl "dllexport_decl=${protobuf_generate_EXPORT_MACRO}:") + endif() + + if(NOT protobuf_generate_GENERATE_EXTENSIONS) + if(protobuf_generate_LANGUAGE STREQUAL cpp) + set(protobuf_generate_GENERATE_EXTENSIONS .pb.h .pb.cc) + elseif(protobuf_generate_LANGUAGE STREQUAL python) + set(protobuf_generate_GENERATE_EXTENSIONS _pb2.py) + else() + message(SEND_ERROR "Error: protobuf_generate given unknown Language ${LANGUAGE}, please provide a value for GENERATE_EXTENSIONS") + return() + endif() + endif() + + if(protobuf_generate_TARGET) + get_target_property(_source_list ${protobuf_generate_TARGET} SOURCES) + foreach(_file ${_source_list}) + if(_file MATCHES "proto$") + list(APPEND protobuf_generate_PROTOS ${_file}) + endif() + endforeach() + endif() + + if(NOT protobuf_generate_PROTOS) + message(SEND_ERROR "Error: protobuf_generate could not find any .proto files") + return() + endif() + + if(protobuf_generate_APPEND_PATH) + # Create an include path for each file specified + foreach(_file ${protobuf_generate_PROTOS}) + get_filename_component(_abs_file ${_file} ABSOLUTE) + get_filename_component(_abs_path ${_abs_file} PATH) + list(FIND _protobuf_include_path ${_abs_path} _contains_already) + if(${_contains_already} EQUAL -1) + list(APPEND _protobuf_include_path -I ${_abs_path}) + endif() + endforeach() + else() + set(_protobuf_include_path -I ${CMAKE_CURRENT_SOURCE_DIR}) + endif() + + foreach(DIR ${protobuf_generate_IMPORT_DIRS}) + get_filename_component(ABS_PATH ${DIR} ABSOLUTE) + list(FIND _protobuf_include_path ${ABS_PATH} _contains_already) + if(${_contains_already} EQUAL -1) + list(APPEND _protobuf_include_path -I ${ABS_PATH}) + endif() + endforeach() + + set(_generated_srcs_all) + foreach(_proto ${protobuf_generate_PROTOS}) + get_filename_component(_abs_file ${_proto} ABSOLUTE) + get_filename_component(_abs_dir ${_abs_file} DIRECTORY) + get_filename_component(_basename ${_proto} NAME_WE) + file(RELATIVE_PATH _rel_dir ${CMAKE_CURRENT_SOURCE_DIR} ${_abs_dir}) + + set(_possible_rel_dir) + if (NOT protobuf_generate_APPEND_PATH) + set(_possible_rel_dir ${_rel_dir}/) + endif() + + set(_generated_srcs) + foreach(_ext ${protobuf_generate_GENERATE_EXTENSIONS}) + list(APPEND _generated_srcs "${protobuf_generate_PROTOC_OUT_DIR}/${_possible_rel_dir}${_basename}${_ext}") + endforeach() + + if(protobuf_generate_DESCRIPTORS AND protobuf_generate_LANGUAGE STREQUAL cpp) + set(_descriptor_file "${CMAKE_CURRENT_BINARY_DIR}/${_basename}.desc") + set(_dll_desc_out "--descriptor_set_out=${_descriptor_file}") + list(APPEND _generated_srcs ${_descriptor_file}) + endif() + list(APPEND _generated_srcs_all ${_generated_srcs}) + + add_custom_command( + OUTPUT ${_generated_srcs} + COMMAND $ + ARGS --${protobuf_generate_LANGUAGE}_out ${_dll_export_decl}${protobuf_generate_PROTOC_OUT_DIR} ${_dll_desc_out} ${_protobuf_include_path} ${_abs_file} + DEPENDS ${_abs_file} protoc + COMMENT "Running ${protobuf_generate_LANGUAGE} protocol buffer compiler on ${_proto}" + VERBATIM) + endforeach() + + set_source_files_properties(${_generated_srcs_all} PROPERTIES GENERATED TRUE) + if(protobuf_generate_OUT_VAR) + set(${protobuf_generate_OUT_VAR} ${_generated_srcs_all} PARENT_SCOPE) + endif() + if(protobuf_generate_TARGET) + target_sources(${protobuf_generate_TARGET} PRIVATE ${_generated_srcs_all}) + endif() +endfunction() From 495580918a285ff46ddb3fb91f3b66885b6e2138 Mon Sep 17 00:00:00 2001 From: alekar Date: Wed, 31 May 2023 10:00:19 -0700 Subject: [PATCH 0191/1072] Update base/base/getMemoryAmount.cpp Co-authored-by: Sergei Trifonov --- base/base/getMemoryAmount.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/base/base/getMemoryAmount.cpp b/base/base/getMemoryAmount.cpp index 6a5470a0549..8a2fcd092d2 100644 --- a/base/base/getMemoryAmount.cpp +++ b/base/base/getMemoryAmount.cpp @@ -29,7 +29,7 @@ uint64_t getMemoryAmountOrZero() #if defined(OS_LINUX) // Try to lookup at the Cgroup limit - // v2 + // CGroups v2 std::ifstream cgroupv2_limit("/sys/fs/cgroup/memory.max"); if (cgroupv2_limit.is_open()) { From cb85e5a01ea1e823229c312279c67b12a5d6d3db Mon Sep 17 00:00:00 2001 From: alekar Date: Wed, 31 May 2023 10:00:43 -0700 Subject: [PATCH 0192/1072] Update base/base/getMemoryAmount.cpp Co-authored-by: Sergei Trifonov --- base/base/getMemoryAmount.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/base/base/getMemoryAmount.cpp b/base/base/getMemoryAmount.cpp index 8a2fcd092d2..a46e964c5a3 100644 --- a/base/base/getMemoryAmount.cpp +++ b/base/base/getMemoryAmount.cpp @@ -40,7 +40,7 @@ uint64_t getMemoryAmountOrZero() } else { - // v1 + // CGroups v1 std::ifstream cgroup_limit("/sys/fs/cgroup/memory/memory.limit_in_bytes"); if (cgroup_limit.is_open()) { From 57c88e664c856d44eb144416947d30ef19f5d073 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Wed, 31 May 2023 17:01:00 +0000 Subject: [PATCH 0193/1072] Remove old protobuf --- .gitmodules | 4 - contrib/protobuf | 1 - contrib/protobuf-cmake/CMakeLists.txt | 329 ------------------ .../protobuf-cmake/protobuf_generate.cmake | 198 ----------- 4 files changed, 532 deletions(-) delete mode 160000 contrib/protobuf delete mode 100644 contrib/protobuf-cmake/CMakeLists.txt delete mode 100644 contrib/protobuf-cmake/protobuf_generate.cmake diff --git a/.gitmodules b/.gitmodules index d28f205b65c..e28d8257465 100644 --- a/.gitmodules +++ b/.gitmodules @@ -35,10 +35,6 @@ [submodule "contrib/unixodbc"] path = contrib/unixodbc url = https://github.com/ClickHouse/UnixODBC -[submodule "contrib/protobuf"] - path = contrib/protobuf - url = https://github.com/ClickHouse/protobuf - branch = v3.13.0.1 [submodule "contrib/google-protobuf"] path = contrib/google-protobuf url = https://github.com/ClickHouse/google-protobuf.git diff --git a/contrib/protobuf b/contrib/protobuf deleted file mode 160000 index 6bb70196c53..00000000000 --- a/contrib/protobuf +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 6bb70196c5360268d9f021bb7936fb0b551724c2 diff --git a/contrib/protobuf-cmake/CMakeLists.txt b/contrib/protobuf-cmake/CMakeLists.txt deleted file mode 100644 index 5e22136fc1f..00000000000 --- a/contrib/protobuf-cmake/CMakeLists.txt +++ /dev/null @@ -1,329 +0,0 @@ -option(ENABLE_PROTOBUF "Enable protobuf" ${ENABLE_LIBRARIES}) - -if(NOT ENABLE_PROTOBUF) - message(STATUS "Not using protobuf") - return() -endif() - -set(Protobuf_INCLUDE_DIR "${ClickHouse_SOURCE_DIR}/contrib/protobuf/src") -if(OS_FREEBSD AND SANITIZE STREQUAL "address") - # ../contrib/protobuf/src/google/protobuf/arena_impl.h:45:10: fatal error: 'sanitizer/asan_interface.h' file not found - # #include - if(LLVM_INCLUDE_DIRS) - set(Protobuf_INCLUDE_DIR "${Protobuf_INCLUDE_DIR}" ${LLVM_INCLUDE_DIRS}) - else() - message(${RECONFIGURE_MESSAGE_LEVEL} "Can't use protobuf on FreeBSD with address sanitizer without LLVM") - return() - endif() -endif() - -set(protobuf_source_dir "${ClickHouse_SOURCE_DIR}/contrib/protobuf") -set(protobuf_binary_dir "${ClickHouse_BINARY_DIR}/contrib/protobuf") - - -add_definitions(-DGOOGLE_PROTOBUF_CMAKE_BUILD) - -add_definitions(-DHAVE_PTHREAD) -add_definitions(-DHAVE_ZLIB) - -include_directories( - ${protobuf_binary_dir} - ${protobuf_source_dir}/src) - -set(libprotobuf_lite_files - ${protobuf_source_dir}/src/google/protobuf/any_lite.cc - ${protobuf_source_dir}/src/google/protobuf/arena.cc - ${protobuf_source_dir}/src/google/protobuf/arenastring.cc - ${protobuf_source_dir}/src/google/protobuf/extension_set.cc - ${protobuf_source_dir}/src/google/protobuf/field_access_listener.cc - ${protobuf_source_dir}/src/google/protobuf/generated_enum_util.cc - ${protobuf_source_dir}/src/google/protobuf/generated_message_table_driven_lite.cc - ${protobuf_source_dir}/src/google/protobuf/generated_message_util.cc - ${protobuf_source_dir}/src/google/protobuf/implicit_weak_message.cc - ${protobuf_source_dir}/src/google/protobuf/io/coded_stream.cc - ${protobuf_source_dir}/src/google/protobuf/io/io_win32.cc - ${protobuf_source_dir}/src/google/protobuf/io/strtod.cc - ${protobuf_source_dir}/src/google/protobuf/io/zero_copy_stream.cc - ${protobuf_source_dir}/src/google/protobuf/io/zero_copy_stream_impl.cc - ${protobuf_source_dir}/src/google/protobuf/io/zero_copy_stream_impl_lite.cc - ${protobuf_source_dir}/src/google/protobuf/map.cc - ${protobuf_source_dir}/src/google/protobuf/message_lite.cc - ${protobuf_source_dir}/src/google/protobuf/parse_context.cc - ${protobuf_source_dir}/src/google/protobuf/repeated_field.cc - ${protobuf_source_dir}/src/google/protobuf/stubs/bytestream.cc - ${protobuf_source_dir}/src/google/protobuf/stubs/common.cc - ${protobuf_source_dir}/src/google/protobuf/stubs/int128.cc - ${protobuf_source_dir}/src/google/protobuf/stubs/status.cc - ${protobuf_source_dir}/src/google/protobuf/stubs/statusor.cc - ${protobuf_source_dir}/src/google/protobuf/stubs/stringpiece.cc - ${protobuf_source_dir}/src/google/protobuf/stubs/stringprintf.cc - ${protobuf_source_dir}/src/google/protobuf/stubs/structurally_valid.cc - ${protobuf_source_dir}/src/google/protobuf/stubs/strutil.cc - ${protobuf_source_dir}/src/google/protobuf/stubs/time.cc - ${protobuf_source_dir}/src/google/protobuf/wire_format_lite.cc -) - -add_library(_libprotobuf-lite ${libprotobuf_lite_files}) -target_link_libraries(_libprotobuf-lite pthread) -if(${CMAKE_SYSTEM_NAME} STREQUAL "Android") - target_link_libraries(_libprotobuf-lite log) -endif() -target_include_directories(_libprotobuf-lite SYSTEM PUBLIC ${protobuf_source_dir}/src) -add_library(protobuf::libprotobuf-lite ALIAS _libprotobuf-lite) - - -set(libprotobuf_files - ${protobuf_source_dir}/src/google/protobuf/any.cc - ${protobuf_source_dir}/src/google/protobuf/any.pb.cc - ${protobuf_source_dir}/src/google/protobuf/api.pb.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/importer.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/parser.cc - ${protobuf_source_dir}/src/google/protobuf/descriptor.cc - ${protobuf_source_dir}/src/google/protobuf/descriptor.pb.cc - ${protobuf_source_dir}/src/google/protobuf/descriptor_database.cc - ${protobuf_source_dir}/src/google/protobuf/duration.pb.cc - ${protobuf_source_dir}/src/google/protobuf/dynamic_message.cc - ${protobuf_source_dir}/src/google/protobuf/empty.pb.cc - ${protobuf_source_dir}/src/google/protobuf/extension_set_heavy.cc - ${protobuf_source_dir}/src/google/protobuf/field_mask.pb.cc - ${protobuf_source_dir}/src/google/protobuf/generated_message_reflection.cc - ${protobuf_source_dir}/src/google/protobuf/generated_message_table_driven.cc - ${protobuf_source_dir}/src/google/protobuf/io/gzip_stream.cc - ${protobuf_source_dir}/src/google/protobuf/io/printer.cc - ${protobuf_source_dir}/src/google/protobuf/io/tokenizer.cc - ${protobuf_source_dir}/src/google/protobuf/map_field.cc - ${protobuf_source_dir}/src/google/protobuf/message.cc - ${protobuf_source_dir}/src/google/protobuf/reflection_ops.cc - ${protobuf_source_dir}/src/google/protobuf/service.cc - ${protobuf_source_dir}/src/google/protobuf/source_context.pb.cc - ${protobuf_source_dir}/src/google/protobuf/struct.pb.cc - ${protobuf_source_dir}/src/google/protobuf/stubs/substitute.cc - ${protobuf_source_dir}/src/google/protobuf/text_format.cc - ${protobuf_source_dir}/src/google/protobuf/timestamp.pb.cc - ${protobuf_source_dir}/src/google/protobuf/type.pb.cc - ${protobuf_source_dir}/src/google/protobuf/unknown_field_set.cc - ${protobuf_source_dir}/src/google/protobuf/util/delimited_message_util.cc - ${protobuf_source_dir}/src/google/protobuf/util/field_comparator.cc - ${protobuf_source_dir}/src/google/protobuf/util/field_mask_util.cc - ${protobuf_source_dir}/src/google/protobuf/util/internal/datapiece.cc - ${protobuf_source_dir}/src/google/protobuf/util/internal/default_value_objectwriter.cc - ${protobuf_source_dir}/src/google/protobuf/util/internal/error_listener.cc - ${protobuf_source_dir}/src/google/protobuf/util/internal/field_mask_utility.cc - ${protobuf_source_dir}/src/google/protobuf/util/internal/json_escaping.cc - ${protobuf_source_dir}/src/google/protobuf/util/internal/json_objectwriter.cc - ${protobuf_source_dir}/src/google/protobuf/util/internal/json_stream_parser.cc - ${protobuf_source_dir}/src/google/protobuf/util/internal/object_writer.cc - ${protobuf_source_dir}/src/google/protobuf/util/internal/proto_writer.cc - ${protobuf_source_dir}/src/google/protobuf/util/internal/protostream_objectsource.cc - ${protobuf_source_dir}/src/google/protobuf/util/internal/protostream_objectwriter.cc - ${protobuf_source_dir}/src/google/protobuf/util/internal/type_info.cc - ${protobuf_source_dir}/src/google/protobuf/util/internal/type_info_test_helper.cc - ${protobuf_source_dir}/src/google/protobuf/util/internal/utility.cc - ${protobuf_source_dir}/src/google/protobuf/util/json_util.cc - ${protobuf_source_dir}/src/google/protobuf/util/message_differencer.cc - ${protobuf_source_dir}/src/google/protobuf/util/time_util.cc - ${protobuf_source_dir}/src/google/protobuf/util/type_resolver_util.cc - ${protobuf_source_dir}/src/google/protobuf/wire_format.cc - ${protobuf_source_dir}/src/google/protobuf/wrappers.pb.cc -) - -add_library(_libprotobuf ${libprotobuf_lite_files} ${libprotobuf_files}) -if (ENABLE_FUZZING) - target_compile_options(_libprotobuf PRIVATE "-fsanitize-recover=all") -endif() -target_link_libraries(_libprotobuf pthread) -target_link_libraries(_libprotobuf ch_contrib::zlib) -if(${CMAKE_SYSTEM_NAME} STREQUAL "Android") - target_link_libraries(_libprotobuf log) -endif() -target_include_directories(_libprotobuf SYSTEM PUBLIC ${protobuf_source_dir}/src) -add_library(protobuf::libprotobuf ALIAS _libprotobuf) - - -set(libprotoc_files - ${protobuf_source_dir}/src/google/protobuf/compiler/code_generator.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/command_line_interface.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/cpp/cpp_enum.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/cpp/cpp_enum_field.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/cpp/cpp_extension.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/cpp/cpp_field.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/cpp/cpp_file.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/cpp/cpp_generator.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/cpp/cpp_helpers.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/cpp/cpp_map_field.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/cpp/cpp_message.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/cpp/cpp_message_field.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/cpp/cpp_padding_optimizer.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/cpp/cpp_parse_function_generator.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/cpp/cpp_primitive_field.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/cpp/cpp_service.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/cpp/cpp_string_field.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/csharp/csharp_doc_comment.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/csharp/csharp_enum.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/csharp/csharp_enum_field.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/csharp/csharp_field_base.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/csharp/csharp_generator.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/csharp/csharp_helpers.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/csharp/csharp_map_field.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/csharp/csharp_message.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/csharp/csharp_message_field.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/csharp/csharp_primitive_field.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/csharp/csharp_reflection_class.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/csharp/csharp_repeated_enum_field.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/csharp/csharp_repeated_message_field.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/csharp/csharp_repeated_primitive_field.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/csharp/csharp_source_generator_base.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/csharp/csharp_wrapper_field.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/java/java_context.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/java/java_doc_comment.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/java/java_enum.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/java/java_enum_field.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/java/java_enum_field_lite.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/java/java_enum_lite.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/java/java_extension.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/java/java_extension_lite.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/java/java_field.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/java/java_file.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/java/java_generator.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/java/java_generator_factory.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/java/java_helpers.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/java/java_kotlin_generator.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/java/java_map_field.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/java/java_map_field_lite.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/java/java_message.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/java/java_message_builder.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/java/java_message_builder_lite.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/java/java_message_field.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/java/java_message_field_lite.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/java/java_message_lite.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/java/java_name_resolver.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/java/java_primitive_field.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/java/java_primitive_field_lite.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/java/java_service.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/java/java_shared_code_generator.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/java/java_string_field.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/java/java_string_field_lite.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/js/js_generator.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/js/well_known_types_embed.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/objectivec/objectivec_enum.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/objectivec/objectivec_enum_field.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/objectivec/objectivec_extension.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/objectivec/objectivec_field.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/objectivec/objectivec_file.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/objectivec/objectivec_generator.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/objectivec/objectivec_helpers.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/objectivec/objectivec_map_field.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/objectivec/objectivec_message.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/objectivec/objectivec_message_field.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/objectivec/objectivec_oneof.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/objectivec/objectivec_primitive_field.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/php/php_generator.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/plugin.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/plugin.pb.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/python/python_generator.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/ruby/ruby_generator.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/subprocess.cc - ${protobuf_source_dir}/src/google/protobuf/compiler/zip_writer.cc -) - -add_library(_libprotoc ${libprotoc_files}) -target_link_libraries(_libprotoc _libprotobuf) -add_library(protobuf::libprotoc ALIAS _libprotoc) - -set(protoc_files ${protobuf_source_dir}/src/google/protobuf/compiler/main.cc) - -if (CMAKE_HOST_SYSTEM_NAME STREQUAL CMAKE_SYSTEM_NAME - AND CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL CMAKE_SYSTEM_PROCESSOR) - - add_executable(protoc ${protoc_files}) - target_link_libraries(protoc _libprotoc _libprotobuf pthread) - add_executable(protobuf::protoc ALIAS protoc) - - if (ENABLE_FUZZING) - # `protoc` will be built with sanitizer and it could fail during ClickHouse build - # It easily reproduces in oss-fuzz building pipeline - # To avoid this we can try to build `protoc` without any sanitizer with option `-fno-sanitize=all`, but - # it this case we will face with linker errors, because libcxx still will be built with sanitizer - # So, we can simply suppress all of these failures with a combination this flag and an environment variable - # export MSAN_OPTIONS=exit_code=0 - target_compile_options(protoc PRIVATE "-fsanitize-recover=all") - endif() -else () - # Build 'protoc' for host arch - set (PROTOC_BUILD_DIR "${protobuf_binary_dir}/build") - - if (NOT EXISTS "${PROTOC_BUILD_DIR}/protoc") - - # This is quite ugly but I cannot make dependencies work propery. - - execute_process( - COMMAND mkdir -p ${PROTOC_BUILD_DIR} - COMMAND_ECHO STDOUT) - - execute_process( - COMMAND ${CMAKE_COMMAND} - "-G${CMAKE_GENERATOR}" - "-DCMAKE_MAKE_PROGRAM=${CMAKE_MAKE_PROGRAM}" - "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}" - "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" - "-Dprotobuf_BUILD_TESTS=0" - "-Dprotobuf_BUILD_CONFORMANCE=0" - "-Dprotobuf_BUILD_EXAMPLES=0" - "-Dprotobuf_BUILD_PROTOC_BINARIES=1" - "${protobuf_source_dir}/cmake" - WORKING_DIRECTORY "${PROTOC_BUILD_DIR}" - COMMAND_ECHO STDOUT) - - execute_process( - COMMAND ${CMAKE_COMMAND} --build "${PROTOC_BUILD_DIR}" - COMMAND_ECHO STDOUT) - endif () - -# add_custom_command ( -# OUTPUT ${PROTOC_BUILD_DIR} -# COMMAND mkdir -p ${PROTOC_BUILD_DIR}) -# -# add_custom_command ( -# OUTPUT "${PROTOC_BUILD_DIR}/CMakeCache.txt" -# -# COMMAND ${CMAKE_COMMAND} -# -G"${CMAKE_GENERATOR}" -# -DCMAKE_MAKE_PROGRAM="${CMAKE_MAKE_PROGRAM}" -# -DCMAKE_C_COMPILER="${CMAKE_C_COMPILER}" -# -DCMAKE_CXX_COMPILER="${CMAKE_CXX_COMPILER}" -# -Dprotobuf_BUILD_TESTS=0 -# -Dprotobuf_BUILD_CONFORMANCE=0 -# -Dprotobuf_BUILD_EXAMPLES=0 -# -Dprotobuf_BUILD_PROTOC_BINARIES=1 -# "${protobuf_source_dir}/cmake" -# -# DEPENDS "${PROTOC_BUILD_DIR}" -# WORKING_DIRECTORY "${PROTOC_BUILD_DIR}" -# COMMENT "Configuring 'protoc' for host architecture." -# USES_TERMINAL) -# -# add_custom_command ( -# OUTPUT "${PROTOC_BUILD_DIR}/protoc" -# COMMAND ${CMAKE_COMMAND} --build "${PROTOC_BUILD_DIR}" -# DEPENDS "${PROTOC_BUILD_DIR}/CMakeCache.txt" -# COMMENT "Building 'protoc' for host architecture." -# USES_TERMINAL) -# -# add_custom_target (protoc-host DEPENDS "${PROTOC_BUILD_DIR}/protoc") - - add_executable(protoc IMPORTED GLOBAL) - set_target_properties (protoc PROPERTIES IMPORTED_LOCATION "${PROTOC_BUILD_DIR}/protoc") - add_dependencies(protoc "${PROTOC_BUILD_DIR}/protoc") -endif () - -include("${ClickHouse_SOURCE_DIR}/contrib/protobuf-cmake/protobuf_generate.cmake") - -add_library(_protobuf INTERFACE) -target_link_libraries(_protobuf INTERFACE _libprotobuf) -target_include_directories(_protobuf INTERFACE "${Protobuf_INCLUDE_DIR}") -add_library(ch_contrib::protobuf ALIAS _protobuf) - -add_library(_protoc INTERFACE) -target_link_libraries(_protoc INTERFACE _libprotoc _libprotobuf) -target_include_directories(_protoc INTERFACE "${Protobuf_INCLUDE_DIR}") -add_library(ch_contrib::protoc ALIAS _protoc) diff --git a/contrib/protobuf-cmake/protobuf_generate.cmake b/contrib/protobuf-cmake/protobuf_generate.cmake deleted file mode 100644 index 3e30b4e40fd..00000000000 --- a/contrib/protobuf-cmake/protobuf_generate.cmake +++ /dev/null @@ -1,198 +0,0 @@ -# The code in this file was copied from https://github.com/Kitware/CMake/blob/master/Modules/FindProtobuf.cmake - -#[[ -Add custom commands to process ``.proto`` files to C++:: - -protobuf_generate_cpp ( - [DESCRIPTORS ] [EXPORT_MACRO ] [...]) - -``SRCS`` - Variable to define with autogenerated source files -``HDRS`` - Variable to define with autogenerated header files -``DESCRIPTORS`` - Variable to define with autogenerated descriptor files, if requested. -``EXPORT_MACRO`` - is a macro which should expand to ``__declspec(dllexport)`` or - ``__declspec(dllimport)`` depending on what is being compiled. -``ARGN`` - ``.proto`` files -#]] - -function(PROTOBUF_GENERATE_CPP SRCS HDRS) - cmake_parse_arguments(protobuf_generate_cpp "" "EXPORT_MACRO;DESCRIPTORS" "" ${ARGN}) - - set(_proto_files "${protobuf_generate_cpp_UNPARSED_ARGUMENTS}") - if(NOT _proto_files) - message(SEND_ERROR "Error: PROTOBUF_GENERATE_CPP() called without any proto files") - return() - endif() - - if(PROTOBUF_GENERATE_CPP_APPEND_PATH) - set(_append_arg APPEND_PATH) - endif() - - if(protobuf_generate_cpp_DESCRIPTORS) - set(_descriptors DESCRIPTORS) - endif() - - if(DEFINED PROTOBUF_IMPORT_DIRS AND NOT DEFINED Protobuf_IMPORT_DIRS) - set(Protobuf_IMPORT_DIRS "${PROTOBUF_IMPORT_DIRS}") - endif() - - if(DEFINED Protobuf_IMPORT_DIRS) - set(_import_arg IMPORT_DIRS ${Protobuf_IMPORT_DIRS}) - endif() - - set(_outvar) - protobuf_generate(${_append_arg} ${_descriptors} LANGUAGE cpp EXPORT_MACRO ${protobuf_generate_cpp_EXPORT_MACRO} OUT_VAR _outvar ${_import_arg} PROTOS ${_proto_files}) - - set(${SRCS}) - set(${HDRS}) - if(protobuf_generate_cpp_DESCRIPTORS) - set(${protobuf_generate_cpp_DESCRIPTORS}) - endif() - - foreach(_file ${_outvar}) - if(_file MATCHES "cc$") - list(APPEND ${SRCS} ${_file}) - elseif(_file MATCHES "desc$") - list(APPEND ${protobuf_generate_cpp_DESCRIPTORS} ${_file}) - else() - list(APPEND ${HDRS} ${_file}) - endif() - endforeach() - set(${SRCS} ${${SRCS}} PARENT_SCOPE) - set(${HDRS} ${${HDRS}} PARENT_SCOPE) - if(protobuf_generate_cpp_DESCRIPTORS) - set(${protobuf_generate_cpp_DESCRIPTORS} "${${protobuf_generate_cpp_DESCRIPTORS}}" PARENT_SCOPE) - endif() -endfunction() - -# By default have PROTOBUF_GENERATE_CPP macro pass -I to protoc -# for each directory where a proto file is referenced. -if(NOT DEFINED PROTOBUF_GENERATE_CPP_APPEND_PATH) - set(PROTOBUF_GENERATE_CPP_APPEND_PATH TRUE) -endif() - -function(protobuf_generate) - set(_options APPEND_PATH DESCRIPTORS) - set(_singleargs LANGUAGE OUT_VAR EXPORT_MACRO PROTOC_OUT_DIR) - if(COMMAND target_sources) - list(APPEND _singleargs TARGET) - endif() - set(_multiargs PROTOS IMPORT_DIRS GENERATE_EXTENSIONS) - - cmake_parse_arguments(protobuf_generate "${_options}" "${_singleargs}" "${_multiargs}" "${ARGN}") - - if(NOT protobuf_generate_PROTOS AND NOT protobuf_generate_TARGET) - message(SEND_ERROR "Error: protobuf_generate called without any targets or source files") - return() - endif() - - if(NOT protobuf_generate_OUT_VAR AND NOT protobuf_generate_TARGET) - message(SEND_ERROR "Error: protobuf_generate called without a target or output variable") - return() - endif() - - if(NOT protobuf_generate_LANGUAGE) - set(protobuf_generate_LANGUAGE cpp) - endif() - string(TOLOWER ${protobuf_generate_LANGUAGE} protobuf_generate_LANGUAGE) - - if(NOT protobuf_generate_PROTOC_OUT_DIR) - set(protobuf_generate_PROTOC_OUT_DIR ${CMAKE_CURRENT_BINARY_DIR}) - endif() - - if(protobuf_generate_EXPORT_MACRO AND protobuf_generate_LANGUAGE STREQUAL cpp) - set(_dll_export_decl "dllexport_decl=${protobuf_generate_EXPORT_MACRO}:") - endif() - - if(NOT protobuf_generate_GENERATE_EXTENSIONS) - if(protobuf_generate_LANGUAGE STREQUAL cpp) - set(protobuf_generate_GENERATE_EXTENSIONS .pb.h .pb.cc) - elseif(protobuf_generate_LANGUAGE STREQUAL python) - set(protobuf_generate_GENERATE_EXTENSIONS _pb2.py) - else() - message(SEND_ERROR "Error: protobuf_generate given unknown Language ${LANGUAGE}, please provide a value for GENERATE_EXTENSIONS") - return() - endif() - endif() - - if(protobuf_generate_TARGET) - get_target_property(_source_list ${protobuf_generate_TARGET} SOURCES) - foreach(_file ${_source_list}) - if(_file MATCHES "proto$") - list(APPEND protobuf_generate_PROTOS ${_file}) - endif() - endforeach() - endif() - - if(NOT protobuf_generate_PROTOS) - message(SEND_ERROR "Error: protobuf_generate could not find any .proto files") - return() - endif() - - if(protobuf_generate_APPEND_PATH) - # Create an include path for each file specified - foreach(_file ${protobuf_generate_PROTOS}) - get_filename_component(_abs_file ${_file} ABSOLUTE) - get_filename_component(_abs_path ${_abs_file} PATH) - list(FIND _protobuf_include_path ${_abs_path} _contains_already) - if(${_contains_already} EQUAL -1) - list(APPEND _protobuf_include_path -I ${_abs_path}) - endif() - endforeach() - else() - set(_protobuf_include_path -I ${CMAKE_CURRENT_SOURCE_DIR}) - endif() - - foreach(DIR ${protobuf_generate_IMPORT_DIRS}) - get_filename_component(ABS_PATH ${DIR} ABSOLUTE) - list(FIND _protobuf_include_path ${ABS_PATH} _contains_already) - if(${_contains_already} EQUAL -1) - list(APPEND _protobuf_include_path -I ${ABS_PATH}) - endif() - endforeach() - - set(_generated_srcs_all) - foreach(_proto ${protobuf_generate_PROTOS}) - get_filename_component(_abs_file ${_proto} ABSOLUTE) - get_filename_component(_abs_dir ${_abs_file} DIRECTORY) - get_filename_component(_basename ${_proto} NAME_WE) - file(RELATIVE_PATH _rel_dir ${CMAKE_CURRENT_SOURCE_DIR} ${_abs_dir}) - - set(_possible_rel_dir) - if (NOT protobuf_generate_APPEND_PATH) - set(_possible_rel_dir ${_rel_dir}/) - endif() - - set(_generated_srcs) - foreach(_ext ${protobuf_generate_GENERATE_EXTENSIONS}) - list(APPEND _generated_srcs "${protobuf_generate_PROTOC_OUT_DIR}/${_possible_rel_dir}${_basename}${_ext}") - endforeach() - - if(protobuf_generate_DESCRIPTORS AND protobuf_generate_LANGUAGE STREQUAL cpp) - set(_descriptor_file "${CMAKE_CURRENT_BINARY_DIR}/${_basename}.desc") - set(_dll_desc_out "--descriptor_set_out=${_descriptor_file}") - list(APPEND _generated_srcs ${_descriptor_file}) - endif() - list(APPEND _generated_srcs_all ${_generated_srcs}) - - add_custom_command( - OUTPUT ${_generated_srcs} - COMMAND $ - ARGS --${protobuf_generate_LANGUAGE}_out ${_dll_export_decl}${protobuf_generate_PROTOC_OUT_DIR} ${_dll_desc_out} ${_protobuf_include_path} ${_abs_file} - DEPENDS ${_abs_file} protoc - COMMENT "Running ${protobuf_generate_LANGUAGE} protocol buffer compiler on ${_proto}" - VERBATIM) - endforeach() - - set_source_files_properties(${_generated_srcs_all} PROPERTIES GENERATED TRUE) - if(protobuf_generate_OUT_VAR) - set(${protobuf_generate_OUT_VAR} ${_generated_srcs_all} PARENT_SCOPE) - endif() - if(protobuf_generate_TARGET) - target_sources(${protobuf_generate_TARGET} PRIVATE ${_generated_srcs_all}) - endif() -endfunction() From f57c5105f6318573107ab6ae0d01fdccb757cf73 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Wed, 31 May 2023 17:02:05 +0000 Subject: [PATCH 0194/1072] Remove file deleted upstream from build description --- contrib/google-protobuf-cmake/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/contrib/google-protobuf-cmake/CMakeLists.txt b/contrib/google-protobuf-cmake/CMakeLists.txt index e2d38acb51d..2d72ac90f49 100644 --- a/contrib/google-protobuf-cmake/CMakeLists.txt +++ b/contrib/google-protobuf-cmake/CMakeLists.txt @@ -35,7 +35,6 @@ set(libprotobuf_lite_files ${protobuf_source_dir}/src/google/protobuf/arena.cc ${protobuf_source_dir}/src/google/protobuf/arenastring.cc ${protobuf_source_dir}/src/google/protobuf/extension_set.cc - ${protobuf_source_dir}/src/google/protobuf/field_access_listener.cc ${protobuf_source_dir}/src/google/protobuf/generated_enum_util.cc ${protobuf_source_dir}/src/google/protobuf/generated_message_table_driven_lite.cc ${protobuf_source_dir}/src/google/protobuf/generated_message_util.cc From 2e2f98ced69f0c6205292cee1290cab6795914c5 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Wed, 31 May 2023 17:26:31 +0000 Subject: [PATCH 0195/1072] Add new source file --- contrib/google-protobuf-cmake/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/contrib/google-protobuf-cmake/CMakeLists.txt b/contrib/google-protobuf-cmake/CMakeLists.txt index 2d72ac90f49..8afb86b25dd 100644 --- a/contrib/google-protobuf-cmake/CMakeLists.txt +++ b/contrib/google-protobuf-cmake/CMakeLists.txt @@ -85,6 +85,7 @@ set(libprotobuf_files ${protobuf_source_dir}/src/google/protobuf/empty.pb.cc ${protobuf_source_dir}/src/google/protobuf/extension_set_heavy.cc ${protobuf_source_dir}/src/google/protobuf/field_mask.pb.cc + ${protobuf_source_dir}/src/google/protobuf/generated_message_bases.cc ${protobuf_source_dir}/src/google/protobuf/generated_message_reflection.cc ${protobuf_source_dir}/src/google/protobuf/generated_message_table_driven.cc ${protobuf_source_dir}/src/google/protobuf/io/gzip_stream.cc From 0b62be649f9974a4433897b39fb8a59e9e7f30f2 Mon Sep 17 00:00:00 2001 From: avogar Date: Wed, 31 May 2023 17:52:29 +0000 Subject: [PATCH 0196/1072] Add docs, fix style --- .../table-engines/integrations/hdfs.md | 6 ++ .../engines/table-engines/integrations/s3.md | 6 ++ docs/en/engines/table-engines/special/file.md | 8 ++ docs/en/engines/table-engines/special/url.md | 4 + docs/en/operations/settings/settings.md | 92 ++++++++++++++----- docs/en/sql-reference/table-functions/file.md | 12 ++- docs/en/sql-reference/table-functions/hdfs.md | 6 ++ docs/en/sql-reference/table-functions/s3.md | 6 ++ docs/en/sql-reference/table-functions/url.md | 4 + src/Storages/StorageURL.cpp | 4 - tests/integration/test_storage_s3/test.py | 20 ++-- 11 files changed, 130 insertions(+), 38 deletions(-) diff --git a/docs/en/engines/table-engines/integrations/hdfs.md b/docs/en/engines/table-engines/integrations/hdfs.md index b9db0fae68f..b37ccb00ba6 100644 --- a/docs/en/engines/table-engines/integrations/hdfs.md +++ b/docs/en/engines/table-engines/integrations/hdfs.md @@ -233,6 +233,12 @@ libhdfs3 support HDFS namenode HA. - `_path` — Path to the file. - `_file` — Name of the file. +## Storage Settings {#storage-settings} + +- [hdfs_truncate_on_insert](/docs/en/operations/settings/settings.md#hdfs-truncate-on-insert) - allows to truncate file before insert into it. Disabled by default. +- [hdfs_create_multiple_files](/docs/en/operations/settings/settings.md#hdfs_allow_create_multiple_files) - allows to create a new file on each insert if format has suffix. Disabled by default. +- [hdfs_skip_empty_files](/docs/en/operations/settings/settings.md#hdfs_skip_empty_files) - allows to skip empty files while reading. Disabled by default. + **See Also** - [Virtual columns](../../../engines/table-engines/index.md#table_engines-virtual_columns) diff --git a/docs/en/engines/table-engines/integrations/s3.md b/docs/en/engines/table-engines/integrations/s3.md index 595bc0c344f..e8d0ab6255d 100644 --- a/docs/en/engines/table-engines/integrations/s3.md +++ b/docs/en/engines/table-engines/integrations/s3.md @@ -127,6 +127,12 @@ CREATE TABLE table_with_asterisk (name String, value UInt32) ENGINE = S3('https://clickhouse-public-datasets.s3.amazonaws.com/my-bucket/{some,another}_folder/*', 'CSV'); ``` +## Storage Settings {#storage-settings} + +- [s3_truncate_on_insert](/docs/en/operations/settings/settings.md#s3-truncate-on-insert) - allows to truncate file before insert into it. Disabled by default. +- [s3_create_multiple_files](/docs/en/operations/settings/settings.md#s3_allow_create_multiple_files) - allows to create a new file on each insert if format has suffix. Disabled by default. +- [s3_skip_empty_files](/docs/en/operations/settings/settings.md#s3_skip_empty_files) - allows to skip empty files while reading. Disabled by default. + ## S3-related Settings {#settings} The following settings can be set before query execution or placed into configuration file. diff --git a/docs/en/engines/table-engines/special/file.md b/docs/en/engines/table-engines/special/file.md index 9c4e87487b4..cf325961b6a 100644 --- a/docs/en/engines/table-engines/special/file.md +++ b/docs/en/engines/table-engines/special/file.md @@ -92,3 +92,11 @@ $ echo -e "1,2\n3,4" | clickhouse-local -q "CREATE TABLE table (a Int64, b Int64 `PARTITION BY` — Optional. It is possible to create separate files by partitioning the data on a partition key. In most cases, you don't need a partition key, and if it is needed you generally don't need a partition key more granular than by month. Partitioning does not speed up queries (in contrast to the ORDER BY expression). You should never use too granular partitioning. Don't partition your data by client identifiers or names (instead, make client identifier or name the first column in the ORDER BY expression). For partitioning by month, use the `toYYYYMM(date_column)` expression, where `date_column` is a column with a date of the type [Date](/docs/en/sql-reference/data-types/date.md). The partition names here have the `"YYYYMM"` format. + +## Settings {#settings} + +- [engine_file_empty_if_not_exists](/docs/en/operations/settings/settings.md#engine-file-emptyif-not-exists) - allows to select empty data from a file that doesn't exist. Disabled by default. +- [engine_file_truncate_on_insert](/docs/en/operations/settings/settings.md#engine-file-truncate-on-insert) - allows to truncate file before insert into it. Disabled by default. +- [engine_file_allow_create_multiple_files](/docs/en/operations/settings/settings.md#engine_file_allow_create_multiple_files) - allows to create a new file on each insert if format has suffix. Disabled by default. +- [engine_file_skip_empty_files](/docs/en/operations/settings/settings.md#engine_file_skip_empty_files) - allows to skip empty files while reading. Disabled by default. +- [storage_file_read_method](/docs/en/operations/settings/settings.md#engine-file-emptyif-not-exists) - method of reading data from storage file, one of: read, pread, mmap (only for clickhouse-local). Default value: `pread` for clickhouse-server, `mmap` for clickhouse-local. diff --git a/docs/en/engines/table-engines/special/url.md b/docs/en/engines/table-engines/special/url.md index a4530767e11..26d4975954f 100644 --- a/docs/en/engines/table-engines/special/url.md +++ b/docs/en/engines/table-engines/special/url.md @@ -102,3 +102,7 @@ SELECT * FROM url_engine_table `PARTITION BY` — Optional. It is possible to create separate files by partitioning the data on a partition key. In most cases, you don't need a partition key, and if it is needed you generally don't need a partition key more granular than by month. Partitioning does not speed up queries (in contrast to the ORDER BY expression). You should never use too granular partitioning. Don't partition your data by client identifiers or names (instead, make client identifier or name the first column in the ORDER BY expression). For partitioning by month, use the `toYYYYMM(date_column)` expression, where `date_column` is a column with a date of the type [Date](/docs/en/sql-reference/data-types/date.md). The partition names here have the `"YYYYMM"` format. + +## Storage Settings {#storage-settings} + +- [engine_url_skip_empty_files](/docs/en/operations/settings/settings.md#engine_url_skip_empty_files) - allows to skip empty files while reading. Disabled by default. diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 4f3b4e43358..ac3e624387e 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -3195,27 +3195,7 @@ Possible values: Default value: `0`. -## s3_truncate_on_insert - -Enables or disables truncate before inserts in s3 engine tables. If disabled, an exception will be thrown on insert attempts if an S3 object already exists. - -Possible values: -- 0 — `INSERT` query appends new data to the end of the file. -- 1 — `INSERT` query replaces existing content of the file with the new data. - -Default value: `0`. - -## hdfs_truncate_on_insert - -Enables or disables truncation before an insert in hdfs engine tables. If disabled, an exception will be thrown on an attempt to insert if a file in HDFS already exists. - -Possible values: -- 0 — `INSERT` query appends new data to the end of the file. -- 1 — `INSERT` query replaces existing content of the file with the new data. - -Default value: `0`. - -## engine_file_allow_create_multiple_files +## engine_file_allow_create_multiple_files {#engine_file_allow_create_multiple_files} Enables or disables creating a new file on each insert in file engine tables if the format has the suffix (`JSON`, `ORC`, `Parquet`, etc.). If enabled, on each insert a new file will be created with a name following this pattern: @@ -3227,7 +3207,33 @@ Possible values: Default value: `0`. -## s3_create_new_file_on_insert +## engine_file_skip_empty_files {#engine_file_skip_empty_files} + +Enables or disables skipping empty files in [File](../../engines/table-engines/special/file.md) engine tables. + +Possible values: +- 0 — `SELECT` throws an exception if empty file is not compatible with requested format. +- 1 — `SELECT` returns empty result for empty file. + +Default value: `0`. + +## storage_file_read_method {#storage_file_read_method} + +Method of reading data from storage file, one of: `read`, `pread`, `mmap`. The mmap method does not apply to clickhouse-server (it's intended for clickhouse-local). + +Default value: `pread` for clickhouse-server, `mmap` for clickhouse-local. + +## s3_truncate_on_insert {#s3_truncate_on_insert} + +Enables or disables truncate before inserts in s3 engine tables. If disabled, an exception will be thrown on insert attempts if an S3 object already exists. + +Possible values: +- 0 — `INSERT` query appends new data to the end of the file. +- 1 — `INSERT` query replaces existing content of the file with the new data. + +Default value: `0`. + +## s3_create_new_file_on_insert {#s3_create_new_file_on_insert} Enables or disables creating a new file on each insert in s3 engine tables. If enabled, on each insert a new S3 object will be created with the key, similar to this pattern: @@ -3239,7 +3245,27 @@ Possible values: Default value: `0`. -## hdfs_create_new_file_on_insert +## s3_skip_empty_files {#s3_skip_empty_files} + +Enables or disables skipping empty files in [S3](../../engines/table-engines/special/s3.md) engine tables. + +Possible values: +- 0 — `SELECT` throws an exception if empty file is not compatible with requested format. +- 1 — `SELECT` returns empty result for empty file. + +Default value: `0`. + +## hdfs_truncate_on_insert {#hdfs_truncate_on_insert} + +Enables or disables truncation before an insert in hdfs engine tables. If disabled, an exception will be thrown on an attempt to insert if a file in HDFS already exists. + +Possible values: +- 0 — `INSERT` query appends new data to the end of the file. +- 1 — `INSERT` query replaces existing content of the file with the new data. + +Default value: `0`. + +## hdfs_create_new_file_on_insert {#hdfs_create_new_file_on_insert Enables or disables creating a new file on each insert in HDFS engine tables. If enabled, on each insert a new HDFS file will be created with the name, similar to this pattern: @@ -3251,6 +3277,26 @@ Possible values: Default value: `0`. +## hdfs_skip_empty_files {#hdfs_skip_empty_files} + +Enables or disables skipping empty files in [HDFS](../../engines/table-engines/special/hdfs.md) engine tables. + +Possible values: +- 0 — `SELECT` throws an exception if empty file is not compatible with requested format. +- 1 — `SELECT` returns empty result for empty file. + +Default value: `0`. + +## engine_url_skip_empty_files {#engine_url_skip_empty_files} + +Enables or disables skipping empty files in [URL](../../engines/table-engines/special/url.md) engine tables. + +Possible values: +- 0 — `SELECT` throws an exception if empty file is not compatible with requested format. +- 1 — `SELECT` returns empty result for empty file. + +Default value: `0`. + ## database_atomic_wait_for_drop_and_detach_synchronously {#database_atomic_wait_for_drop_and_detach_synchronously} Adds a modifier `SYNC` to all `DROP` and `DETACH` queries. diff --git a/docs/en/sql-reference/table-functions/file.md b/docs/en/sql-reference/table-functions/file.md index 28c2dc9f1f3..b1903c990b1 100644 --- a/docs/en/sql-reference/table-functions/file.md +++ b/docs/en/sql-reference/table-functions/file.md @@ -18,7 +18,7 @@ file(path [,format] [,structure] [,compression]) **Parameters** -- `path` — The relative path to the file from [user_files_path](/docs/en/operations/server-configuration-parameters/settings.md#server_configuration_parameters-user_files_path). Path to file support following globs in read-only mode: `*`, `?`, `{abc,def}` and `{N..M}` where `N`, `M` — numbers, `'abc', 'def'` — strings. +- `path` — The relative path to the file from [user_files_path](/docs/en/operations/server-configuration-parameters/settings.md/#server_configuration_parameters-user_files_path). Path to file support following globs in read-only mode: `*`, `?`, `{abc,def}` and `{N..M}` where `N`, `M` — numbers, `'abc', 'def'` — strings. - `format` — The [format](/docs/en/interfaces/formats.md#formats) of the file. - `structure` — Structure of the table. Format: `'column1_name column1_type, column2_name column2_type, ...'`. - `compression` — The existing compression type when used in a `SELECT` query, or the desired compression type when used in an `INSERT` query. The supported compression types are `gz`, `br`, `xz`, `zst`, `lz4`, and `bz2`. @@ -196,6 +196,16 @@ SELECT count(*) FROM file('big_dir/**/file002', 'CSV', 'name String, value UInt3 - `_path` — Path to the file. - `_file` — Name of the file. +## Settings + +- [engine_file_empty_if_not_exists](/docs/en/operations/settings/settings.md#engine-file-emptyif-not-exists) - allows to select empty data from a file that doesn't exist. Disabled by default. +- [engine_file_truncate_on_insert](/docs/en/operations/settings/settings.md#engine-file-truncate-on-insert) - allows to truncate file before insert into it. Disabled by default. +- [engine_file_allow_create_multiple_files](/docs/en/operations/settings/settings.md#engine_file_allow_create_multiple_files) - allows to create a new file on each insert if format has suffix. Disabled by default. +- [engine_file_skip_empty_files](/docs/en/operations/settings/settings.md#engine_file_skip_empty_files) - allows to skip empty files while reading. Disabled by default. +- [storage_file_read_method](/docs/en/operations/settings/settings.md#engine-file-emptyif-not-exists) - method of reading data from storage file, one of: read, pread, mmap (only for clickhouse-local). Default value: `pread` for clickhouse-server, `mmap` for clickhouse-local. + + + **See Also** - [Virtual columns](/docs/en/engines/table-engines/index.md#table_engines-virtual_columns) diff --git a/docs/en/sql-reference/table-functions/hdfs.md b/docs/en/sql-reference/table-functions/hdfs.md index 6ba24211131..1b52e786de4 100644 --- a/docs/en/sql-reference/table-functions/hdfs.md +++ b/docs/en/sql-reference/table-functions/hdfs.md @@ -97,6 +97,12 @@ FROM hdfs('hdfs://hdfs1:9000/big_dir/file{0..9}{0..9}{0..9}', 'CSV', 'name Strin - `_path` — Path to the file. - `_file` — Name of the file. +## Storage Settings {#storage-settings} + +- [hdfs_truncate_on_insert](/docs/en/operations/settings/settings.md#hdfs-truncate-on-insert) - allows to truncate file before insert into it. Disabled by default. +- [hdfs_create_multiple_files](/docs/en/operations/settings/settings.md#hdfs_allow_create_multiple_files) - allows to create a new file on each insert if format has suffix. Disabled by default. +- [hdfs_skip_empty_files](/docs/en/operations/settings/settings.md#hdfs_skip_empty_files) - allows to skip empty files while reading. Disabled by default. + **See Also** - [Virtual columns](../../engines/table-engines/index.md#table_engines-virtual_columns) diff --git a/docs/en/sql-reference/table-functions/s3.md b/docs/en/sql-reference/table-functions/s3.md index a9ddc286ec5..7068c208022 100644 --- a/docs/en/sql-reference/table-functions/s3.md +++ b/docs/en/sql-reference/table-functions/s3.md @@ -202,6 +202,12 @@ FROM s3( LIMIT 5; ``` +## Storage Settings {#storage-settings} + +- [s3_truncate_on_insert](/docs/en/operations/settings/settings.md#s3-truncate-on-insert) - allows to truncate file before insert into it. Disabled by default. +- [s3_create_multiple_files](/docs/en/operations/settings/settings.md#s3_allow_create_multiple_files) - allows to create a new file on each insert if format has suffix. Disabled by default. +- [s3_skip_empty_files](/docs/en/operations/settings/settings.md#s3_skip_empty_files) - allows to skip empty files while reading. Disabled by default. + **See Also** - [S3 engine](../../engines/table-engines/integrations/s3.md) diff --git a/docs/en/sql-reference/table-functions/url.md b/docs/en/sql-reference/table-functions/url.md index f157a850a12..ac4162c15de 100644 --- a/docs/en/sql-reference/table-functions/url.md +++ b/docs/en/sql-reference/table-functions/url.md @@ -53,6 +53,10 @@ Character `|` inside patterns is used to specify failover addresses. They are it - `_path` — Path to the `URL`. - `_file` — Resource name of the `URL`. +## Storage Settings {#storage-settings} + +- [engine_url_skip_empty_files](/docs/en/operations/settings/settings.md#engine_url_skip_empty_files) - allows to skip empty files while reading. Disabled by default. + **See Also** - [Virtual columns](/docs/en/engines/table-engines/index.md#table_engines-virtual_columns) diff --git a/src/Storages/StorageURL.cpp b/src/Storages/StorageURL.cpp index 706ce481a24..e882138bf0d 100644 --- a/src/Storages/StorageURL.cpp +++ b/src/Storages/StorageURL.cpp @@ -14,10 +14,8 @@ #include #include -#include #include #include -#include #include #include @@ -29,7 +27,6 @@ #include #include #include -#include #include #include @@ -48,7 +45,6 @@ namespace ErrorCodes extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; extern const int NETWORK_ERROR; extern const int BAD_ARGUMENTS; - extern const int LOGICAL_ERROR; extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; } diff --git a/tests/integration/test_storage_s3/test.py b/tests/integration/test_storage_s3/test.py index 516c8ed152a..174cbad1de4 100644 --- a/tests/integration/test_storage_s3/test.py +++ b/tests/integration/test_storage_s3/test.py @@ -1719,7 +1719,6 @@ def test_skip_empty_files(started_cluster): bucket = started_cluster.minio_bucket instance = started_cluster.instances["dummy"] - instance.query( f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/skip_empty_files1.parquet', TSVRaw) select * from numbers(0) settings s3_truncate_on_insert=1" ) @@ -1727,43 +1726,44 @@ def test_skip_empty_files(started_cluster): instance.query( f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/skip_empty_files2.parquet') select * from numbers(1) settings s3_truncate_on_insert=1" ) + def test(engine, setting): instance.query_and_get_error( f"select * from {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/skip_empty_files1.parquet') settings {setting}=0" ) - + instance.query_and_get_error( f"select * from {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/skip_empty_files1.parquet', auto, 'number UINt64') settings {setting}=0" ) - + instance.query_and_get_error( f"select * from {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/skip_empty_files1.parquet') settings {setting}=1" ) - + res = instance.query( f"select * from {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/skip_empty_files1.parquet', auto, 'number UInt64') settings {setting}=1" ) - + assert len(res) == 0 instance.query_and_get_error( f"select * from {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/skip_empty_files{{1,2}}.parquet') settings {setting}=0" ) - + instance.query_and_get_error( f"select * from {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/skip_empty_files{{1,2}}.parquet', auto, 'number UInt64') settings {setting}=0" ) - + res = instance.query( f"select * from {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/skip_empty_files{{1,2}}.parquet') settings {setting}=1" ) - + assert int(res) == 0 - + res = instance.query( f"select * from {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/skip_empty_files{{1,2}}.parquet', auto, 'number UInt64') settings {setting}=1" ) - + assert int(res) == 0 test("s3", "s3_skip_empty_files") From c9626314f7e7b128eccfd5fca7e04c9ff1fc45c5 Mon Sep 17 00:00:00 2001 From: avogar Date: Wed, 31 May 2023 19:22:44 +0000 Subject: [PATCH 0197/1072] Better --- contrib/capnproto | 2 +- src/Formats/CapnProtoSerializer.cpp | 1028 ++++++++--------- ...lumnsStructureToQueryWithClusterEngine.cpp | 52 + ...ColumnsStructureToQueryWithClusterEngine.h | 14 + 4 files changed, 529 insertions(+), 567 deletions(-) create mode 100644 src/Storages/addColumnsStructureToQueryWithClusterEngine.cpp create mode 100644 src/Storages/addColumnsStructureToQueryWithClusterEngine.h diff --git a/contrib/capnproto b/contrib/capnproto index dc8b50b9997..976209a6d18 160000 --- a/contrib/capnproto +++ b/contrib/capnproto @@ -1 +1 @@ -Subproject commit dc8b50b999777bcb23c89bb5907c785c3f654441 +Subproject commit 976209a6d18074804f60d18ef99b6a809d27dadf diff --git a/src/Formats/CapnProtoSerializer.cpp b/src/Formats/CapnProtoSerializer.cpp index 91e207a1846..e36f5fa4947 100644 --- a/src/Formats/CapnProtoSerializer.cpp +++ b/src/Formats/CapnProtoSerializer.cpp @@ -94,9 +94,21 @@ namespace std::vector> field_builders; }; + template + std::unique_ptr initStructBuilder(ParentBuilder & parent_builder, UInt32 offset_or_index, const capnp::_::StructSize & struct_size, size_t elements, const capnp::StructSchema & schema) + { + capnp::DynamicStruct::Builder builder_impl; + if constexpr (std::is_same_v) + builder_impl = capnp::DynamicStruct::Builder(schema, parent_builder.getBuilderImpl().getPointerField(offset_or_index).initStruct(struct_size)); + else + builder_impl = capnp::DynamicStruct::Builder(schema, parent_builder.getBuilderImpl().getStructElement(offset_or_index)); + return std::make_unique(std::move(builder_impl), elements); + } + class ICapnProtoSerializer { public: + /// Write row as struct field. virtual void writeRow( const ColumnPtr & column, std::unique_ptr & builder, @@ -104,6 +116,7 @@ namespace UInt32 slot_offset, size_t row_num) = 0; + /// Write row as list element. virtual void writeRow( const ColumnPtr & column, std::unique_ptr & builder, @@ -111,8 +124,10 @@ namespace UInt32 array_index, size_t row_num) = 0; + /// Read row from struct field at slot_offset. virtual void readRow(IColumn & column, const capnp::DynamicStruct::Reader & parent_struct_reader, UInt32 slot_offset) = 0; + /// Read row from list element at array_index. virtual void readRow(IColumn & column, const capnp::DynamicList::Reader & parent_list_reader, UInt32 array_index) = 0; virtual ~ICapnProtoSerializer() = default; @@ -124,32 +139,32 @@ namespace public: void writeRow(const ColumnPtr & column, std::unique_ptr &, capnp::DynamicStruct::Builder & parent_struct_builder, UInt32 slot_offset, size_t row_num) override { - auto & builder_impl = parent_struct_builder.getBuilderImpl(); - CapnProtoNumericType value = static_cast(assert_cast &>(*column).getElement(row_num)); - builder_impl.setDataField(slot_offset, value); + parent_struct_builder.getBuilderImpl().setDataField(slot_offset, getValue(column, row_num)); } void writeRow(const ColumnPtr & column, std::unique_ptr &, capnp::DynamicList::Builder & parent_list_builder, UInt32 array_index, size_t row_num) override { - auto & builder_impl = parent_list_builder.getBuilderImpl(); - CapnProtoNumericType value = static_cast(assert_cast &>(*column).getElement(row_num)); - builder_impl.setDataElement(array_index, value); + parent_list_builder.getBuilderImpl().setDataElement(array_index, getValue(column, row_num)); } void readRow(IColumn & column, const capnp::DynamicStruct::Reader & parent_struct_reader, UInt32 slot_offset) override { - const auto & reader_impl = parent_struct_reader.getReaderImpl(); - CapnProtoNumericType value = reader_impl.getDataField(slot_offset); - if constexpr (convert_to_bool_on_read) - assert_cast(column).insertValue(static_cast(value)); - else - assert_cast &>(column).insertValue(static_cast(value)); + insertValue(column, parent_struct_reader.getReaderImpl().getDataField(slot_offset)); } void readRow(IColumn & column, const capnp::DynamicList::Reader & parent_list_reader, UInt32 array_index) override { - const auto & reader_impl = parent_list_reader.getReaderImpl(); - CapnProtoNumericType value = reader_impl.getDataElement(array_index); + insertValue(column, parent_list_reader.getReaderImpl().getDataElement(array_index)); + } + + private: + CapnProtoNumericType getValue(const ColumnPtr & column, size_t row_num) + { + return static_cast(assert_cast &>(*column).getElement(row_num)); + } + + void insertValue(IColumn & column, CapnProtoNumericType value) + { if constexpr (convert_to_bool_on_read) assert_cast(column).insertValue(static_cast(value)); else @@ -191,29 +206,32 @@ namespace public: void writeRow(const ColumnPtr & column, std::unique_ptr &, capnp::DynamicStruct::Builder & parent_struct_builder, UInt32 slot_offset, size_t row_num) override { - auto & builder_impl = parent_struct_builder.getBuilderImpl(); - CapnProtoFloatType value = static_cast(assert_cast &>(*column).getElement(row_num)); - builder_impl.setDataField(slot_offset, value); + parent_struct_builder.getBuilderImpl().setDataField(slot_offset, getValue(column, row_num)); } void writeRow(const ColumnPtr & column, std::unique_ptr &, capnp::DynamicList::Builder & parent_list_builder, UInt32 array_index, size_t row_num) override { - auto & builder_impl = parent_list_builder.getBuilderImpl(); - CapnProtoFloatType value = static_cast(assert_cast &>(*column).getElement(row_num)); - builder_impl.setDataElement(array_index, value); + parent_list_builder.getBuilderImpl().setDataElement(array_index, getValue(column, row_num)); } void readRow(IColumn & column, const capnp::DynamicStruct::Reader & parent_struct_reader, UInt32 slot_offset) override { - const auto & reader_impl = parent_struct_reader.getReaderImpl(); - CapnProtoFloatType value = reader_impl.getDataField(slot_offset); - assert_cast &>(column).insertValue(static_cast(value)); + insertValue(column, parent_struct_reader.getReaderImpl().getDataField(slot_offset)); } void readRow(IColumn & column, const capnp::DynamicList::Reader & parent_list_reader, UInt32 array_index) override { - const auto & reader_impl = parent_list_reader.getReaderImpl(); - CapnProtoFloatType value = reader_impl.getDataElement(array_index); + insertValue(column, parent_list_reader.getReaderImpl().getDataElement(array_index)); + } + + private: + CapnProtoFloatType getValue(const ColumnPtr & column, size_t row_num) + { + return static_cast(assert_cast &>(*column).getElement(row_num)); + } + + void insertValue(IColumn & column, CapnProtoFloatType value) + { assert_cast &>(column).insertValue(static_cast(value)); } }; @@ -298,57 +316,41 @@ namespace void writeRow(const ColumnPtr & column, std::unique_ptr &, capnp::DynamicStruct::Builder & parent_struct_builder, UInt32 slot_offset, size_t row_num) override { - auto & builder_impl = parent_struct_builder.getBuilderImpl(); - EnumType enum_value = assert_cast &>(*column).getElement(row_num); - UInt16 capnp_value; - if (enum_comparing_mode == FormatSettings::CapnProtoEnumComparingMode::BY_VALUES) - capnp_value = static_cast(enum_value); - else - capnp_value = ch_to_capnp_values[enum_value]; - - builder_impl.setDataField(slot_offset, capnp_value); + parent_struct_builder.getBuilderImpl().setDataField(slot_offset, getValue(column, row_num)); } void writeRow(const ColumnPtr & column, std::unique_ptr &, capnp::DynamicList::Builder & parent_list_builder, UInt32 array_index, size_t row_num) override { - auto & builder_impl = parent_list_builder.getBuilderImpl(); - EnumType enum_value = assert_cast &>(*column).getElement(row_num); - UInt16 capnp_value; - if (enum_comparing_mode == FormatSettings::CapnProtoEnumComparingMode::BY_VALUES) - capnp_value = static_cast(enum_value); - else - capnp_value = ch_to_capnp_values[enum_value]; - - builder_impl.setDataElement(array_index, capnp_value); + parent_list_builder.getBuilderImpl().setDataElement(array_index, getValue(column, row_num)); } void readRow(IColumn & column, const capnp::DynamicStruct::Reader & parent_struct_reader, UInt32 slot_offset) override { - const auto & reader_impl = parent_struct_reader.getReaderImpl(); - UInt16 capnp_value = reader_impl.getDataField(slot_offset); - EnumType value; - if (enum_comparing_mode == FormatSettings::CapnProtoEnumComparingMode::BY_VALUES) - value = static_cast(capnp_value); - else - value = capnp_to_ch_values[capnp_value]; - - assert_cast &>(column).insertValue(value); + insertValue(column, parent_struct_reader.getReaderImpl().getDataField(slot_offset)); } void readRow(IColumn & column, const capnp::DynamicList::Reader & parent_list_reader, UInt32 array_index) override { - const auto & reader_impl = parent_list_reader.getReaderImpl(); - UInt16 capnp_value = reader_impl.getDataElement(array_index); - EnumType value; - if (enum_comparing_mode == FormatSettings::CapnProtoEnumComparingMode::BY_VALUES) - value = static_cast(capnp_value); - else - value = capnp_to_ch_values[capnp_value]; - - assert_cast &>(column).insertValue(value); + insertValue(column, parent_list_reader.getReaderImpl().getDataElement(array_index)); } private: + UInt16 getValue(const ColumnPtr & column, size_t row_num) + { + EnumType enum_value = assert_cast &>(*column).getElement(row_num); + if (enum_comparing_mode == FormatSettings::CapnProtoEnumComparingMode::BY_VALUES) + return static_cast(enum_value); + return ch_to_capnp_values[enum_value]; + } + + void insertValue(IColumn & column, UInt16 capnp_enum_value) + { + if (enum_comparing_mode == FormatSettings::CapnProtoEnumComparingMode::BY_VALUES) + assert_cast &>(column).insertValue(static_cast(capnp_enum_value)); + else + assert_cast &>(column).insertValue(capnp_to_ch_values[capnp_enum_value]); + } + DataTypePtr data_type; capnp::EnumSchema enum_schema; const FormatSettings::CapnProtoEnumComparingMode enum_comparing_mode; @@ -367,29 +369,32 @@ namespace void writeRow(const ColumnPtr & column, std::unique_ptr &, capnp::DynamicStruct::Builder & parent_struct_builder, UInt32 slot_offset, size_t row_num) override { - auto & builder_impl = parent_struct_builder.getBuilderImpl(); - UInt16 value = assert_cast(*column).getElement(row_num); - builder_impl.setDataField(slot_offset, value); + parent_struct_builder.getBuilderImpl().setDataField(slot_offset, getValue(column, row_num)); } void writeRow(const ColumnPtr & column, std::unique_ptr &, capnp::DynamicList::Builder & parent_list_builder, UInt32 array_index, size_t row_num) override { - auto & builder_impl = parent_list_builder.getBuilderImpl(); - UInt16 value = assert_cast(*column).getElement(row_num); - builder_impl.setDataElement(array_index, value); + parent_list_builder.getBuilderImpl().setDataElement(array_index, getValue(column, row_num)); } void readRow(IColumn & column, const capnp::DynamicStruct::Reader & parent_struct_reader, UInt32 slot_offset) override { - const auto & reader_impl = parent_struct_reader.getReaderImpl(); - UInt16 value = reader_impl.getDataField(slot_offset); - assert_cast(column).insertValue(value); + insertValue(column, parent_struct_reader.getReaderImpl().getDataField(slot_offset)); } void readRow(IColumn & column, const capnp::DynamicList::Reader & parent_list_reader, UInt32 array_index) override { - const auto & reader_impl = parent_list_reader.getReaderImpl(); - UInt16 value = reader_impl.getDataElement(array_index); + insertValue(column, parent_list_reader.getReaderImpl().getDataElement(array_index)); + } + + private: + UInt16 getValue(const ColumnPtr & column, size_t row_num) + { + return assert_cast(*column).getElement(row_num); + } + + void insertValue(IColumn & column, UInt16 value) + { assert_cast(column).insertValue(value); } }; @@ -405,29 +410,32 @@ namespace void writeRow(const ColumnPtr & column, std::unique_ptr &, capnp::DynamicStruct::Builder & parent_struct_builder, UInt32 slot_offset, size_t row_num) override { - auto & builder_impl = parent_struct_builder.getBuilderImpl(); - Int32 value = assert_cast(*column).getElement(row_num); - builder_impl.setDataField(slot_offset, value); + parent_struct_builder.getBuilderImpl().setDataField(slot_offset, getValue(column, row_num)); } void writeRow(const ColumnPtr & column, std::unique_ptr &, capnp::DynamicList::Builder & parent_list_builder, UInt32 array_index, size_t row_num) override { - auto & builder_impl = parent_list_builder.getBuilderImpl(); - Int32 value = assert_cast(*column).getElement(row_num); - builder_impl.setDataElement(array_index, value); + parent_list_builder.getBuilderImpl().setDataElement(array_index, getValue(column, row_num)); } void readRow(IColumn & column, const capnp::DynamicStruct::Reader & parent_struct_reader, UInt32 slot_offset) override { - const auto & reader_impl = parent_struct_reader.getReaderImpl(); - Int32 value = reader_impl.getDataField(slot_offset); - assert_cast(column).insertValue(value); + insertValue(column, parent_struct_reader.getReaderImpl().getDataField(slot_offset)); } void readRow(IColumn & column, const capnp::DynamicList::Reader & parent_list_reader, UInt32 array_index) override { - const auto & reader_impl = parent_list_reader.getReaderImpl(); - Int32 value = reader_impl.getDataElement(array_index); + insertValue(column, parent_list_reader.getReaderImpl().getDataElement(array_index)); + } + + private: + Int32 getValue(const ColumnPtr & column, size_t row_num) + { + return assert_cast(*column).getElement(row_num); + } + + void insertValue(IColumn & column, Int32 value) + { assert_cast(column).insertValue(value); } }; @@ -443,29 +451,32 @@ namespace void writeRow(const ColumnPtr & column, std::unique_ptr &, capnp::DynamicStruct::Builder & parent_struct_builder, UInt32 slot_offset, size_t row_num) override { - auto & builder_impl = parent_struct_builder.getBuilderImpl(); - UInt32 value = assert_cast(*column).getElement(row_num); - builder_impl.setDataField(slot_offset, value); + parent_struct_builder.getBuilderImpl().setDataField(slot_offset, getValue(column, row_num)); } void writeRow(const ColumnPtr & column, std::unique_ptr &, capnp::DynamicList::Builder & parent_list_builder, UInt32 array_index, size_t row_num) override { - auto & builder_impl = parent_list_builder.getBuilderImpl(); - UInt32 value = assert_cast(*column).getElement(row_num); - builder_impl.setDataElement(array_index, value); + parent_list_builder.getBuilderImpl().setDataElement(array_index, getValue(column, row_num)); } void readRow(IColumn & column, const capnp::DynamicStruct::Reader & parent_struct_reader, UInt32 slot_offset) override { - const auto & reader_impl = parent_struct_reader.getReaderImpl(); - UInt32 value = reader_impl.getDataField(slot_offset); - assert_cast(column).insertValue(value); + insertValue(column, parent_struct_reader.getReaderImpl().getDataField(slot_offset)); } void readRow(IColumn & column, const capnp::DynamicList::Reader & parent_list_reader, UInt32 array_index) override { - const auto & reader_impl = parent_list_reader.getReaderImpl(); - UInt32 value = reader_impl.getDataElement(array_index); + insertValue(column, parent_list_reader.getReaderImpl().getDataElement(array_index)); + } + + private: + UInt32 getValue(const ColumnPtr & column, size_t row_num) + { + return assert_cast(*column).getElement(row_num); + } + + void insertValue(IColumn & column, UInt32 value) + { assert_cast(column).insertValue(value); } }; @@ -481,29 +492,32 @@ namespace void writeRow(const ColumnPtr & column, std::unique_ptr &, capnp::DynamicStruct::Builder & parent_struct_builder, UInt32 slot_offset, size_t row_num) override { - auto & builder_impl = parent_struct_builder.getBuilderImpl(); - Int64 value = assert_cast(*column).getElement(row_num); - builder_impl.setDataField(slot_offset, value); + parent_struct_builder.getBuilderImpl().setDataField(slot_offset, getValue(column, row_num)); } void writeRow(const ColumnPtr & column, std::unique_ptr &, capnp::DynamicList::Builder & parent_list_builder, UInt32 array_index, size_t row_num) override { - auto & builder_impl = parent_list_builder.getBuilderImpl(); - Int64 value = assert_cast(*column).getElement(row_num); - builder_impl.setDataElement(array_index, value); + parent_list_builder.getBuilderImpl().setDataElement(array_index, getValue(column, row_num)); } void readRow(IColumn & column, const capnp::DynamicStruct::Reader & parent_struct_reader, UInt32 slot_offset) override { - const auto & reader_impl = parent_struct_reader.getReaderImpl(); - Int64 value = reader_impl.getDataField(slot_offset); - assert_cast(column).insertValue(value); + insertValue(column, parent_struct_reader.getReaderImpl().getDataField(slot_offset)); } void readRow(IColumn & column, const capnp::DynamicList::Reader & parent_list_reader, UInt32 array_index) override { - const auto & reader_impl = parent_list_reader.getReaderImpl(); - Int64 value = reader_impl.getDataElement(array_index); + insertValue(column, parent_list_reader.getReaderImpl().getDataElement(array_index)); + } + + private: + Int64 getValue(const ColumnPtr & column, size_t row_num) + { + return assert_cast(*column).getElement(row_num); + } + + void insertValue(IColumn & column, Int64 value) + { assert_cast(column).insertValue(value); } }; @@ -523,275 +537,36 @@ namespace void writeRow(const ColumnPtr & column, std::unique_ptr &, capnp::DynamicStruct::Builder & parent_struct_builder, UInt32 slot_offset, size_t row_num) override { - auto & builder_impl = parent_struct_builder.getBuilderImpl(); - DecimalType value = assert_cast &>(*column).getElement(row_num); - builder_impl.setDataField(slot_offset, value); + parent_struct_builder.getBuilderImpl().setDataField(slot_offset, getValue(column, row_num)); } void writeRow(const ColumnPtr & column, std::unique_ptr &, capnp::DynamicList::Builder & parent_list_builder, UInt32 array_index, size_t row_num) override { - auto & builder_impl = parent_list_builder.getBuilderImpl(); - DecimalType value = assert_cast &>(*column).getElement(row_num); - builder_impl.setDataElement(array_index, value); + parent_list_builder.getBuilderImpl().setDataElement(array_index, getValue(column, row_num)); } void readRow(IColumn & column, const capnp::DynamicStruct::Reader & parent_struct_reader, UInt32 slot_offset) override { - const auto & reader_impl = parent_struct_reader.getReaderImpl(); - NativeType value = reader_impl.getDataField(slot_offset); - assert_cast &>(column).insertValue(value); + insertValue(column, parent_struct_reader.getReaderImpl().getDataField(slot_offset)); } void readRow(IColumn & column, const capnp::DynamicList::Reader & parent_list_reader, UInt32 array_index) override { - const auto & reader_impl = parent_list_reader.getReaderImpl(); - NativeType value = reader_impl.getDataElement(array_index); + insertValue(column, parent_list_reader.getReaderImpl().getDataElement(array_index)); + } + + private: + NativeType getValue(const ColumnPtr & column, size_t row_num) + { + return assert_cast &>(*column).getElement(row_num); + } + + void insertValue(IColumn & column, NativeType value) + { assert_cast &>(column).insertValue(value); } }; - template - class CapnProtoFixedSizeRawDataSerializer : public ICapnProtoSerializer - { - private: - static constexpr size_t value_size = sizeof(T); - - public: - CapnProtoFixedSizeRawDataSerializer(const DataTypePtr & data_type_, const String & column_name, const capnp::Type & capnp_type) : data_type(data_type_) - { - if (!capnp_type.isData()) - throwCannotConvert(data_type, column_name, capnp_type); - } - - void writeRow(const ColumnPtr & column, std::unique_ptr &, capnp::DynamicStruct::Builder & parent_struct_builder, UInt32 slot_offset, size_t row_num) override - { - auto & builder_impl = parent_struct_builder.getBuilderImpl(); - auto data = column->getDataAt(row_num); - capnp::Data::Reader value = capnp::Data::Reader(reinterpret_cast(data.data), data.size); - builder_impl.getPointerField(slot_offset).template setBlob(value); - } - - void writeRow(const ColumnPtr & column, std::unique_ptr &, capnp::DynamicList::Builder & parent_struct_builder, UInt32 array_index, size_t row_num) override - { - auto & builder_impl = parent_struct_builder.getBuilderImpl(); - auto data = column->getDataAt(row_num); - capnp::Data::Reader value = capnp::Data::Reader(reinterpret_cast(data.data), data.size); - builder_impl.getPointerElement(array_index).setBlob(value); - } - - void readRow(IColumn & column, const capnp::DynamicStruct::Reader & parent_struct_reader, UInt32 slot_offset) override - { - const auto & reader_impl = parent_struct_reader.getReaderImpl(); - capnp::Data::Reader value = reader_impl.getPointerField(slot_offset).template getBlob(nullptr, 0); - if (value.size() != value_size) - throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected size of {} value: {}", data_type->getName(), value.size()); - - column.insertData(reinterpret_cast(value.begin()), value.size()); - } - - void readRow(IColumn & column, const capnp::DynamicList::Reader & parent_list_reader, UInt32 array_index) override - { - const auto & reader_impl = parent_list_reader.getReaderImpl(); - capnp::Data::Reader value = reader_impl.getPointerElement(array_index).getBlob(nullptr, 0); - if (value.size() != value_size) - throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected size of {} value: {}", data_type->getName(), value.size()); - - column.insertData(reinterpret_cast(value.begin()), value.size()); - } - - private: - DataTypePtr data_type; - }; - - template - class CapnProtoStringSerializer : public ICapnProtoSerializer - { - public: - CapnProtoStringSerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type) - { - if (!capnp_type.isData() && !capnp_type.isText()) - throwCannotConvert(data_type, column_name, capnp_type); - } - - void writeRow(const ColumnPtr & column, std::unique_ptr &, capnp::DynamicStruct::Builder & parent_struct_builder, UInt32 slot_offset, size_t row_num) override - { - auto & builder_impl = parent_struct_builder.getBuilderImpl(); - auto data = column->getDataAt(row_num); - if constexpr (is_binary) - { - capnp::Data::Reader value = capnp::Data::Reader(reinterpret_cast(data.data), data.size); - builder_impl.getPointerField(slot_offset).setBlob(value); - } - else - { - capnp::Text::Reader value = capnp::Text::Reader(data.data, data.size); - builder_impl.getPointerField(slot_offset).setBlob(value); - } - } - - void writeRow(const ColumnPtr & column, std::unique_ptr &, capnp::DynamicList::Builder & parent_struct_builder, UInt32 array_index, size_t row_num) override - { - auto & builder_impl = parent_struct_builder.getBuilderImpl(); - auto data = column->getDataAt(row_num); - if constexpr (is_binary) - { - capnp::Data::Reader value = capnp::Data::Reader(reinterpret_cast(data.data), data.size); - builder_impl.getPointerElement(array_index).setBlob(value); - } - else - { - capnp::Text::Reader value = capnp::Text::Reader(data.data, data.size); - builder_impl.getPointerElement(array_index).setBlob(value); - } - } - - void readRow(IColumn & column, const capnp::DynamicStruct::Reader & parent_struct_reader, UInt32 slot_offset) override - { - const auto & reader_impl = parent_struct_reader.getReaderImpl(); - if constexpr (is_binary) - { - capnp::Data::Reader value = reader_impl.getPointerField(slot_offset).getBlob(nullptr, 0); - column.insertData(reinterpret_cast(value.begin()), value.size()); - } - else - { - capnp::Text::Reader value = reader_impl.getPointerField(slot_offset).getBlob(nullptr, 0); - column.insertData(reinterpret_cast(value.begin()), value.size()); - } - } - - void readRow(IColumn & column, const capnp::DynamicList::Reader & parent_list_reader, UInt32 array_index) override - { - const auto & reader_impl = parent_list_reader.getReaderImpl(); - if constexpr (is_binary) - { - capnp::Data::Reader value = reader_impl.getPointerElement(array_index).getBlob(nullptr, 0); - column.insertData(reinterpret_cast(value.begin()), value.size()); - } - else - { - capnp::Text::Reader value = reader_impl.getPointerElement(array_index).getBlob(nullptr, 0); - column.insertData(reinterpret_cast(value.begin()), value.size()); - } - } - }; - - template - class CapnProtoFixedStringSerializer : public ICapnProtoSerializer - { - public: - CapnProtoFixedStringSerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type_) : capnp_type(capnp_type_) - { - if (!capnp_type.isData() && !capnp_type.isText()) - throwCannotConvert(data_type, column_name, capnp_type); - } - - void writeRow(const ColumnPtr & column, std::unique_ptr &, capnp::DynamicStruct::Builder & parent_struct_builder, UInt32 slot_offset, size_t row_num) override - { - auto & builder_impl = parent_struct_builder.getBuilderImpl(); - auto data = column->getDataAt(row_num); - if constexpr (is_binary) - { - capnp::Data::Reader value = capnp::Data::Reader(reinterpret_cast(data.data), data.size); - builder_impl.getPointerField(slot_offset).setBlob(value); - } - else - { - if (data.data[data.size - 1] == 0) - { - capnp::Text::Reader value = capnp::Text::Reader(data.data, data.size); - builder_impl.getPointerField(slot_offset).setBlob(value); - } - else - { - /// In TEXT type data should be null-terminated, but ClickHouse FixedString data could not be. - /// To make data null-terminated we should copy it to temporary String object and use it in capnp::Text::Reader. - /// Note that capnp::Text::Reader works only with pointer to the data and it's size, so we should - /// guarantee that new String object life time is longer than capnp::Text::Reader life time. - tmp_string = data.toString(); - capnp::Text::Reader value = capnp::Text::Reader(tmp_string.data(), tmp_string.size()); - builder_impl.getPointerField(slot_offset).setBlob(value); - } - } - } - - void writeRow(const ColumnPtr & column, std::unique_ptr &, capnp::DynamicList::Builder & parent_struct_builder, UInt32 array_index, size_t row_num) override - { - auto & builder_impl = parent_struct_builder.getBuilderImpl(); - auto data = column->getDataAt(row_num); - if constexpr (is_binary) - { - capnp::Data::Reader value = capnp::Data::Reader(reinterpret_cast(data.data), data.size); - builder_impl.getPointerElement(array_index).setBlob(value); - } - else - { - if (data.data[data.size - 1] == 0) - { - capnp::Text::Reader value = capnp::Text::Reader(data.data, data.size); - builder_impl.getPointerElement(array_index).setBlob(value); - } - else - { - /// In TEXT type data should be null-terminated, but ClickHouse FixedString data could not be. - /// To make data null-terminated we should copy it to temporary String object and use it in capnp::Text::Reader. - /// Note that capnp::Text::Reader works only with pointer to the data and it's size, so we should - /// guarantee that new String object life time is longer than capnp::Text::Reader life time. - tmp_string = data.toString(); - capnp::Text::Reader value = capnp::Text::Reader(tmp_string.data(), tmp_string.size()); - builder_impl.getPointerElement(array_index).setBlob(value); - } - } - } - - void readRow(IColumn & column, const capnp::DynamicStruct::Reader & parent_struct_reader, UInt32 slot_offset) override - { - const auto & reader_impl = parent_struct_reader.getReaderImpl(); - auto & fixed_string_column = assert_cast(column); - if constexpr (is_binary) - { - capnp::Data::Reader value = reader_impl.getPointerField(slot_offset).getBlob(nullptr, 0); - if (value.size() > fixed_string_column.getN()) - throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot read data with size {} to FixedString with size {}", value.size(), fixed_string_column.getN()); - - fixed_string_column.insertData(reinterpret_cast(value.begin()), value.size()); - } - else - { - capnp::Text::Reader value = reader_impl.getPointerField(slot_offset).getBlob(nullptr, 0); - if (value.size() > fixed_string_column.getN()) - throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot read data with size {} to FixedString with size {}", value.size(), fixed_string_column.getN()); - - fixed_string_column.insertData(reinterpret_cast(value.begin()), value.size()); - } - } - - void readRow(IColumn & column, const capnp::DynamicList::Reader & parent_list_reader, UInt32 array_index) override - { - const auto & reader_impl = parent_list_reader.getReaderImpl(); - auto & fixed_string_column = assert_cast(column); - if constexpr (is_binary) - { - capnp::Data::Reader value = reader_impl.getPointerElement(array_index).getBlob(nullptr, 0); - if (value.size() > fixed_string_column.getN()) - throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot read data with size {} to FixedString with size {}", value.size(), fixed_string_column.getN()); - - fixed_string_column.insertData(reinterpret_cast(value.begin()), value.size()); - } - else - { - capnp::Text::Reader value = reader_impl.getPointerElement(array_index).getBlob(nullptr, 0); - if (value.size() > fixed_string_column.getN()) - throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot read data with size {} to FixedString with size {}", value.size(), fixed_string_column.getN()); - - fixed_string_column.insertData(reinterpret_cast(value.begin()), value.size()); - } - } - - private: - String tmp_string; - capnp::Type capnp_type; - }; class CapnProtoIPv4Serializer : public ICapnProtoSerializer { @@ -804,33 +579,204 @@ namespace void writeRow(const ColumnPtr & column, std::unique_ptr &, capnp::DynamicStruct::Builder & parent_struct_builder, UInt32 slot_offset, size_t row_num) override { - auto & builder_impl = parent_struct_builder.getBuilderImpl(); - UInt32 value = assert_cast(*column).getElement(row_num); - builder_impl.setDataField(slot_offset, value); + parent_struct_builder.getBuilderImpl().setDataField(slot_offset, getValue(column, row_num)); } void writeRow(const ColumnPtr & column, std::unique_ptr &, capnp::DynamicList::Builder & parent_list_builder, UInt32 array_index, size_t row_num) override { - auto & builder_impl = parent_list_builder.getBuilderImpl(); - UInt32 value = assert_cast(*column).getElement(row_num); - builder_impl.setDataElement(array_index, value); + parent_list_builder.getBuilderImpl().setDataElement(array_index, getValue(column, row_num)); } void readRow(IColumn & column, const capnp::DynamicStruct::Reader & parent_struct_reader, UInt32 slot_offset) override { - const auto & reader_impl = parent_struct_reader.getReaderImpl(); - UInt32 value = reader_impl.getDataField(slot_offset); - assert_cast(column).insertValue(IPv4(value)); + insertValue(column, parent_struct_reader.getReaderImpl().getDataField(slot_offset)); } void readRow(IColumn & column, const capnp::DynamicList::Reader & parent_list_reader, UInt32 array_index) override { - const auto & reader_impl = parent_list_reader.getReaderImpl(); - UInt32 value = reader_impl.getDataElement(array_index); + insertValue(column, parent_list_reader.getReaderImpl().getDataElement(array_index)); + } + + private: + UInt32 getValue(const ColumnPtr & column, size_t row_num) + { + return assert_cast(*column).getElement(row_num); + } + + void insertValue(IColumn & column, UInt32 value) + { assert_cast(column).insertValue(IPv4(value)); } }; + template + class CapnProtoFixedSizeRawDataSerializer : public ICapnProtoSerializer + { + private: + static constexpr size_t expected_value_size = sizeof(T); + + public: + CapnProtoFixedSizeRawDataSerializer(const DataTypePtr & data_type_, const String & column_name, const capnp::Type & capnp_type) : data_type(data_type_) + { + if (!capnp_type.isData()) + throwCannotConvert(data_type, column_name, capnp_type); + } + + void writeRow(const ColumnPtr & column, std::unique_ptr &, capnp::DynamicStruct::Builder & parent_struct_builder, UInt32 slot_offset, size_t row_num) override + { + parent_struct_builder.getBuilderImpl().getPointerField(slot_offset).setBlob(getData(column, row_num)); + } + + void writeRow(const ColumnPtr & column, std::unique_ptr &, capnp::DynamicList::Builder & parent_struct_builder, UInt32 array_index, size_t row_num) override + { + parent_struct_builder.getBuilderImpl().getPointerElement(array_index).setBlob(getData(column, row_num)); + } + + void readRow(IColumn & column, const capnp::DynamicStruct::Reader & parent_struct_reader, UInt32 slot_offset) override + { + insertData(column, parent_struct_reader.getReaderImpl().getPointerField(slot_offset).getBlob(nullptr, 0)); + } + + void readRow(IColumn & column, const capnp::DynamicList::Reader & parent_list_reader, UInt32 array_index) override + { + insertData(column, parent_list_reader.getReaderImpl().getPointerElement(array_index).getBlob(nullptr, 0)); + } + + private: + capnp::Data::Reader getData(const ColumnPtr & column, size_t row_num) + { + auto data = column->getDataAt(row_num); + return capnp::Data::Reader(reinterpret_cast(data.data), data.size); + } + + void insertData(IColumn & column, capnp::Data::Reader data) + { + if (data.size() != expected_value_size) + throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected size of {} value: {}", data_type->getName(), data.size()); + + column.insertData(reinterpret_cast(data.begin()), data.size()); + } + + DataTypePtr data_type; + }; + + template + class CapnProtoStringSerializer : public ICapnProtoSerializer + { + public: + CapnProtoStringSerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type) + { + if (!capnp_type.isData() && !capnp_type.isText()) + throwCannotConvert(data_type, column_name, capnp_type); + } + + void writeRow(const ColumnPtr & column, std::unique_ptr &, capnp::DynamicStruct::Builder & parent_struct_builder, UInt32 slot_offset, size_t row_num) override + { + parent_struct_builder.getBuilderImpl().getPointerField(slot_offset).setBlob(getData(column, row_num)); + } + + void writeRow(const ColumnPtr & column, std::unique_ptr &, capnp::DynamicList::Builder & parent_struct_builder, UInt32 array_index, size_t row_num) override + { + parent_struct_builder.getBuilderImpl().getPointerElement(array_index).setBlob(getData(column, row_num)); + } + + void readRow(IColumn & column, const capnp::DynamicStruct::Reader & parent_struct_reader, UInt32 slot_offset) override + { + insertData(column, parent_struct_reader.getReaderImpl().getPointerField(slot_offset).getBlob(nullptr, 0)); + } + + void readRow(IColumn & column, const capnp::DynamicList::Reader & parent_list_reader, UInt32 array_index) override + { + insertData(column, parent_list_reader.getReaderImpl().getPointerElement(array_index).getBlob(nullptr, 0)); + } + + private: + using Reader = typename CapnpType::Reader; + + CapnpType::Reader getData(const ColumnPtr & column, size_t row_num) + { + auto data = column->getDataAt(row_num); + if constexpr (std::is_same_v) + return Reader(reinterpret_cast(data.data), data.size); + else + return Reader(data.data, data.size); + } + + void insertData(IColumn & column, Reader data) + { + column.insertData(reinterpret_cast(data.begin()), data.size()); + } + }; + + template + class CapnProtoFixedStringSerializer : public ICapnProtoSerializer + { + private: + + public: + CapnProtoFixedStringSerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type_) : capnp_type(capnp_type_) + { + if (!capnp_type.isData() && !capnp_type.isText()) + throwCannotConvert(data_type, column_name, capnp_type); + } + + void writeRow(const ColumnPtr & column, std::unique_ptr &, capnp::DynamicStruct::Builder & parent_struct_builder, UInt32 slot_offset, size_t row_num) override + { + parent_struct_builder.getBuilderImpl().getPointerField(slot_offset).setBlob(getData(column, row_num)); + } + + void writeRow(const ColumnPtr & column, std::unique_ptr &, capnp::DynamicList::Builder & parent_struct_builder, UInt32 array_index, size_t row_num) override + { + parent_struct_builder.getBuilderImpl().getPointerElement(array_index).setBlob(getData(column, row_num)); + } + + void readRow(IColumn & column, const capnp::DynamicStruct::Reader & parent_struct_reader, UInt32 slot_offset) override + { + insertData(column, parent_struct_reader.getReaderImpl().getPointerField(slot_offset).getBlob(nullptr, 0)); + } + + void readRow(IColumn & column, const capnp::DynamicList::Reader & parent_list_reader, UInt32 array_index) override + { + insertData(column, parent_list_reader.getReaderImpl().getPointerElement(array_index).getBlob(nullptr, 0)); + } + + private: + using Reader = typename CapnpType::Reader; + + CapnpType::Reader getData(const ColumnPtr & column, size_t row_num) + { + auto data = column->getDataAt(row_num); + if constexpr (std::is_same_v) + { + return Reader(reinterpret_cast(data.data), data.size); + } + else + { + if (data.data[data.size - 1] == 0) + return Reader(data.data, data.size); + + /// In TEXT type data should be null-terminated, but ClickHouse FixedString data could not be. + /// To make data null-terminated we should copy it to temporary String object and use it in capnp::Text::Reader. + /// Note that capnp::Text::Reader works only with pointer to the data and it's size, so we should + /// guarantee that new String object life time is longer than capnp::Text::Reader life time. + tmp_string = data.toString(); + return Reader(tmp_string.data(), tmp_string.size()); + } + } + + void insertData(IColumn & column, Reader data) + { + auto & fixed_string_column = assert_cast(column); + if (data.size() > fixed_string_column.getN()) + throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot read data with size {} to FixedString with size {}", data.size(), fixed_string_column.getN()); + + fixed_string_column.insertData(reinterpret_cast(data.begin()), data.size()); + } + + String tmp_string; + capnp::Type capnp_type; + }; + std::unique_ptr createSerializer(const DataTypePtr & type, const String & name, const capnp::Type & capnp_type, const FormatSettings::CapnProto & settings); class CapnProtoLowCardinalitySerializer : public ICapnProtoSerializer @@ -843,37 +789,43 @@ namespace void writeRow(const ColumnPtr & column, std::unique_ptr & field_builder, capnp::DynamicStruct::Builder & parent_struct_builder, UInt32 slot_offset, size_t row_num) override { - const auto & low_cardinality_column = assert_cast(*column); - size_t index = low_cardinality_column.getIndexAt(row_num); - const auto & dict_column = low_cardinality_column.getDictionary().getNestedColumn(); - nested_serializer->writeRow(dict_column, field_builder, parent_struct_builder, slot_offset, index); + writeRowImpl(column, field_builder, parent_struct_builder, slot_offset, row_num); } void writeRow(const ColumnPtr & column, std::unique_ptr & field_builder, capnp::DynamicList::Builder & parent_list_builder, UInt32 array_index, size_t row_num) override { - const auto & low_cardinality_column = assert_cast(*column); - size_t index = low_cardinality_column.getIndexAt(row_num); - const auto & dict_column = low_cardinality_column.getDictionary().getNestedColumn(); - nested_serializer->writeRow(dict_column, field_builder, parent_list_builder, array_index, index); + writeRowImpl(column, field_builder, parent_list_builder, array_index, row_num); } void readRow(IColumn & column, const capnp::DynamicStruct::Reader & parent_struct_reader, UInt32 slot_offset) override { - auto & low_cardinality_column = assert_cast(column); - auto tmp_column = low_cardinality_column.getDictionary().getNestedColumn()->cloneEmpty(); - nested_serializer->readRow(*tmp_column, parent_struct_reader, slot_offset); - low_cardinality_column.insertFromFullColumn(*tmp_column, 0); + readRowImpl(column, parent_struct_reader, slot_offset); } void readRow(IColumn & column, const capnp::DynamicList::Reader & parent_list_reader, UInt32 array_index) override { - auto & low_cardinality_column = assert_cast(column); - auto tmp_column = low_cardinality_column.getDictionary().getNestedColumn()->cloneEmpty(); - nested_serializer->readRow(*tmp_column, parent_list_reader, array_index); - low_cardinality_column.insertFromFullColumn(*tmp_column, 0); + readRowImpl(column, parent_list_reader, array_index); } private: + template + void writeRowImpl(const ColumnPtr & column, std::unique_ptr & field_builder, ParentBuilder & parent_builder, UInt32 offset_or_index, size_t row_num) + { + const auto & low_cardinality_column = assert_cast(*column); + size_t index = low_cardinality_column.getIndexAt(row_num); + const auto & dict_column = low_cardinality_column.getDictionary().getNestedColumn(); + nested_serializer->writeRow(dict_column, field_builder, parent_builder, offset_or_index, index); + } + + template + void readRowImpl(IColumn & column, const ParentReader & parent_reader, UInt32 offset_or_index) + { + auto & low_cardinality_column = assert_cast(column); + auto tmp_column = low_cardinality_column.getDictionary().getNestedColumn()->cloneEmpty(); + nested_serializer->readRow(*tmp_column, parent_reader, offset_or_index); + low_cardinality_column.insertFromFullColumn(*tmp_column, 0); + } + std::unique_ptr nested_serializer; }; @@ -938,38 +890,32 @@ namespace void writeRow(const ColumnPtr & column, std::unique_ptr & field_builder, capnp::DynamicStruct::Builder & parent_struct_builder, UInt32 slot_offset, size_t row_num) override { - if (!field_builder) - { - auto builder_impl = parent_struct_builder.getBuilderImpl(); - auto struct_builder_impl = capnp::DynamicStruct::Builder(struct_schema, builder_impl.getPointerField(slot_offset).initStruct(struct_size)); - field_builder = std::make_unique(std::move(struct_builder_impl), 1); - } - - auto & struct_builder = assert_cast(*field_builder); - - const auto & nullable_column = assert_cast(*column); - if (nullable_column.isNullAt(row_num)) - { - auto struct_builder_impl = struct_builder.impl.getBuilderImpl(); - struct_builder_impl.setDataField(discriminant_offset, null_discriminant); - struct_builder_impl.setDataField(nested_slot_offset, capnp::Void()); - } - else - { - const auto & nested_column = nullable_column.getNestedColumnPtr(); - struct_builder.impl.getBuilderImpl().setDataField(discriminant_offset, nested_discriminant); - nested_serializer->writeRow(nested_column, struct_builder.field_builders[0], struct_builder.impl, nested_slot_offset, row_num); - } + writeRowImpl(column, field_builder, parent_struct_builder, slot_offset, row_num); } void writeRow(const ColumnPtr & column, std::unique_ptr & field_builder, capnp::DynamicList::Builder & parent_list_builder, UInt32 array_index, size_t row_num) override + { + writeRowImpl(column, field_builder, parent_list_builder, array_index, row_num); + } + + void readRow(IColumn & column, const capnp::DynamicStruct::Reader & parent_struct_reader, UInt32 slot_offset) override + { + auto struct_reader = capnp::DynamicStruct::Reader(struct_schema, parent_struct_reader.getReaderImpl().getPointerField(slot_offset).getStruct(nullptr)); + readRowImpl(column, struct_reader); + } + + void readRow(IColumn & column, const capnp::DynamicList::Reader & parent_list_reader, UInt32 array_index) override + { + auto struct_reader = capnp::DynamicStruct::Reader(struct_schema, parent_list_reader.getReaderImpl().getStructElement(array_index)); + readRowImpl(column, struct_reader); + } + + private: + template + void writeRowImpl(const ColumnPtr & column, std::unique_ptr & field_builder, ParentBuilder & parent_builder, UInt32 offset_or_index, size_t row_num) { if (!field_builder) - { - auto builder_impl = parent_list_builder.getBuilderImpl(); - auto struct_builder_impl = capnp::DynamicStruct::Builder(struct_schema, builder_impl.getStructElement(array_index)); - field_builder = std::make_unique(std::move(struct_builder_impl), 1); - } + field_builder = initStructBuilder(parent_builder, offset_or_index, struct_size, 1, struct_schema); auto & struct_builder = assert_cast(*field_builder); @@ -988,12 +934,9 @@ namespace } } - void readRow(IColumn & column, const capnp::DynamicStruct::Reader & parent_struct_reader, UInt32 slot_offset) override + void readRowImpl(IColumn & column, capnp::DynamicStruct::Reader & struct_reader) { auto & nullable_column = assert_cast(column); - auto reader_impl = parent_struct_reader.getReaderImpl(); - auto struct_reader = capnp::DynamicStruct::Reader(struct_schema, reader_impl.getPointerField(slot_offset).getStruct(nullptr)); - auto discriminant = struct_reader.getReaderImpl().getDataField(discriminant_offset); if (discriminant == null_discriminant) @@ -1006,25 +949,7 @@ namespace } } - void readRow(IColumn & column, const capnp::DynamicList::Reader & parent_list_reader, UInt32 array_index) override - { - auto & nullable_column = assert_cast(column); - auto reader_impl = parent_list_reader.getReaderImpl(); - auto struct_reader = capnp::DynamicStruct::Reader(struct_schema, reader_impl.getStructElement(array_index)); - auto discriminant = struct_reader.getReaderImpl().getDataField(discriminant_offset); - - if (discriminant == null_discriminant) - nullable_column.insertDefault(); - else - { - auto & nested_column = nullable_column.getNestedColumn(); - nested_serializer->readRow(nested_column, struct_reader, nested_slot_offset); - nullable_column.getNullMapData().push_back(0); - } - } - - private: std::unique_ptr nested_serializer; capnp::StructSchema struct_schema; capnp::_::StructSize struct_size; @@ -1058,29 +983,29 @@ namespace void writeRow(const ColumnPtr & column, std::unique_ptr & field_builder, capnp::DynamicStruct::Builder & parent_struct_builder, UInt32 slot_offset, size_t row_num) override { - const auto * array_column = assert_cast(column.get()); - const auto & nested_column = array_column->getDataPtr(); - const auto & offsets = array_column->getOffsets(); - auto offset = offsets[row_num - 1]; - UInt32 size = static_cast(offsets[row_num] - offset); - - if (!field_builder) - { - auto builder_impl = parent_struct_builder.getBuilderImpl(); - capnp::DynamicList::Builder list_builder_impl; - if (element_is_struct) - list_builder_impl = capnp::DynamicList::Builder(list_schema, builder_impl.getPointerField(slot_offset).initStructList(size, element_struct_size)); - else - list_builder_impl = capnp::DynamicList::Builder(list_schema, builder_impl.getPointerField(slot_offset).initList(element_size, size)); - field_builder = std::make_unique(std::move(list_builder_impl), size); - } - - auto & list_builder = assert_cast(*field_builder); - for (UInt32 i = 0; i != size; ++i) - nested_serializer->writeRow(nested_column, list_builder.nested_builders[i], list_builder.impl, i, offset + i); + writeRowImpl(column, field_builder, parent_struct_builder, slot_offset, row_num); } void writeRow(const ColumnPtr & column, std::unique_ptr & field_builder, capnp::DynamicList::Builder & parent_list_builder, UInt32 array_index, size_t row_num) override + { + writeRowImpl(column, field_builder, parent_list_builder, array_index, row_num); + } + + void readRow(IColumn & column, const capnp::DynamicStruct::Reader & parent_struct_reader, UInt32 slot_offset) override + { + auto list_reader = capnp::DynamicList::Reader(list_schema, parent_struct_reader.getReaderImpl().getPointerField(slot_offset).getList(element_size, nullptr)); + readRowImpl(column, list_reader); + } + + void readRow(IColumn & column, const capnp::DynamicList::Reader & parent_list_reader, UInt32 array_index) override + { + auto list_reader = capnp::DynamicList::Reader(list_schema, parent_list_reader.getReaderImpl().getPointerElement(array_index).getList(element_size, nullptr)); + readRowImpl(column, list_reader); + } + + private: + template + void writeRowImpl(const ColumnPtr & column, std::unique_ptr & field_builder, ParentBuilder & parent_builder, UInt32 offset_or_index, size_t row_num) { const auto * array_column = assert_cast(column.get()); const auto & nested_column = array_column->getDataPtr(); @@ -1089,25 +1014,32 @@ namespace UInt32 size = static_cast(offsets[row_num] - offset); if (!field_builder) - { - auto builder_impl = parent_list_builder.getBuilderImpl(); - capnp::DynamicList::Builder list_builder_impl; - if (element_is_struct) - list_builder_impl = capnp::DynamicList::Builder(list_schema, builder_impl.getPointerElement(array_index).initStructList(size, element_struct_size)); - else - list_builder_impl = capnp::DynamicList::Builder(list_schema, builder_impl.getPointerElement(array_index).initList(element_size, size)); - field_builder = std::make_unique(std::move(list_builder_impl), size); - } + field_builder = std::make_unique(capnp::DynamicList::Builder(list_schema, initListBuilder(parent_builder, offset_or_index, size)), size); auto & list_builder = assert_cast(*field_builder); for (UInt32 i = 0; i != size; ++i) nested_serializer->writeRow(nested_column, list_builder.nested_builders[i], list_builder.impl, i, offset + i); } - void readRow(IColumn & column, const capnp::DynamicStruct::Reader & parent_struct_reader, UInt32 slot_offset) override + template + capnp::_::ListBuilder initListBuilder(ParentBuilder & parent_builder, UInt32 offset_or_index, UInt32 size) + { + if (element_is_struct) + { + if constexpr (std::is_same_v) + return parent_builder.getBuilderImpl().getPointerField(offset_or_index).initStructList(size, element_struct_size); + else + return parent_builder.getBuilderImpl().getPointerElement(offset_or_index).initStructList(size, element_struct_size); + } + + if constexpr (std::is_same_v) + return parent_builder.getBuilderImpl().getPointerField(offset_or_index).initList(element_size, size); + else + return parent_builder.getBuilderImpl().getPointerElement(offset_or_index).initList(element_size, size); + } + + void readRowImpl(IColumn & column, const capnp::DynamicList::Reader & list_reader) { - const auto & reader_impl = parent_struct_reader.getReaderImpl(); - auto list_reader = capnp::DynamicList::Reader(list_schema, reader_impl.getPointerField(slot_offset).getList(element_size, nullptr)); UInt32 size = list_reader.size(); auto & column_array = assert_cast(column); auto & offsets = column_array.getOffsets(); @@ -1118,21 +1050,6 @@ namespace nested_serializer->readRow(nested_column, list_reader, i); } - void readRow(IColumn & column, const capnp::DynamicList::Reader & parent_list_reader, UInt32 array_index) override - { - const auto & reader_impl = parent_list_reader.getReaderImpl(); - auto list_reader = capnp::DynamicList::Reader(list_schema, reader_impl.getPointerElement(array_index).getList(element_size, nullptr)); - UInt32 size = list_reader.size(); - auto & column_array = assert_cast(column); - auto & offsets = column_array.getOffsets(); - offsets.push_back(offsets.back() + list_reader.size()); - - auto & nested_column = column_array.getData(); - for (UInt32 i = 0; i != size; ++i) - nested_serializer->readRow(nested_column, list_reader, i); - } - - private: capnp::ListSchema list_schema; std::unique_ptr nested_serializer; capnp::ElementSize element_size; @@ -1219,49 +1136,44 @@ namespace void writeRow(const ColumnPtr & column, std::unique_ptr & field_builder, capnp::DynamicStruct::Builder & parent_struct_builder, UInt32 slot_offset, size_t row_num) override { - if (!field_builder) - { - auto builder_impl = parent_struct_builder.getBuilderImpl(); - auto struct_builder_impl = capnp::DynamicStruct::Builder(struct_schema, builder_impl.getPointerField(slot_offset).initStruct(struct_size)); - field_builder = std::make_unique(std::move(struct_builder_impl), 1); - } - - auto & struct_builder = assert_cast(*field_builder); - const auto & entries_column = assert_cast(column.get())->getNestedColumnPtr(); - nested_serializer->writeRow(entries_column, struct_builder.field_builders[0], struct_builder.impl, entries_slot_offset, row_num); + writeRowImpl(column, field_builder, parent_struct_builder, slot_offset, row_num); } void writeRow(const ColumnPtr & column, std::unique_ptr & field_builder, capnp::DynamicList::Builder & parent_list_builder, UInt32 array_index, size_t row_num) override + { + writeRowImpl(column, field_builder, parent_list_builder, array_index, row_num); + } + + void readRow(IColumn & column, const capnp::DynamicStruct::Reader & parent_struct_reader, UInt32 slot_offset) override + { + auto struct_reader = capnp::DynamicStruct::Reader(struct_schema, parent_struct_reader.getReaderImpl().getPointerField(slot_offset).getStruct(nullptr)); + readRowImpl(column, struct_reader); + } + + void readRow(IColumn & column, const capnp::DynamicList::Reader & parent_list_reader, UInt32 array_index) override + { + auto struct_reader = capnp::DynamicStruct::Reader(struct_schema, parent_list_reader.getReaderImpl().getStructElement(array_index)); + readRowImpl(column, struct_reader); + } + + private: + template + void writeRowImpl(const ColumnPtr & column, std::unique_ptr & field_builder, ParentBuilder & parent_builder, UInt32 offset_or_index, size_t row_num) { if (!field_builder) - { - auto builder_impl = parent_list_builder.getBuilderImpl(); - auto struct_builder_impl = capnp::DynamicStruct::Builder(struct_schema, builder_impl.getStructElement(array_index)); - field_builder = std::make_unique(std::move(struct_builder_impl), 1); - } + field_builder = initStructBuilder(parent_builder, offset_or_index, struct_size, 1, struct_schema); auto & struct_builder = assert_cast(*field_builder); const auto & entries_column = assert_cast(column.get())->getNestedColumnPtr(); nested_serializer->writeRow(entries_column, struct_builder.field_builders[0], struct_builder.impl, entries_slot_offset, row_num); } - void readRow(IColumn & column, const capnp::DynamicStruct::Reader & parent_struct_reader, UInt32 slot_offset) override + void readRowImpl(IColumn & column, const capnp::DynamicStruct::Reader & struct_reader) { - auto reader_impl = parent_struct_reader.getReaderImpl(); - auto struct_reader = capnp::DynamicStruct::Reader(struct_schema, reader_impl.getPointerField(slot_offset).getStruct(nullptr)); auto & entries_column = assert_cast(column).getNestedColumn(); nested_serializer->readRow(entries_column, struct_reader, entries_slot_offset); } - void readRow(IColumn & column, const capnp::DynamicList::Reader & parent_list_reader, UInt32 array_index) override - { - auto reader_impl = parent_list_reader.getReaderImpl(); - auto struct_reader = capnp::DynamicStruct::Reader(struct_schema, reader_impl.getStructElement(array_index)); - auto & entries_column = assert_cast(column).getNestedColumn(); - nested_serializer->readRow(entries_column, struct_reader, entries_slot_offset); - } - - private: std::unique_ptr nested_serializer; capnp::StructSchema struct_schema; capnp::_::StructSize struct_size; @@ -1332,48 +1244,15 @@ namespace void writeRow(const ColumnPtr & column, std::unique_ptr & field_builder, capnp::DynamicStruct::Builder & parent_struct_builder, UInt32 slot_offset, size_t row_num) override { - if (!field_builder) - { - auto builder_impl = parent_struct_builder.getBuilderImpl(); - auto struct_builder_impl = capnp::DynamicStruct::Builder(struct_schema, builder_impl.getPointerField(slot_offset).initStruct(struct_size)); - field_builder = std::make_unique(std::move(struct_builder_impl), fields_count); - } - - auto & struct_builder = assert_cast(*field_builder); - if (const auto * tuple_column = typeid_cast(column.get())) - { - const auto & columns = tuple_column->getColumns(); - for (size_t i = 0; i != columns.size(); ++i) - fields_serializers[i]->writeRow(columns[i], struct_builder.field_builders[fields_indexes[i]], struct_builder.impl, fields_offsets[i], row_num); - } - else - { - fields_serializers[0]->writeRow(column, struct_builder.field_builders[fields_indexes[0]], struct_builder.impl, fields_offsets[0], row_num); - } + writeRowImpl(column, field_builder, parent_struct_builder, slot_offset, row_num); } void writeRow(const ColumnPtr & column, std::unique_ptr & field_builder, capnp::DynamicList::Builder & parent_list_builder, UInt32 array_index, size_t row_num) override { - if (!field_builder) - { - auto builder_impl = parent_list_builder.getBuilderImpl(); - auto struct_builder_impl = capnp::DynamicStruct::Builder(struct_schema, builder_impl.getStructElement(array_index)); - field_builder = std::make_unique(std::move(struct_builder_impl), fields_count); - } - - auto & struct_builder = assert_cast(*field_builder); - if (const auto * tuple_column = typeid_cast(column.get())) - { - const auto & columns = tuple_column->getColumns(); - for (size_t i = 0; i != columns.size(); ++i) - fields_serializers[i]->writeRow(columns[i], struct_builder.field_builders[fields_indexes[i]], struct_builder.impl, fields_offsets[i], row_num); - } - else - { - fields_serializers[0]->writeRow(column, struct_builder.field_builders[fields_indexes[0]], struct_builder.impl, fields_offsets[0], row_num); - } + writeRowImpl(column, field_builder, parent_list_builder, array_index, row_num); } + /// Method for writing root struct. void writeRow(const Columns & columns, StructBuilder & struct_builder, size_t row_num) { for (size_t i = 0; i != columns.size(); ++i) @@ -1382,30 +1261,17 @@ namespace void readRow(IColumn & column, const capnp::DynamicStruct::Reader & parent_struct_reader, UInt32 slot_offset) override { - auto reader_impl = parent_struct_reader.getReaderImpl(); - auto struct_reader = capnp::DynamicStruct::Reader(struct_schema, reader_impl.getPointerField(slot_offset).getStruct(nullptr)); - if (auto * tuple_column = typeid_cast(&column)) - { - for (size_t i = 0; i != tuple_column->tupleSize(); ++i) - fields_serializers[i]->readRow(tuple_column->getColumn(i), struct_reader, fields_offsets[i]); - } - else - fields_serializers[0]->readRow(column, struct_reader, fields_offsets[0]); + auto struct_reader = capnp::DynamicStruct::Reader(struct_schema, parent_struct_reader.getReaderImpl().getPointerField(slot_offset).getStruct(nullptr)); + readRowImpl(column, struct_reader); } void readRow(IColumn & column, const capnp::DynamicList::Reader & parent_list_reader, UInt32 array_index) override { - auto reader_impl = parent_list_reader.getReaderImpl(); - auto struct_reader = capnp::DynamicStruct::Reader(struct_schema, reader_impl.getStructElement(array_index)); - if (auto * tuple_column = typeid_cast(&column)) - { - for (size_t i = 0; i != tuple_column->tupleSize(); ++i) - fields_serializers[i]->readRow(tuple_column->getColumn(i), struct_reader, fields_offsets[i]); - } - else - fields_serializers[0]->readRow(column, struct_reader, fields_offsets[0]); + auto struct_reader = capnp::DynamicStruct::Reader(struct_schema, parent_list_reader.getReaderImpl().getStructElement(array_index)); + readRowImpl(column, struct_reader); } + /// Method for reading from root struct. void readRow(MutableColumns & columns, const capnp::DynamicStruct::Reader & reader) { for (size_t i = 0; i != columns.size(); ++i) @@ -1435,6 +1301,36 @@ namespace } } + template + void writeRowImpl(const ColumnPtr & column, std::unique_ptr & field_builder, ParentBuilder & parent_builder, UInt32 offset_or_index, size_t row_num) + { + if (!field_builder) + field_builder = initStructBuilder(parent_builder, offset_or_index, struct_size, fields_count, struct_schema); + + auto & struct_builder = assert_cast(*field_builder); + if (const auto * tuple_column = typeid_cast(column.get())) + { + const auto & columns = tuple_column->getColumns(); + for (size_t i = 0; i != columns.size(); ++i) + fields_serializers[i]->writeRow(columns[i], struct_builder.field_builders[fields_indexes[i]], struct_builder.impl, fields_offsets[i], row_num); + } + else + { + fields_serializers[0]->writeRow(column, struct_builder.field_builders[fields_indexes[0]], struct_builder.impl, fields_offsets[0], row_num); + } + } + + void readRowImpl(IColumn & column, const capnp::DynamicStruct::Reader & struct_reader) + { + if (auto * tuple_column = typeid_cast(&column)) + { + for (size_t i = 0; i != tuple_column->tupleSize(); ++i) + fields_serializers[i]->readRow(tuple_column->getColumn(i), struct_reader, fields_offsets[i]); + } + else + fields_serializers[0]->readRow(column, struct_reader, fields_offsets[0]); + } + capnp::StructSchema struct_schema; capnp::_::StructSize struct_size; size_t fields_count; @@ -1515,12 +1411,12 @@ namespace return std::make_unique>(type, name, capnp_type, settings.enum_comparing_mode); case TypeIndex::String: if (capnp_type.isData()) - return std::make_unique>(type, name, capnp_type); - return std::make_unique>(type, name, capnp_type); + return std::make_unique>(type, name, capnp_type); + return std::make_unique>(type, name, capnp_type); case TypeIndex::FixedString: if (capnp_type.isData()) - return std::make_unique>(type, name, capnp_type); - return std::make_unique>(type, name, capnp_type); + return std::make_unique>(type, name, capnp_type); + return std::make_unique>(type, name, capnp_type); case TypeIndex::LowCardinality: return std::make_unique(type, name, capnp_type, settings); case TypeIndex::Nullable: diff --git a/src/Storages/addColumnsStructureToQueryWithClusterEngine.cpp b/src/Storages/addColumnsStructureToQueryWithClusterEngine.cpp new file mode 100644 index 00000000000..106161ae620 --- /dev/null +++ b/src/Storages/addColumnsStructureToQueryWithClusterEngine.cpp @@ -0,0 +1,52 @@ +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + +ASTExpressionList * extractTableFunctionArgumentsFromSelectQuery(ASTPtr & query) +{ + auto * select_query = query->as(); + if (!select_query || !select_query->tables()) + return nullptr; + + auto * tables = select_query->tables()->as(); + auto * table_expression = tables->children[0]->as()->table_expression->as(); + if (!table_expression->table_function) + return nullptr; + + auto * table_function = table_expression->table_function->as(); + return table_function->arguments->as(); +} + +void addColumnsStructureToQueryWithClusterEngine(ASTPtr & query, const String & structure, size_t max_arguments, const String & function_name) +{ + ASTExpressionList * expression_list = extractTableFunctionArgumentsFromSelectQuery(query); + if (!expression_list) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected SELECT query from table function {}, got '{}'", function_name, queryToString(query)); + auto structure_literal = std::make_shared(structure); + + if (expression_list->children.size() < 2 || expression_list->children.size() > max_arguments) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected 2 to {} arguments in {} table functions, got {}", + function_name, max_arguments, expression_list->children.size()); + + if (expression_list->children.size() == 2 || expression_list->children.size() == max_arguments - 1) + { + auto format_literal = std::make_shared("auto"); + expression_list->children.push_back(format_literal); + } + + expression_list->children.push_back(structure_literal); +} + +} diff --git a/src/Storages/addColumnsStructureToQueryWithClusterEngine.h b/src/Storages/addColumnsStructureToQueryWithClusterEngine.h new file mode 100644 index 00000000000..5939f3f43aa --- /dev/null +++ b/src/Storages/addColumnsStructureToQueryWithClusterEngine.h @@ -0,0 +1,14 @@ +#pragma once + +#include +#include + +namespace DB +{ + +ASTExpressionList * extractTableFunctionArgumentsFromSelectQuery(ASTPtr & query); + +/// Add structure argument for queries with s3Cluster/hdfsCluster table function. +void addColumnsStructureToQueryWithClusterEngine(ASTPtr & query, const String & structure, size_t max_arguments, const String & function_name); + +} From f2e076a4431e1515d46b039ef248ddd5fd33de81 Mon Sep 17 00:00:00 2001 From: ltrk2 <107155950+ltrk2@users.noreply.github.com> Date: Wed, 31 May 2023 19:33:32 +0000 Subject: [PATCH 0198/1072] Implement big-endian support for the deterministic reservoir sampler --- .../ReservoirSamplerDeterministic.h | 14 ++--- src/Common/TransformEndianness.hpp | 54 +++++++++++++++++++ src/IO/ReadHelpers.h | 22 +------- src/IO/WriteHelpers.h | 22 +------- 4 files changed, 65 insertions(+), 47 deletions(-) create mode 100644 src/Common/TransformEndianness.hpp diff --git a/src/AggregateFunctions/ReservoirSamplerDeterministic.h b/src/AggregateFunctions/ReservoirSamplerDeterministic.h index bde33260f5a..9dea821e839 100644 --- a/src/AggregateFunctions/ReservoirSamplerDeterministic.h +++ b/src/AggregateFunctions/ReservoirSamplerDeterministic.h @@ -157,8 +157,8 @@ public: void read(DB::ReadBuffer & buf) { size_t size = 0; - DB::readIntBinary(size, buf); - DB::readIntBinary(total_values, buf); + readBinaryLittleEndian(size, buf); + readBinaryLittleEndian(total_values, buf); /// Compatibility with old versions. if (size > total_values) @@ -171,16 +171,16 @@ public: samples.resize(size); for (size_t i = 0; i < size; ++i) - DB::readPODBinary(samples[i], buf); + readBinaryLittleEndian(samples[i], buf); sorted = false; } void write(DB::WriteBuffer & buf) const { - size_t size = samples.size(); - DB::writeIntBinary(size, buf); - DB::writeIntBinary(total_values, buf); + const auto size = samples.size(); + writeBinaryLittleEndian(size, buf); + writeBinaryLittleEndian(total_values, buf); for (size_t i = 0; i < size; ++i) { @@ -195,7 +195,7 @@ public: memset(&elem, 0, sizeof(elem)); elem = samples[i]; - DB::writePODBinary(elem, buf); + writeBinaryLittleEndian(elem, buf); } } diff --git a/src/Common/TransformEndianness.hpp b/src/Common/TransformEndianness.hpp new file mode 100644 index 00000000000..17cf441d17f --- /dev/null +++ b/src/Common/TransformEndianness.hpp @@ -0,0 +1,54 @@ +#pragma once + +#include +#include + +#include + +namespace DB +{ +template + requires is_big_int_v +inline void transformEndianness(T & x) +{ + if constexpr (std::endian::native != endian) + { + std::ranges::transform(x.items, std::begin(x.items), [](auto& item) { return std::byteswap(item); }); + std::ranges::reverse(x.items); + } +} + +template + requires is_decimal || std::is_floating_point_v +inline void transformEndianness(T & value) +{ + if constexpr (std::endian::native != endian) + { + auto * start = reinterpret_cast(&value); + std::reverse(start, start + sizeof(T)); + } +} + +template + requires std::is_integral_v && (sizeof(T) <= 8) +inline void transformEndianness(T & value) +{ + if constexpr (endian != std::endian::native) + value = std::byteswap(value); +} + +template + requires std::is_scoped_enum_v +inline void transformEndianness(T & x) +{ + using UnderlyingType = std::underlying_type_t; + transformEndianness(reinterpret_cast(x)); +} + +template +inline void transformEndianness(std::pair & pair) +{ + transformEndianness(pair.first); + transformEndianness(pair.second); +} +} diff --git a/src/IO/ReadHelpers.h b/src/IO/ReadHelpers.h index 32338552b66..c42e992c807 100644 --- a/src/IO/ReadHelpers.h +++ b/src/IO/ReadHelpers.h @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -1098,30 +1099,11 @@ inline void readBinary(Decimal128 & x, ReadBuffer & buf) { readPODBinary(x, buf) inline void readBinary(Decimal256 & x, ReadBuffer & buf) { readPODBinary(x.value, buf); } inline void readBinary(LocalDate & x, ReadBuffer & buf) { readPODBinary(x, buf); } - template -requires is_arithmetic_v && (sizeof(T) <= 8) inline void readBinaryEndian(T & x, ReadBuffer & buf) { readPODBinary(x, buf); - if constexpr (std::endian::native != endian) - x = std::byteswap(x); -} - -template -requires is_big_int_v -inline void readBinaryEndian(T & x, ReadBuffer & buf) -{ - if constexpr (std::endian::native == endian) - { - for (size_t i = 0; i != std::size(x.items); ++i) - readBinaryEndian(x.items[i], buf); - } - else - { - for (size_t i = 0; i != std::size(x.items); ++i) - readBinaryEndian(x.items[std::size(x.items) - i - 1], buf); - } + transformEndianness(x); } template diff --git a/src/IO/WriteHelpers.h b/src/IO/WriteHelpers.h index cdbc952690c..26c999cb761 100644 --- a/src/IO/WriteHelpers.h +++ b/src/IO/WriteHelpers.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -1172,32 +1173,13 @@ inline void writeNullTerminatedString(const String & s, WriteBuffer & buffer) buffer.write(s.c_str(), s.size() + 1); } - template -requires is_arithmetic_v && (sizeof(T) <= 8) inline void writeBinaryEndian(T x, WriteBuffer & buf) { - if constexpr (std::endian::native != endian) - x = std::byteswap(x); + transformEndianness(x); writePODBinary(x, buf); } -template -requires is_big_int_v -inline void writeBinaryEndian(const T & x, WriteBuffer & buf) -{ - if constexpr (std::endian::native == endian) - { - for (size_t i = 0; i != std::size(x.items); ++i) - writeBinaryEndian(x.items[i], buf); - } - else - { - for (size_t i = 0; i != std::size(x.items); ++i) - writeBinaryEndian(x.items[std::size(x.items) - i - 1], buf); - } -} - template inline void writeBinaryLittleEndian(T x, WriteBuffer & buf) { From 4b46486491a80ab7a0f2bd62d6c6e4fb606aa429 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Wed, 31 May 2023 21:52:58 +0200 Subject: [PATCH 0199/1072] Clean up --- ...lumnsStructureToQueryWithClusterEngine.cpp | 52 ------------------- 1 file changed, 52 deletions(-) delete mode 100644 src/Storages/addColumnsStructureToQueryWithClusterEngine.cpp diff --git a/src/Storages/addColumnsStructureToQueryWithClusterEngine.cpp b/src/Storages/addColumnsStructureToQueryWithClusterEngine.cpp deleted file mode 100644 index 106161ae620..00000000000 --- a/src/Storages/addColumnsStructureToQueryWithClusterEngine.cpp +++ /dev/null @@ -1,52 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int LOGICAL_ERROR; -} - -ASTExpressionList * extractTableFunctionArgumentsFromSelectQuery(ASTPtr & query) -{ - auto * select_query = query->as(); - if (!select_query || !select_query->tables()) - return nullptr; - - auto * tables = select_query->tables()->as(); - auto * table_expression = tables->children[0]->as()->table_expression->as(); - if (!table_expression->table_function) - return nullptr; - - auto * table_function = table_expression->table_function->as(); - return table_function->arguments->as(); -} - -void addColumnsStructureToQueryWithClusterEngine(ASTPtr & query, const String & structure, size_t max_arguments, const String & function_name) -{ - ASTExpressionList * expression_list = extractTableFunctionArgumentsFromSelectQuery(query); - if (!expression_list) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected SELECT query from table function {}, got '{}'", function_name, queryToString(query)); - auto structure_literal = std::make_shared(structure); - - if (expression_list->children.size() < 2 || expression_list->children.size() > max_arguments) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected 2 to {} arguments in {} table functions, got {}", - function_name, max_arguments, expression_list->children.size()); - - if (expression_list->children.size() == 2 || expression_list->children.size() == max_arguments - 1) - { - auto format_literal = std::make_shared("auto"); - expression_list->children.push_back(format_literal); - } - - expression_list->children.push_back(structure_literal); -} - -} From 4987c4baaa0c10ed1114155a0d35db3953f34ab9 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Wed, 31 May 2023 21:53:21 +0200 Subject: [PATCH 0200/1072] Clean up --- .../addColumnsStructureToQueryWithClusterEngine.h | 14 -------------- 1 file changed, 14 deletions(-) delete mode 100644 src/Storages/addColumnsStructureToQueryWithClusterEngine.h diff --git a/src/Storages/addColumnsStructureToQueryWithClusterEngine.h b/src/Storages/addColumnsStructureToQueryWithClusterEngine.h deleted file mode 100644 index 5939f3f43aa..00000000000 --- a/src/Storages/addColumnsStructureToQueryWithClusterEngine.h +++ /dev/null @@ -1,14 +0,0 @@ -#pragma once - -#include -#include - -namespace DB -{ - -ASTExpressionList * extractTableFunctionArgumentsFromSelectQuery(ASTPtr & query); - -/// Add structure argument for queries with s3Cluster/hdfsCluster table function. -void addColumnsStructureToQueryWithClusterEngine(ASTPtr & query, const String & structure, size_t max_arguments, const String & function_name); - -} From da09823ecebe67d2d6983ac58ee38fe51d5d9e7f Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Wed, 31 May 2023 23:14:39 +0300 Subject: [PATCH 0201/1072] Compile aggregate expressions enable by default --- src/Core/Settings.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 607be1522db..6fed70a9303 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -135,7 +135,7 @@ class IColumn; M(Bool, allow_suspicious_indices, false, "Reject primary/secondary indexes and sorting keys with identical expressions", 0) \ M(Bool, compile_expressions, true, "Compile some scalar functions and operators to native code.", 0) \ M(UInt64, min_count_to_compile_expression, 3, "The number of identical expressions before they are JIT-compiled", 0) \ - M(Bool, compile_aggregate_expressions, false, "Compile aggregate functions to native code. This feature has a bug and should not be used.", 0) \ + M(Bool, compile_aggregate_expressions, true, "Compile aggregate functions to native code. This feature has a bug and should not be used.", 0) \ M(UInt64, min_count_to_compile_aggregate_expression, 3, "The number of identical aggregate expressions before they are JIT-compiled", 0) \ M(Bool, compile_sort_description, true, "Compile sort description to native code.", 0) \ M(UInt64, min_count_to_compile_sort_description, 3, "The number of identical sort descriptions before they are JIT-compiled", 0) \ From 8b34a30455fd42928e5c89c503a25cf5d02ccff8 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Wed, 31 May 2023 22:14:57 +0200 Subject: [PATCH 0202/1072] Fix style --- src/Formats/CapnProtoSerializer.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/Formats/CapnProtoSerializer.cpp b/src/Formats/CapnProtoSerializer.cpp index e36f5fa4947..e99db23bb5e 100644 --- a/src/Formats/CapnProtoSerializer.cpp +++ b/src/Formats/CapnProtoSerializer.cpp @@ -29,7 +29,6 @@ namespace DB namespace ErrorCodes { extern const int THERE_IS_NO_COLUMN; - extern const int LOGICAL_ERROR; extern const int CAPN_PROTO_BAD_CAST; extern const int INCORRECT_DATA; extern const int ILLEGAL_COLUMN; @@ -371,7 +370,7 @@ namespace { parent_struct_builder.getBuilderImpl().setDataField(slot_offset, getValue(column, row_num)); } - + void writeRow(const ColumnPtr & column, std::unique_ptr &, capnp::DynamicList::Builder & parent_list_builder, UInt32 array_index, size_t row_num) override { parent_list_builder.getBuilderImpl().setDataElement(array_index, getValue(column, row_num)); From 4c92bc7aadf354b713d6b8f3c24728d6172f1867 Mon Sep 17 00:00:00 2001 From: tpanetti Date: Wed, 31 May 2023 15:30:26 -0700 Subject: [PATCH 0203/1072] Fix incompatible ClickHouse -> MySQL types for compability mode This adjusts specific incompatible ClickHouse types to a format that can be read and interpreted by MySQL (Ex: Int128 -> text) --- src/DataTypes/DataTypeArray.h | 2 +- src/DataTypes/DataTypeLowCardinality.cpp | 3 +- src/DataTypes/DataTypeLowCardinality.h | 4 +- src/DataTypes/DataTypeNumberBase.cpp | 8 +- src/DataTypes/DataTypeString.h | 2 +- ...show_columns_mysql_compatibility.reference | 229 ++++++++++++++++++ ...02775_show_columns_mysql_compatibility.sh} | 23 +- 7 files changed, 256 insertions(+), 15 deletions(-) create mode 100644 tests/queries/0_stateless/02775_show_columns_mysql_compatibility.reference rename tests/queries/0_stateless/{02740_show_columns_mysql_compatibility.sh => 02775_show_columns_mysql_compatibility.sh} (80%) diff --git a/src/DataTypes/DataTypeArray.h b/src/DataTypes/DataTypeArray.h index 35462df9a4e..b031f411975 100644 --- a/src/DataTypes/DataTypeArray.h +++ b/src/DataTypes/DataTypeArray.h @@ -32,7 +32,7 @@ public: } const char * getMySQLName() const override { - return "string"; + return "text"; } bool canBeInsideNullable() const override diff --git a/src/DataTypes/DataTypeLowCardinality.cpp b/src/DataTypes/DataTypeLowCardinality.cpp index 8293455cabc..b1c32317015 100644 --- a/src/DataTypes/DataTypeLowCardinality.cpp +++ b/src/DataTypes/DataTypeLowCardinality.cpp @@ -28,7 +28,8 @@ namespace ErrorCodes } DataTypeLowCardinality::DataTypeLowCardinality(DataTypePtr dictionary_type_) - : dictionary_type(std::move(dictionary_type_)) + : dictionary_type(std::move(dictionary_type_)), + mysql_name(dictionary_type->getMySQLName()) { auto inner_type = dictionary_type; if (dictionary_type->isNullable()) diff --git a/src/DataTypes/DataTypeLowCardinality.h b/src/DataTypes/DataTypeLowCardinality.h index 6fd4344311c..bcc39f58ff7 100644 --- a/src/DataTypes/DataTypeLowCardinality.h +++ b/src/DataTypes/DataTypeLowCardinality.h @@ -11,6 +11,8 @@ class DataTypeLowCardinality : public IDataType { private: DataTypePtr dictionary_type; + std::string mysql_name; + public: explicit DataTypeLowCardinality(DataTypePtr dictionary_type_); @@ -22,7 +24,7 @@ public: return "LowCardinality(" + dictionary_type->getName() + ")"; } const char * getFamilyName() const override { return "LowCardinality"; } - const char * getMySQLName() const override { return "text"; } + const char * getMySQLName() const override { return mysql_name.c_str(); } TypeIndex getTypeId() const override { return TypeIndex::LowCardinality; } diff --git a/src/DataTypes/DataTypeNumberBase.cpp b/src/DataTypes/DataTypeNumberBase.cpp index cd5e73ac4a1..7d200de7996 100644 --- a/src/DataTypes/DataTypeNumberBase.cpp +++ b/src/DataTypes/DataTypeNumberBase.cpp @@ -36,14 +36,14 @@ const std::map DataTypeNumberBase::mysqlTypeMap = { {"UInt16", "smallint unsigned"}, {"UInt32", "mediumint unsigned"}, {"UInt64", "bigint unsigned"}, - {"UInt128", "bigint unsigned"}, - {"UInt256", "bigint unsigned"}, + {"UInt128", "text"}, + {"UInt256", "text"}, {"Int8", "tinyint"}, {"Int16", "smallint"}, {"Int32", "int"}, {"Int64", "bigint"}, - {"Int128", "bigint"}, - {"Int256", "bigint"}, + {"Int128", "text"}, + {"Int256", "text"}, {"Float32", "float"}, {"Float64", "double"}, }; diff --git a/src/DataTypes/DataTypeString.h b/src/DataTypes/DataTypeString.h index 3ac739fe68c..bddfb4ae287 100644 --- a/src/DataTypes/DataTypeString.h +++ b/src/DataTypes/DataTypeString.h @@ -22,7 +22,7 @@ public: } // FIXME: string can contain arbitrary bytes, not only UTF-8 sequences - const char * getMySQLName() const override { return "text"; } + const char * getMySQLName() const override { return "blob"; } TypeIndex getTypeId() const override { return type_id; } diff --git a/tests/queries/0_stateless/02775_show_columns_mysql_compatibility.reference b/tests/queries/0_stateless/02775_show_columns_mysql_compatibility.reference new file mode 100644 index 00000000000..96e542611c6 --- /dev/null +++ b/tests/queries/0_stateless/02775_show_columns_mysql_compatibility.reference @@ -0,0 +1,229 @@ +Drop tables if they exist +Create tab table +Create pseudo-random database name +Create tab duplicate table +Run MySQL test +field type null key default extra +aggregate_function text 0 NULL +array_value text 0 NULL +boolean_value tinyint unsigned 0 NULL +date32_value date 0 NULL +date_value date 0 NULL +datetime64_value datetime 0 NULL +datetime_value datetime 0 NULL +decimal_value decimal 0 NULL +enum_value enum('apple', 'banana', 'orange') 0 NULL +fixed_string_value text 0 NULL +float32 float 0 NULL +float64 double 0 NULL +int32 int 0 NULL +ipv4_value text 0 NULL +ipv6_value text 0 NULL +json_value json 0 NULL +low_cardinality blob 0 NULL +low_cardinality_date datetime 0 NULL +map_value json 0 NULL +nested.nested_int text 0 NULL +nested.nested_string text 0 NULL +nullable_value int 0 NULL +string_value blob 0 NULL +tuple_value json 0 NULL +uint64 bigint unsigned 0 PRI SOR NULL +uuid_value char 0 NULL +field type null key default extra +aggregate_function text 0 NULL +array_value text 0 NULL +boolean_value tinyint unsigned 0 NULL +date32_value date 0 NULL +date_value date 0 NULL +datetime64_value datetime 0 NULL +datetime_value datetime 0 NULL +decimal_value decimal 0 NULL +enum_value enum('apple', 'banana', 'orange') 0 NULL +fixed_string_value text 0 NULL +float32 float 0 NULL +float64 double 0 NULL +int32 int 0 NULL +ipv4_value text 0 NULL +ipv6_value text 0 NULL +json_value json 0 NULL +low_cardinality blob 0 NULL +low_cardinality_date datetime 0 NULL +map_value json 0 NULL +nested.nested_int text 0 NULL +nested.nested_string text 0 NULL +nullable_value int 0 NULL +string_value blob 0 NULL +tuple_value json 0 NULL +uint64 bigint unsigned 0 PRI SOR NULL +uuid_value char 0 NULL +field type null key default extra collation comment privileges +aggregate_function text 0 NULL NULL +array_value text 0 NULL NULL +boolean_value tinyint unsigned 0 NULL NULL +date32_value date 0 NULL NULL +date_value date 0 NULL NULL +datetime64_value datetime 0 NULL NULL +datetime_value datetime 0 NULL NULL +decimal_value decimal 0 NULL NULL +enum_value enum('apple', 'banana', 'orange') 0 NULL NULL +fixed_string_value text 0 NULL NULL +float32 float 0 NULL NULL +float64 double 0 NULL NULL +int32 int 0 NULL NULL +ipv4_value text 0 NULL NULL +ipv6_value text 0 NULL NULL +json_value json 0 NULL NULL +low_cardinality blob 0 NULL NULL +low_cardinality_date datetime 0 NULL NULL +map_value json 0 NULL NULL +nested.nested_int text 0 NULL NULL +nested.nested_string text 0 NULL NULL +nullable_value int 0 NULL NULL +string_value blob 0 NULL NULL +tuple_value json 0 NULL NULL +uint64 bigint unsigned 0 PRI SOR NULL NULL +uuid_value char 0 NULL NULL +field type null key default extra +int32 int 0 NULL +nested.nested_int text 0 NULL +uint64 bigint unsigned 0 PRI SOR NULL +field type null key default extra +aggregate_function text 0 NULL +array_value text 0 NULL +boolean_value tinyint unsigned 0 NULL +date32_value date 0 NULL +date_value date 0 NULL +datetime64_value datetime 0 NULL +datetime_value datetime 0 NULL +decimal_value decimal 0 NULL +enum_value enum('apple', 'banana', 'orange') 0 NULL +fixed_string_value text 0 NULL +float32 float 0 NULL +float64 double 0 NULL +ipv4_value text 0 NULL +ipv6_value text 0 NULL +json_value json 0 NULL +low_cardinality blob 0 NULL +low_cardinality_date datetime 0 NULL +map_value json 0 NULL +nested.nested_string text 0 NULL +nullable_value int 0 NULL +string_value blob 0 NULL +tuple_value json 0 NULL +uuid_value char 0 NULL +field type null key default extra +int32 int 0 NULL +nested.nested_int text 0 NULL +uint64 bigint unsigned 0 PRI SOR NULL +field type null key default extra +aggregate_function text 0 NULL +array_value text 0 NULL +boolean_value tinyint unsigned 0 NULL +date32_value date 0 NULL +date_value date 0 NULL +datetime64_value datetime 0 NULL +datetime_value datetime 0 NULL +decimal_value decimal 0 NULL +enum_value enum('apple', 'banana', 'orange') 0 NULL +fixed_string_value text 0 NULL +float32 float 0 NULL +float64 double 0 NULL +ipv4_value text 0 NULL +ipv6_value text 0 NULL +json_value json 0 NULL +low_cardinality blob 0 NULL +low_cardinality_date datetime 0 NULL +map_value json 0 NULL +nested.nested_string text 0 NULL +nullable_value int 0 NULL +string_value blob 0 NULL +tuple_value json 0 NULL +uuid_value char 0 NULL +field type null key default extra +int32 int 0 NULL +nested.nested_int text 0 NULL +uint64 bigint unsigned 0 PRI SOR NULL +field type null key default extra +aggregate_function text 0 NULL +field type null key default extra +aggregate_function text 0 NULL +array_value text 0 NULL +boolean_value tinyint unsigned 0 NULL +date32_value date 0 NULL +date_value date 0 NULL +datetime64_value datetime 0 NULL +datetime_value datetime 0 NULL +decimal_value decimal 0 NULL +enum_value enum('apple', 'banana', 'orange') 0 NULL +fixed_string_value text 0 NULL +float32 float 0 NULL +float64 double 0 NULL +int32 int 0 NULL +ipv4_value text 0 NULL +ipv6_value text 0 NULL +json_value json 0 NULL +low_cardinality blob 0 NULL +low_cardinality_date datetime 0 NULL +map_value json 0 NULL +nested.nested_int text 0 NULL +nested.nested_string text 0 NULL +nullable_value int 0 NULL +string_value blob 0 NULL +tuple_value json 0 NULL +uint64 bigint unsigned 0 PRI SOR NULL +uuid_value char 0 NULL +field type null key default extra +aggregate_function text 0 NULL +array_value text 0 NULL +boolean_value tinyint unsigned 0 NULL +date32_value date 0 NULL +date_value date 0 NULL +datetime64_value datetime 0 NULL +datetime_value datetime 0 NULL +decimal_value decimal 0 NULL +enum_value enum('apple', 'banana', 'orange') 0 NULL +fixed_string_value text 0 NULL +float32 float 0 NULL +float64 double 0 NULL +int32 int 0 NULL +ipv4_value text 0 NULL +ipv6_value text 0 NULL +json_value json 0 NULL +low_cardinality blob 0 NULL +low_cardinality_date datetime 0 NULL +map_value json 0 NULL +nested.nested_int text 0 NULL +nested.nested_string text 0 NULL +nullable_value int 0 NULL +string_value blob 0 NULL +tuple_value json 0 NULL +uint64 bigint unsigned 0 PRI SOR NULL +uuid_value char 0 NULL +field type null key default extra +aggregate_function text 0 NULL +array_value text 0 NULL +boolean_value tinyint unsigned 0 NULL +date32_value date 0 NULL +date_value date 0 NULL +datetime64_value datetime 0 NULL +datetime_value datetime 0 NULL +decimal_value decimal 0 NULL +enum_value enum('apple', 'banana', 'orange') 0 NULL +fixed_string_value text 0 NULL +float32 float 0 NULL +float64 double 0 NULL +int32 int 0 NULL +ipv4_value text 0 NULL +ipv6_value text 0 NULL +json_value json 0 NULL +low_cardinality blob 0 NULL +low_cardinality_date datetime 0 NULL +map_value json 0 NULL +nested.nested_int text 0 NULL +nested.nested_string text 0 NULL +nullable_value int 0 NULL +string_value blob 0 NULL +tuple_value json 0 NULL +uint64 bigint unsigned 0 PRI SOR NULL +uuid_value char 0 NULL diff --git a/tests/queries/0_stateless/02740_show_columns_mysql_compatibility.sh b/tests/queries/0_stateless/02775_show_columns_mysql_compatibility.sh similarity index 80% rename from tests/queries/0_stateless/02740_show_columns_mysql_compatibility.sh rename to tests/queries/0_stateless/02775_show_columns_mysql_compatibility.sh index 7f828d35679..a446c6e817e 100755 --- a/tests/queries/0_stateless/02740_show_columns_mysql_compatibility.sh +++ b/tests/queries/0_stateless/02775_show_columns_mysql_compatibility.sh @@ -13,8 +13,11 @@ ${CLICKHOUSE_LOCAL} --query "DROP TABLE IF EXISTS tab" ${CLICKHOUSE_LOCAL} --query "DROP TABLE IF EXISTS database_123456789abcde" ${CLICKHOUSE_LOCAL} --query "DROP TABLE IF EXISTS database_123456789abcde.tab" +#${CLICKHOUSE_LOCAL} --query "SET allow_suspicious_low_cardinality_types = 1;" echo "Create tab table " -${CLICKHOUSE_LOCAL} --query " +${CLICKHOUSE_LOCAL} -n -q " + SET allow_suspicious_low_cardinality_types=1; + SET allow_experimental_object_type =1; CREATE TABLE tab ( uint64 UInt64, @@ -22,17 +25,19 @@ ${CLICKHOUSE_LOCAL} --query " float32 Float32, float64 Float64, decimal_value Decimal(10, 2), - boolean_value UInt8, -- Use 0 for false, 1 for true + boolean_value UInt8, string_value String, fixed_string_value FixedString(10), date_value Date, date32_value Date32, datetime_value DateTime, datetime64_value DateTime64(3), - json_value String, -- Store JSON as a string + json_value JSON, uuid_value UUID, enum_value Enum8('apple' = 1, 'banana' = 2, 'orange' = 3), low_cardinality LowCardinality(String), + low_cardinality_date LowCardinality(DateTime), + aggregate_function AggregateFunction(sum, Int32), array_value Array(Int32), map_value Map(String, Int32), tuple_value Tuple(Int32, String), @@ -53,7 +58,9 @@ echo "Create pseudo-random database name" ${CLICKHOUSE_LOCAL} --query "CREATE DATABASE database_123456789abcde;" echo "Create tab duplicate table" -${CLICKHOUSE_LOCAL} --query " +${CLICKHOUSE_LOCAL} -n -q " + SET allow_suspicious_low_cardinality_types=1; + SET allow_experimental_object_type =1; CREATE TABLE database_123456789abcde.tab ( uint64 UInt64, @@ -61,17 +68,19 @@ ${CLICKHOUSE_LOCAL} --query " float32 Float32, float64 Float64, decimal_value Decimal(10, 2), - boolean_value UInt8, -- Use 0 for false, 1 for true + boolean_value UInt8, string_value String, fixed_string_value FixedString(10), date_value Date, date32_value Date32, datetime_value DateTime, datetime64_value DateTime64(3), - json_value String, -- Store JSON as a string + json_value JSON, uuid_value UUID, enum_value Enum8('apple' = 1, 'banana' = 2, 'orange' = 3), low_cardinality LowCardinality(String), + low_cardinality_date LowCardinality(DateTime), + aggregate_function AggregateFunction(sum, Int32), array_value Array(Int32), map_value Map(String, Int32), tuple_value Tuple(Int32, String), @@ -109,7 +118,7 @@ EOT # Now run the MySQL test script on the ClickHouse DB echo "Run MySQL test" -mysql --user="$USER" --password="$PASSWORD" --host="$HOST" --port="$PORT" < $TEMP_FILE +${MYSQL_CLIENT} --user="$USER" --password="$PASSWORD" --host="$HOST" --port="$PORT" < $TEMP_FILE # Clean up the temp file rm $TEMP_FILE From 801a4d574294b76a7f660bdf2e698d534f861ab3 Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Wed, 31 May 2023 20:08:22 -0300 Subject: [PATCH 0204/1072] test for #42610 --- ...10_view_dictionary_check_refresh.reference | 4 ++ .../01910_view_dictionary_check_refresh.sql | 54 +++++++++++++++++++ 2 files changed, 58 insertions(+) create mode 100644 tests/queries/0_stateless/01910_view_dictionary_check_refresh.reference create mode 100644 tests/queries/0_stateless/01910_view_dictionary_check_refresh.sql diff --git a/tests/queries/0_stateless/01910_view_dictionary_check_refresh.reference b/tests/queries/0_stateless/01910_view_dictionary_check_refresh.reference new file mode 100644 index 00000000000..c1be003ebef --- /dev/null +++ b/tests/queries/0_stateless/01910_view_dictionary_check_refresh.reference @@ -0,0 +1,4 @@ +view 1 2022-10-20 first +dict 1 2022-10-20 first +view 1 2022-10-21 second +dict 1 2022-10-21 second diff --git a/tests/queries/0_stateless/01910_view_dictionary_check_refresh.sql b/tests/queries/0_stateless/01910_view_dictionary_check_refresh.sql new file mode 100644 index 00000000000..b36a378d827 --- /dev/null +++ b/tests/queries/0_stateless/01910_view_dictionary_check_refresh.sql @@ -0,0 +1,54 @@ +-- Tags: long + +DROP DICTIONARY IF EXISTS TestTblDict; +DROP VIEW IF EXISTS TestTbl_view; +DROP TABLE IF EXISTS TestTbl; + +CREATE TABLE TestTbl +( + `id` UInt16, + `dt` Date, + `val` String +) +ENGINE = MergeTree +PARTITION BY dt +ORDER BY (id); + +CREATE VIEW TestTbl_view +AS +SELECT * +FROM TestTbl +WHERE dt = ( SELECT max(dt) FROM TestTbl ); + +CREATE DICTIONARY IF NOT EXISTS TestTblDict +( + `id` UInt16, + `dt` Date, + `val` String +) +PRIMARY KEY id +SOURCE(CLICKHOUSE(TABLE TestTbl_view DB currentDatabase())) +LIFETIME(1) +LAYOUT(COMPLEX_KEY_HASHED()); + +select 'view' src,* FROM TestTbl_view; +select 'dict' src,* FROM TestTblDict ; + +insert into TestTbl values(1, '2022-10-20', 'first'); + +SELECT sleep(3) from numbers(4) settings max_block_size= 1 format Null; + +select 'view' src,* FROM TestTbl_view; +select 'dict' src,* FROM TestTblDict ; + +insert into TestTbl values(1, '2022-10-21', 'second'); + +SELECT sleep(3) from numbers(4) settings max_block_size= 1 format Null; + +select 'view' src,* FROM TestTbl_view; +select 'dict' src,* FROM TestTblDict ; + +DROP DICTIONARY IF EXISTS TestTblDict; +DROP VIEW IF EXISTS TestTbl_view; +DROP TABLE IF EXISTS TestTbl; + From ad19d2142f86d9b205d3635685e092e8d61eebe4 Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Wed, 31 May 2023 20:17:54 -0300 Subject: [PATCH 0205/1072] ping ci From 5dbce62ec365f8d4801faef4ef5a0cd8eb117615 Mon Sep 17 00:00:00 2001 From: lgbo-ustc Date: Wed, 31 May 2023 17:37:20 +0800 Subject: [PATCH 0206/1072] fixed. without apply prepareRightBlock will cause mismatch block structure --- src/Interpreters/GraceHashJoin.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Interpreters/GraceHashJoin.cpp b/src/Interpreters/GraceHashJoin.cpp index f54ee9d85c7..0dd61ff2793 100644 --- a/src/Interpreters/GraceHashJoin.cpp +++ b/src/Interpreters/GraceHashJoin.cpp @@ -623,6 +623,7 @@ Block GraceHashJoin::prepareRightBlock(const Block & block) void GraceHashJoin::addJoinedBlockImpl(Block block) { + block = prepareRightBlock(block); Buckets buckets_snapshot = getCurrentBuckets(); size_t bucket_index = current_bucket->idx; Block current_block; From 6334b6d5698f81429f5ae1351172ae89f5d7d2ea Mon Sep 17 00:00:00 2001 From: Manas Alekar Date: Wed, 31 May 2023 17:54:30 -0700 Subject: [PATCH 0207/1072] Address case where cpu cgroup is set to max. --- src/Common/AsynchronousMetrics.cpp | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/src/Common/AsynchronousMetrics.cpp b/src/Common/AsynchronousMetrics.cpp index 168dd3f0c4a..a4cb18249b6 100644 --- a/src/Common/AsynchronousMetrics.cpp +++ b/src/Common/AsynchronousMetrics.cpp @@ -945,12 +945,22 @@ void AsynchronousMetrics::update(TimePoint update_time) uint64_t quota = 0; uint64_t period = 0; - readText(quota, *cgroupcpu_max); - skipWhitespaceIfAny(*cgroupcpu_max); - readText(period, *cgroupcpu_max); + std::string line; + readText(line, *cgroupcpu_max); + + auto space = line.find_first_of(" "); + + if (line.rfind("max", 0) == std::string::npos) + { + auto field1 = line.substr(0, space); + quota = std::stoull(field1); + } + + auto field2 = line.substr(space + 1); + period = std::stoull(field2); new_values["CGroupCpuCfsPeriod"] = { period, "The CFS period of CPU cgroup."}; - new_values["CGroupCpuCfsQuota"] = { quota, "The CFS quota of CPU cgroup."}; + new_values["CGroupCpuCfsQuota"] = { quota, "The CFS quota of CPU cgroup. If stated zero, the quota is max."}; } catch (...) { @@ -970,7 +980,7 @@ void AsynchronousMetrics::update(TimePoint update_time) tryReadText(period, *cgroupcpu_cfs_period); new_values["CGroupCpuCfsPeriod"] = { period, "The CFS period of CPU cgroup."}; - new_values["CGroupCpuCfsQuota"] = { quota, "The CFS quota of CPU cgroup."}; + new_values["CGroupCpuCfsQuota"] = { quota, "The CFS quota of CPU cgroup. If stated zero, the quota is max."}; } catch (...) { From c9d0d217f5d180f5c78286b1b819534dd8cc1b59 Mon Sep 17 00:00:00 2001 From: frinkr Date: Wed, 31 May 2023 19:53:06 +0800 Subject: [PATCH 0208/1072] fix Keeper deadlock on exception when preprocessing requests --- src/Coordination/KeeperStateMachine.cpp | 11 ++++++++++- src/Coordination/KeeperStateMachine.h | 2 ++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/src/Coordination/KeeperStateMachine.cpp b/src/Coordination/KeeperStateMachine.cpp index 6635c74149a..f787cc8722e 100644 --- a/src/Coordination/KeeperStateMachine.cpp +++ b/src/Coordination/KeeperStateMachine.cpp @@ -272,7 +272,8 @@ bool KeeperStateMachine::preprocess(const KeeperStorage::RequestForSession & req } catch (...) { - rollbackRequest(request_for_session, true); + tryLogCurrentException(__PRETTY_FUNCTION__); + rollbackRequestNoLock(request_for_session, true); throw; } @@ -411,6 +412,14 @@ void KeeperStateMachine::rollbackRequest(const KeeperStorage::RequestForSession storage->rollbackRequest(request_for_session.zxid, allow_missing); } +void KeeperStateMachine::rollbackRequestNoLock(const KeeperStorage::RequestForSession & request_for_session, bool allow_missing) +{ + if (request_for_session.request->getOpNum() == Coordination::OpNum::SessionID) + return; + + storage->rollbackRequest(request_for_session.zxid, allow_missing); +} + nuraft::ptr KeeperStateMachine::last_snapshot() { /// Just return the latest snapshot. diff --git a/src/Coordination/KeeperStateMachine.h b/src/Coordination/KeeperStateMachine.h index fbd97fd8631..f6d81d23056 100644 --- a/src/Coordination/KeeperStateMachine.h +++ b/src/Coordination/KeeperStateMachine.h @@ -68,6 +68,8 @@ public: // (can happen in case of exception during preprocessing) void rollbackRequest(const KeeperStorage::RequestForSession & request_for_session, bool allow_missing); + void rollbackRequestNoLock(const KeeperStorage::RequestForSession & request_for_session, bool allow_missing); + uint64_t last_commit_index() override { return last_committed_idx; } /// Apply preliminarily saved (save_logical_snp_obj) snapshot to our state. From 08d98329b07ff772812999059a45af03352be030 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 1 Jun 2023 10:12:09 +0200 Subject: [PATCH 0209/1072] Revert "Add SQL functions for Entropy Learned Hashing" --- .../sql-reference/functions/hash-functions.md | 71 ---- src/Functions/EntropyLearnedHash.cpp | 395 ------------------ ...new_functions_must_be_documented.reference | 3 - .../02734_entropy_learned_hashing.reference | 18 - .../02734_entropy_learned_hashing.sql | 30 -- 5 files changed, 517 deletions(-) delete mode 100644 src/Functions/EntropyLearnedHash.cpp delete mode 100644 tests/queries/0_stateless/02734_entropy_learned_hashing.reference delete mode 100644 tests/queries/0_stateless/02734_entropy_learned_hashing.sql diff --git a/docs/en/sql-reference/functions/hash-functions.md b/docs/en/sql-reference/functions/hash-functions.md index 89afcca3799..8dfa03ceaf2 100644 --- a/docs/en/sql-reference/functions/hash-functions.md +++ b/docs/en/sql-reference/functions/hash-functions.md @@ -560,77 +560,6 @@ Result: └───────────────────────────┘ ``` -## Entropy-learned hashing (experimental) - -Entropy-learned hashing is disabled by default, to enable: `SET allow_experimental_hash_functions=1`. - -Entropy-learned hashing is not a standalone hash function like `metroHash64`, `cityHash64`, `sipHash64` etc. Instead, it aims to preprocess -the data to be hashed in a way that a standalone hash function can be computed more efficiently while not compromising the hash quality, -i.e. the randomness of the hashes. For that, entropy-based hashing chooses a subset of the bytes in a training data set of Strings which has -the same randomness (entropy) as the original Strings. For example, if the Strings are in average 100 bytes long, and we pick a subset of 5 -bytes, then a hash function will be 95% less expensive to evaluate. For details of the method, refer to [Entropy-Learned Hashing: Constant -Time Hashing with Controllable Uniformity](https://doi.org/10.1145/3514221.3517894). - -Entropy-learned hashing has two phases: - -1. A training phase on a representative but typically small set of Strings to be hashed. Training consists of two steps: - - - Function `prepareTrainEntropyLearnedHash(data, id)` caches the training data in a global state under a given `id`. It returns dummy - value `0` on every row. - - Function `trainEntropyLearnedHash(id)` computes a minimal partial sub-key of the training data stored stored under `id` in the global - state. The cached training data in the global state is replaced by the partial key. Dummy value `0` is returned on every row. - -2. An evaluation phase where hashes are computed using the previously calculated partial sub-keys. Function `entropyLearnedHash(data, id)` - hashes `data` using the partial subkey stored as `id`. CityHash64 is used as hash function. - -The reason that the training phase comprises two steps is that ClickHouse processes data at chunk granularity but entropy-learned hashing -needs to process the entire training set at once. - -Since functions `prepareTrainEntropyLearnedHash()` and `trainEntropyLearnedHash()` access global state, they should not be called in -parallel with the same `id`. - -**Syntax** - -``` sql -prepareTrainEntropyLearnedHash(data, id); -trainEntropyLearnedHash(id); -entropyLearnedHash(data, id); -``` - -**Example** - -```sql -SET allow_experimental_hash_functions=1; -CREATE TABLE tab (col String) ENGINE=Memory; -INSERT INTO tab VALUES ('aa'), ('ba'), ('ca'); - -SELECT prepareTrainEntropyLearnedHash(col, 'id1') AS prepared FROM tab; -SELECT trainEntropyLearnedHash('id1') AS trained FROM tab; -SELECT entropyLearnedHash(col, 'id1') as hashes FROM tab; -``` - -Result: - -``` response -┌─prepared─┐ -│ 0 │ -│ 0 │ -│ 0 │ -└──────────┘ - -┌─trained─┐ -│ 0 │ -│ 0 │ -│ 0 │ -└─────────┘ - -┌───────────────hashes─┐ -│ 2603192927274642682 │ -│ 4947675599669400333 │ -│ 10783339242466472992 │ -└──────────────────────┘ -``` - ## metroHash64 Produces a 64-bit [MetroHash](http://www.jandrewrogers.com/2015/05/27/metrohash/) hash value. diff --git a/src/Functions/EntropyLearnedHash.cpp b/src/Functions/EntropyLearnedHash.cpp deleted file mode 100644 index 854379bbb9d..00000000000 --- a/src/Functions/EntropyLearnedHash.cpp +++ /dev/null @@ -1,395 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/// Implementation of entropy-learned hashing: https://doi.org/10.1145/3514221.3517894 -/// If you change something in this file, please don't deviate too much from the pseudocode in the paper! - -/// TODOs for future work: -/// - allow to specify an arbitrary hash function (currently always CityHash is used) -/// - allow function chaining a la entropyLearnedHash(trainEntropyLearnedHash()) -/// - support more datatypes for data (besides String) - - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int BAD_ARGUMENTS; - extern const int ILLEGAL_COLUMN; - extern const int SUPPORT_IS_DISABLED; -} - -namespace -{ - -using PartialKeyPositions = std::vector; -using Entropies = std::vector; - -void getPartialKey(std::string_view key, const PartialKeyPositions & partial_key_positions, String & result) -{ - result.clear(); - result.reserve(partial_key_positions.size()); - - for (auto partial_key_position : partial_key_positions) - if (partial_key_position < key.size()) - result.push_back(key[partial_key_position]); -} - -bool allPartialKeysAreUnique(const std::vector & keys, const PartialKeyPositions & partial_key_positions) -{ - std::unordered_set unique_partial_keys; - unique_partial_keys.reserve(keys.size()); - String partial_key; - - for (const auto & key : keys) - { - getPartialKey(key, partial_key_positions, partial_key); - if (!unique_partial_keys.insert(partial_key).second) - return false; - } - - return true; -} - -// NextByte returns position of byte which adds the most entropy and the new entropy -std::pair nextByte(const std::vector & keys, size_t max_len, PartialKeyPositions & partial_key_positions) -{ - size_t min_collisions = std::numeric_limits::max(); - size_t best_position = 0; - - std::unordered_map count_table; - count_table.reserve(keys.size()); - - String partial_key; - - for (size_t i = 0; i < max_len; ++i) - { - count_table.clear(); - - partial_key_positions.push_back(i); - size_t collisions = 0; - for (const auto & key : keys) - { - getPartialKey(key, partial_key_positions, partial_key); - collisions += count_table[partial_key]++; - } - - if (collisions < min_collisions) - { - min_collisions = collisions; - best_position = i; - } - partial_key_positions.pop_back(); - } - - return {best_position, min_collisions}; -} - -std::pair chooseBytes(const std::vector & train_data) -{ - if (train_data.size() <= 1) - return {}; - - PartialKeyPositions partial_key_positions; - Entropies entropies; - - size_t max_len = 0; /// length of the longest key in training data - for (const auto & key : train_data) - max_len = std::max(max_len, key.size()); - - while (!allPartialKeysAreUnique(train_data, partial_key_positions)) - { - auto [new_position, new_entropy] = nextByte(train_data, max_len, partial_key_positions); - if (!entropies.empty() && new_entropy == entropies.back()) - break; - partial_key_positions.push_back(new_position); - entropies.push_back(new_entropy); - } - return {partial_key_positions, entropies}; -} - -/// Contains global state to convey information between SQL functions -/// - prepareTrainEntropyLearnedHash(), -/// - trainEntropyLearnedHash() and -/// - entropyLearnedHash(). -/// -/// The reason this machinery is necessary is that ClickHouse processes data in chunks of unpredictable size, yet the training step of -/// entropy-learned hashing needs to process *all* training data in one go. The downside is that the training step becomes quite expensive :-( -class EntropyLearnedHashGlobalState -{ -public: - static EntropyLearnedHashGlobalState & instance() - { - static EntropyLearnedHashGlobalState instance; - return instance; - } - - /// Called by prepareTrainEntropyLearnedHash() - void cacheTrainingSample(const String & user_name, const String & id, IColumn::MutablePtr column) - { - std::lock_guard lock(mutex); - auto & ids_for_user = global_state[user_name]; - auto & training_samples_for_id = ids_for_user[id].training_samples; - training_samples_for_id.push_back(std::move(column)); - } - - void train(const String & user_name, const String & id) - { - std::lock_guard lock(mutex); - auto & ids_for_user = global_state[user_name]; - auto & training_samples = ids_for_user[id].training_samples; - - if (training_samples.empty()) - return; - - auto & concatenated_training_sample = training_samples[0]; - for (size_t i = 1; i < training_samples.size(); ++i) - { - auto & other_training_sample = training_samples[i]; - concatenated_training_sample->insertRangeFrom(*other_training_sample, 0, other_training_sample->size()); - } - - const ColumnString * concatenated_training_sample_string = checkAndGetColumn(*concatenated_training_sample); - if (!concatenated_training_sample_string) - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column"); - - const size_t num_rows = concatenated_training_sample_string->size(); - std::vector training_data; - for (size_t i = 0; i < num_rows; ++i) - { - std::string_view string_view = concatenated_training_sample_string->getDataAt(i).toView(); - training_data.emplace_back(string_view); - } - - PartialKeyPositions partial_key_positions = chooseBytes(training_data).first; - - ids_for_user[id].partial_key_positions = partial_key_positions; - training_samples.clear(); - } - - const PartialKeyPositions & getPartialKeyPositions(const String & user_name, const String & id) const - { - std::lock_guard lock(mutex); - auto it_user = global_state.find(user_name); - if (it_user == global_state.end()) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Id {} not registered for user in entropy learned hashing", id); - auto it_id = it_user->second.find(id); - if (it_id == it_user->second.end()) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Id {} not registered for user in entropy learned hashing", id); - return it_id->second.partial_key_positions; - } - -private: - mutable std::mutex mutex; - - /// The state. - struct ColumnsAndPartialKeyPositions - { - /// Caches training data chunks. Filled by prepareTrainEntropyLearnedHash(), cleared by trainEntropyLearnedHash(). - MutableColumns training_samples; - /// The result of the training phase. Filled by trainEntropyLearnedHash(). - PartialKeyPositions partial_key_positions; - }; - - /// Maps a state id to the state. - using IdToColumnsAndPartialKeyPositions = std::map; - - /// Maps the user name to a state id. As a result, the state id is unique at user scope. - using UserNameToId = std::map; - - UserNameToId global_state TSA_GUARDED_BY(mutex); -}; - -} - - -/// Copies all chunks of the training sample column into the global state under a given id. -class FunctionPrepareTrainEntropyLearnedHash : public IFunction -{ -public: - static constexpr auto name = "prepareTrainEntropyLearnedHash"; - static FunctionPtr create(ContextPtr context) - { - if (!context->getSettings().allow_experimental_hash_functions) - throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, - "Entropy-learned hashing is experimental. Set `allow_experimental_hash_functions` setting to enable it"); - - return std::make_shared(context->getUserName()); - } - explicit FunctionPrepareTrainEntropyLearnedHash(const String & user_name_) : IFunction(), user_name(user_name_) {} - - String getName() const override { return name; } - size_t getNumberOfArguments() const override { return 2; } - bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } - bool useDefaultImplementationForConstants() const override { return true; } - ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; } - - DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override - { - FunctionArgumentDescriptors args{ - {"data", &isString, nullptr, "String"}, - {"id", &isString, nullptr, "String"} - }; - - validateFunctionArgumentTypes(*this, arguments, args); - - return std::make_shared(); - } - - ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t) const override - { - const IColumn * id_col = arguments[1].column.get(); - const ColumnConst * id_col_const = checkAndGetColumn(id_col); - const String id = id_col_const->getValue(); - - IColumn::Ptr data_col = arguments[0].column; - IColumn::MutablePtr data_col_mutable = IColumn::mutate(data_col); - - auto & global_state = EntropyLearnedHashGlobalState::instance(); - global_state.cacheTrainingSample(user_name, id, std::move(data_col_mutable)); - - const size_t num_rows = data_col->size(); - return result_type->createColumnConst(num_rows, 0u); /// dummy output - } -private: - const String user_name; -}; - - -/// 1. Concatenates the training samples of a given id in the global state. -/// 2. Computes the partial key positions from the concatenated training samples and stores that in the global state. -/// 3. clear()-s the training samples in the global state. -class FunctionTrainEntropyLearnedHash : public IFunction -{ -public: - static constexpr auto name = "trainEntropyLearnedHash"; - static FunctionPtr create(ContextPtr context) - { - if (!context->getSettings().allow_experimental_hash_functions) - throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, - "Entropy-learned hashing is experimental. Set `allow_experimental_hash_functions` setting to enable it"); - return std::make_shared(context->getUserName()); - } - explicit FunctionTrainEntropyLearnedHash(const String & user_name_) : IFunction(), user_name(user_name_) {} - - String getName() const override { return name; } - size_t getNumberOfArguments() const override { return 1; } - bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } - bool useDefaultImplementationForConstants() const override { return false; } - - DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override - { - FunctionArgumentDescriptors args{ - {"id", &isString, nullptr, "String"} - }; - - validateFunctionArgumentTypes(*this, arguments, args); - - return std::make_shared(); - } - - ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t) const override - { - const IColumn * id_col = arguments[0].column.get(); - const ColumnConst * id_col_const = checkAndGetColumn(id_col); - if (!id_col_const) - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of argument of function {}", - arguments.begin()->column->getName(), getName()); - - auto & global_state = EntropyLearnedHashGlobalState::instance(); - - const String id = id_col_const->getValue(); - global_state.train(user_name, id); - - const size_t num_rows = id_col->size(); - return result_type->createColumnConst(num_rows, 0u); /// dummy output - } -private: - const String user_name; -}; - - -/// Hashes input strings using partial key positions stored in the global state. -class FunctionEntropyLearnedHash : public IFunction -{ -public: - static constexpr auto name = "entropyLearnedHash"; - static FunctionPtr create(ContextPtr context) - { - if (!context->getSettings().allow_experimental_hash_functions) - throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, - "Entropy-learned hashing experimental. Set `allow_experimental_hash_functions` setting to enable it"); - return std::make_shared(context->getUserName()); - } - explicit FunctionEntropyLearnedHash(const String & user_name_) : IFunction(), user_name(user_name_) {} - - String getName() const override { return name; } - size_t getNumberOfArguments() const override { return 2; } - bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } - bool useDefaultImplementationForConstants() const override { return true; } - ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; } - - - DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override - { - FunctionArgumentDescriptors args{ - {"data", &isString, nullptr, "String"}, - {"id", &isString, nullptr, "String"} - }; - - validateFunctionArgumentTypes(*this, arguments, args); - - return std::make_shared(); - } - - ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t) const override - { - const IColumn * id_col = arguments.back().column.get(); - const ColumnConst * id_col_const = checkAndGetColumn(id_col); - const String id = id_col_const->getValue(); - - const auto & global_state = EntropyLearnedHashGlobalState::instance(); - const auto & partial_key_positions = global_state.getPartialKeyPositions(user_name, id); - - const auto * data_col = arguments[0].column.get(); - if (const auto * col_data_string = checkAndGetColumn(data_col)) - { - const size_t num_rows = col_data_string->size(); - auto col_res = ColumnUInt64::create(num_rows); - - auto & col_res_vec = col_res->getData(); - String partial_key; - for (size_t i = 0; i < num_rows; ++i) - { - std::string_view string_ref = col_data_string->getDataAt(i).toView(); - getPartialKey(string_ref, partial_key_positions, partial_key); - col_res_vec[i] = CityHash_v1_0_2::CityHash64(partial_key.data(), partial_key.size()); - } - - return col_res; - } - else - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of argument of function {}", - arguments.begin()->column->getName(), getName()); - } -private: - const String user_name; -}; - -REGISTER_FUNCTION(EntropyLearnedHash) -{ - factory.registerFunction(); - factory.registerFunction(); - factory.registerFunction(); -} - -} diff --git a/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference b/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference index b2c9073648e..7ab26982402 100644 --- a/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference +++ b/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference @@ -266,7 +266,6 @@ encodeURLComponent encodeURLFormComponent encodeXMLComponent endsWith -entropyLearnedHash equals erf erfc @@ -559,7 +558,6 @@ positionCaseInsensitive positionCaseInsensitiveUTF8 positionUTF8 pow -prepareTrainEntropyLearnedHash proportionsZTest protocol queryID @@ -864,7 +862,6 @@ toYear toYearWeek today tokens -trainEntropyLearnedHash transactionID transactionLatestSnapshot transactionOldestSnapshot diff --git a/tests/queries/0_stateless/02734_entropy_learned_hashing.reference b/tests/queries/0_stateless/02734_entropy_learned_hashing.reference deleted file mode 100644 index f558e3cd444..00000000000 --- a/tests/queries/0_stateless/02734_entropy_learned_hashing.reference +++ /dev/null @@ -1,18 +0,0 @@ -0 -0 -0 -0 -0 -0 -2603192927274642682 -4947675599669400333 -10783339242466472992 -0 -0 -0 -0 -0 -0 -2603192927274642682 -4947675599669400333 -10783339242466472992 diff --git a/tests/queries/0_stateless/02734_entropy_learned_hashing.sql b/tests/queries/0_stateless/02734_entropy_learned_hashing.sql deleted file mode 100644 index ae829fa03c3..00000000000 --- a/tests/queries/0_stateless/02734_entropy_learned_hashing.sql +++ /dev/null @@ -1,30 +0,0 @@ --- Tags: no-parallel --- no-parallel because entropy-learned hash uses global state - -SET allow_experimental_hash_functions = 1; - --- no commonalities between keys -DROP TABLE IF EXISTS tbl1; -CREATE TABLE tbl1 (x String) ENGINE=Memory; -INSERT INTO tbl1 VALUES ('a'), ('b'), ('c'); -SELECT prepareTrainEntropyLearnedHash(x, 'id1') FROM tbl1; -SELECT trainEntropyLearnedHash('id1') FROM tbl1; -SELECT entropyLearnedHash(x, 'id1') FROM tbl1; - --- with commonalities between keys -DROP TABLE IF EXISTS tbl2; -CREATE TABLE tbl2 (x String) ENGINE=Memory; -INSERT INTO tbl2 VALUES ('aa'), ('ba'), ('ca'); -SELECT prepareTrainEntropyLearnedHash(x, 'id2') FROM tbl2; -SELECT trainEntropyLearnedHash('id2') FROM tbl2; -SELECT entropyLearnedHash(x, 'id2') FROM tbl2; - --- negative tests -SELECT prepareTrainEntropyLearnedHash(x, 1) FROM tbl1; -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } -SELECT prepareTrainEntropyLearnedHash(1, 'id1') FROM tbl1; -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } -SELECT trainEntropyLearnedHash(1) FROM tbl1; -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } -SELECT entropyLearnedHash(1, 'id1') FROM tbl1; -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } -SELECT entropyLearnedHash(x, 'non-existing id') FROM tbl1; -- { serverError BAD_ARGUMENTS } - -DROP TABLE tbl1; -DROP TABLE tbl2; From 68d46c81b99726a1e0467fc21d02a5311b1e49ca Mon Sep 17 00:00:00 2001 From: vdimir Date: Tue, 11 Apr 2023 09:53:48 +0000 Subject: [PATCH 0210/1072] Do not store blocks in hash join if nothing is inseted --- src/Interpreters/HashJoin.cpp | 55 +++++++++++++++++++++++------------ 1 file changed, 37 insertions(+), 18 deletions(-) diff --git a/src/Interpreters/HashJoin.cpp b/src/Interpreters/HashJoin.cpp index 146b57049a6..92597f87f4b 100644 --- a/src/Interpreters/HashJoin.cpp +++ b/src/Interpreters/HashJoin.cpp @@ -543,16 +543,20 @@ namespace template struct Inserter { - static ALWAYS_INLINE void insertOne(const HashJoin & join, Map & map, KeyGetter & key_getter, Block * stored_block, size_t i, + static ALWAYS_INLINE bool insertOne(const HashJoin & join, Map & map, KeyGetter & key_getter, Block * stored_block, size_t i, Arena & pool) { auto emplace_result = key_getter.emplaceKey(map, i, pool); if (emplace_result.isInserted() || join.anyTakeLastRow()) + { new (&emplace_result.getMapped()) typename Map::mapped_type(stored_block, i); + return true; + } + return false; } - static ALWAYS_INLINE void insertAll(const HashJoin &, Map & map, KeyGetter & key_getter, Block * stored_block, size_t i, Arena & pool) + static ALWAYS_INLINE bool insertAll(const HashJoin &, Map & map, KeyGetter & key_getter, Block * stored_block, size_t i, Arena & pool) { auto emplace_result = key_getter.emplaceKey(map, i, pool); @@ -563,9 +567,10 @@ namespace /// The first element of the list is stored in the value of the hash table, the rest in the pool. emplace_result.getMapped().insert({stored_block, i}, pool); } + return true; } - static ALWAYS_INLINE void insertAsof(HashJoin & join, Map & map, KeyGetter & key_getter, Block * stored_block, size_t i, Arena & pool, + static ALWAYS_INLINE bool insertAsof(HashJoin & join, Map & map, KeyGetter & key_getter, Block * stored_block, size_t i, Arena & pool, const IColumn & asof_column) { auto emplace_result = key_getter.emplaceKey(map, i, pool); @@ -575,6 +580,7 @@ namespace if (emplace_result.isInserted()) time_series_map = new (time_series_map) typename Map::mapped_type(createAsofRowRef(asof_type, join.getAsofInequality())); (*time_series_map)->insert(asof_column, stored_block, i); + return true; } }; @@ -582,7 +588,7 @@ namespace template size_t NO_INLINE insertFromBlockImplTypeCase( HashJoin & join, Map & map, size_t rows, const ColumnRawPtrs & key_columns, - const Sizes & key_sizes, Block * stored_block, ConstNullMapPtr null_map, UInt8ColumnDataPtr join_mask, Arena & pool) + const Sizes & key_sizes, Block * stored_block, ConstNullMapPtr null_map, UInt8ColumnDataPtr join_mask, Arena & pool, bool & is_inserted) { [[maybe_unused]] constexpr bool mapped_one = std::is_same_v; constexpr bool is_asof_join = STRICTNESS == JoinStrictness::Asof; @@ -593,6 +599,7 @@ namespace auto key_getter = createKeyGetter(key_columns, key_sizes); + is_inserted = false; for (size_t i = 0; i < rows; ++i) { if (has_null_map && (*null_map)[i]) @@ -603,11 +610,11 @@ namespace continue; if constexpr (is_asof_join) - Inserter::insertAsof(join, map, key_getter, stored_block, i, pool, *asof_column); + is_inserted |= Inserter::insertAsof(join, map, key_getter, stored_block, i, pool, *asof_column); else if constexpr (mapped_one) - Inserter::insertOne(join, map, key_getter, stored_block, i, pool); + is_inserted |= Inserter::insertOne(join, map, key_getter, stored_block, i, pool); else - Inserter::insertAll(join, map, key_getter, stored_block, i, pool); + is_inserted |= Inserter::insertAll(join, map, key_getter, stored_block, i, pool); } return map.getBufferSizeInCells(); } @@ -616,32 +623,37 @@ namespace template size_t insertFromBlockImplType( HashJoin & join, Map & map, size_t rows, const ColumnRawPtrs & key_columns, - const Sizes & key_sizes, Block * stored_block, ConstNullMapPtr null_map, UInt8ColumnDataPtr join_mask, Arena & pool) + const Sizes & key_sizes, Block * stored_block, ConstNullMapPtr null_map, UInt8ColumnDataPtr join_mask, Arena & pool, bool & is_inserted) { if (null_map) return insertFromBlockImplTypeCase( - join, map, rows, key_columns, key_sizes, stored_block, null_map, join_mask, pool); + join, map, rows, key_columns, key_sizes, stored_block, null_map, join_mask, pool, is_inserted); else return insertFromBlockImplTypeCase( - join, map, rows, key_columns, key_sizes, stored_block, null_map, join_mask, pool); + join, map, rows, key_columns, key_sizes, stored_block, null_map, join_mask, pool, is_inserted); } template size_t insertFromBlockImpl( HashJoin & join, HashJoin::Type type, Maps & maps, size_t rows, const ColumnRawPtrs & key_columns, - const Sizes & key_sizes, Block * stored_block, ConstNullMapPtr null_map, UInt8ColumnDataPtr join_mask, Arena & pool) + const Sizes & key_sizes, Block * stored_block, ConstNullMapPtr null_map, UInt8ColumnDataPtr join_mask, Arena & pool, bool & is_inserted) { switch (type) { - case HashJoin::Type::EMPTY: return 0; - case HashJoin::Type::CROSS: return 0; /// Do nothing. We have already saved block, and it is enough. + case HashJoin::Type::EMPTY: + [[fallthrough]]; + case HashJoin::Type::CROSS: + /// Do nothing. We will only save block, and it is enough + is_inserted = true; + return 0; #define M(TYPE) \ case HashJoin::Type::TYPE: \ return insertFromBlockImplType>::Type>(\ - join, *maps.TYPE, rows, key_columns, key_sizes, stored_block, null_map, join_mask, pool); \ + join, *maps.TYPE, rows, key_columns, key_sizes, stored_block, null_map, join_mask, pool, is_inserted); \ break; + APPLY_FOR_JOIN_VARIANTS(M) #undef M } @@ -816,6 +828,7 @@ bool HashJoin::addJoinedBlock(const Block & source_block_, bool check_limits) } } + bool is_inserted = false; if (kind != JoinKind::Cross) { joinDispatch(kind, strictness, data->maps[onexpr_idx], [&](auto kind_, auto strictness_, auto & map) @@ -824,28 +837,34 @@ bool HashJoin::addJoinedBlock(const Block & source_block_, bool check_limits) *this, data->type, map, rows, key_columns, key_sizes[onexpr_idx], stored_block, null_map, /// If mask is false constant, rows are added to hashmap anyway. It's not a happy-flow, so this case is not optimized join_mask_col.getData(), - data->pool); + data->pool, is_inserted); if (multiple_disjuncts) used_flags.reinit(stored_block); - else + else if (is_inserted) /// Number of buckets + 1 value from zero storage used_flags.reinit(size + 1); }); } - if (!multiple_disjuncts && save_nullmap) + if (!multiple_disjuncts && save_nullmap && is_inserted) { data->blocks_nullmaps_allocated_size += null_map_holder->allocatedBytes(); data->blocks_nullmaps.emplace_back(stored_block, null_map_holder); } - if (!multiple_disjuncts && not_joined_map) + if (!multiple_disjuncts && not_joined_map && is_inserted) { data->blocks_nullmaps_allocated_size += not_joined_map->allocatedBytes(); data->blocks_nullmaps.emplace_back(stored_block, std::move(not_joined_map)); } + if (!multiple_disjuncts && !is_inserted) + { + data->blocks_allocated_size -= stored_block->allocatedBytes(); + data->blocks.pop_back(); + } + if (!check_limits) return true; From 62f950ddaff7178fc479e2ccab236be39567e0a7 Mon Sep 17 00:00:00 2001 From: vdimir Date: Wed, 3 May 2023 12:15:28 +0000 Subject: [PATCH 0211/1072] Keep blocks with nulls for right and full join --- src/Interpreters/HashJoin.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/Interpreters/HashJoin.cpp b/src/Interpreters/HashJoin.cpp index 92597f87f4b..436ecd382cd 100644 --- a/src/Interpreters/HashJoin.cpp +++ b/src/Interpreters/HashJoin.cpp @@ -603,7 +603,12 @@ namespace for (size_t i = 0; i < rows; ++i) { if (has_null_map && (*null_map)[i]) + { + /// nulls are not inserted into hash table, + /// keep them for RIGHT and FULL joins + is_inserted = true; continue; + } /// Check condition for right table from ON section if (join_mask && !(*join_mask)[i]) @@ -861,6 +866,7 @@ bool HashJoin::addJoinedBlock(const Block & source_block_, bool check_limits) if (!multiple_disjuncts && !is_inserted) { + LOG_TRACE(log, "Skipping inserting block with {} rows", rows); data->blocks_allocated_size -= stored_block->allocatedBytes(); data->blocks.pop_back(); } From 0f0958f82d5048c0aa3b28bdf9c5ce655e853219 Mon Sep 17 00:00:00 2001 From: vdimir Date: Thu, 4 May 2023 14:08:03 +0000 Subject: [PATCH 0212/1072] Add test 02725_any_join_single_row --- .../02725_any_join_single_row.reference | 3 +++ .../0_stateless/02725_any_join_single_row.sql | 26 +++++++++++++++++++ 2 files changed, 29 insertions(+) create mode 100644 tests/queries/0_stateless/02725_any_join_single_row.reference create mode 100644 tests/queries/0_stateless/02725_any_join_single_row.sql diff --git a/tests/queries/0_stateless/02725_any_join_single_row.reference b/tests/queries/0_stateless/02725_any_join_single_row.reference new file mode 100644 index 00000000000..5d748fc6dbb --- /dev/null +++ b/tests/queries/0_stateless/02725_any_join_single_row.reference @@ -0,0 +1,3 @@ +Join(ANY, LEFT, key) 0 1 +Join(ANY, LEFT, key) 1 1 +Join(ANY, LEFT, key) 1 1 diff --git a/tests/queries/0_stateless/02725_any_join_single_row.sql b/tests/queries/0_stateless/02725_any_join_single_row.sql new file mode 100644 index 00000000000..5e5c959c278 --- /dev/null +++ b/tests/queries/0_stateless/02725_any_join_single_row.sql @@ -0,0 +1,26 @@ +CREATE TABLE join_test +( + `key` UInt64, + `value` UInt64 +) +ENGINE = Join(ANY, LEFT, key); + +CREATE TEMPORARY TABLE initial_table_size AS + SELECT engine_full, total_rows, total_bytes FROM system.tables WHERE (name = 'join_test') AND (database = currentDatabase()); + +SELECT engine_full, total_rows, total_bytes < 100_000 FROM initial_table_size; + +INSERT INTO join_test (key, value) SELECT 1, number FROM numbers(1); + + +CREATE TEMPORARY TABLE one_row_table_size AS + SELECT engine_full, total_rows, total_bytes FROM system.tables WHERE (name = 'join_test') AND (database = currentDatabase()); + +SELECT engine_full, total_rows, total_bytes < 2 * (SELECT total_bytes FROM initial_table_size) FROM one_row_table_size; + +INSERT INTO join_test (key, value) SELECT 1, number FROM numbers(1); +INSERT INTO join_test (key, value) SELECT 1, number FROM numbers(1); +INSERT INTO join_test (key, value) SELECT 1, number FROM numbers(1); +INSERT INTO join_test (key, value) SELECT 1, number FROM numbers(10_000); + +SELECT engine_full, total_rows, total_bytes == (SELECT total_bytes FROM one_row_table_size) FROM system.tables WHERE (name = 'join_test') AND (database = currentDatabase()); From 56fa98cb77c30cba3f93dfcd5b04c33dd5b1ec68 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 1 Jun 2023 09:23:39 +0000 Subject: [PATCH 0213/1072] Rename variables for better readability --- src/Functions/FunctionsHashing.h | 71 ++++++++++++++++---------------- 1 file changed, 36 insertions(+), 35 deletions(-) diff --git a/src/Functions/FunctionsHashing.h b/src/Functions/FunctionsHashing.h index 3de757bfa3f..70adc7eba0f 100644 --- a/src/Functions/FunctionsHashing.h +++ b/src/Functions/FunctionsHashing.h @@ -1073,42 +1073,43 @@ private: size_t size = vec_from.size(); for (size_t i = 0; i < size; ++i) { - ToType h; + ToType hash; if constexpr (Impl::use_int_hash_for_pods) { if constexpr (std::is_same_v) - h = IntHash64Impl::apply(bit_cast(vec_from[i])); + hash = IntHash64Impl::apply(bit_cast(vec_from[i])); else - h = IntHash32Impl::apply(bit_cast(vec_from[i])); + hash = IntHash32Impl::apply(bit_cast(vec_from[i])); } else { if constexpr (std::is_same_v) - h = JavaHashImpl::apply(vec_from[i]); + hash = JavaHashImpl::apply(vec_from[i]); else { - FromType v = vec_from[i]; + FromType value = vec_from[i]; if constexpr (std::endian::native == std::endian::big) { - FromType tmp_v; - reverseMemcpy(&tmp_v, &v, sizeof(v)); - v = tmp_v; + FromType value_reversed; + reverseMemcpy(&value_reversed, &value, sizeof(value)); + value = value_reversed; } - h = apply(key, reinterpret_cast(&v), sizeof(v)); + hash = apply(key, reinterpret_cast(&value), sizeof(value)); } } if constexpr (first) - vec_to[i] = h; + vec_to[i] = hash; else - vec_to[i] = combineHashes(key, vec_to[i], h); + vec_to[i] = combineHashes(key, vec_to[i], hash); } } else if (auto col_from_const = checkAndGetColumnConst(column)) { auto value = col_from_const->template getValue(); ToType hash; + if constexpr (std::is_same_v) hash = IntHash64Impl::apply(bit_cast(value)); else @@ -1139,45 +1140,45 @@ private: size_t size = vec_from.size(); for (size_t i = 0; i < size; ++i) { - ToType h; + ToType hash; if constexpr (std::endian::native == std::endian::little) { - h = apply(key, reinterpret_cast(&vec_from[i]), sizeof(vec_from[i])); + hash = apply(key, reinterpret_cast(&vec_from[i]), sizeof(vec_from[i])); } else { char tmp_buffer[sizeof(vec_from[i])]; reverseMemcpy(tmp_buffer, &vec_from[i], sizeof(vec_from[i])); - h = apply(key, reinterpret_cast(tmp_buffer), sizeof(vec_from[i])); + hash = apply(key, reinterpret_cast(tmp_buffer), sizeof(vec_from[i])); } if constexpr (first) - vec_to[i] = h; + vec_to[i] = hash; else - vec_to[i] = combineHashes(key, vec_to[i], h); + vec_to[i] = combineHashes(key, vec_to[i], hash); } } else if (auto col_from_const = checkAndGetColumnConst(column)) { auto value = col_from_const->template getValue(); - ToType h; + ToType hash; if constexpr (std::endian::native == std::endian::little) { - h = apply(key, reinterpret_cast(&value), sizeof(value)); + hash = apply(key, reinterpret_cast(&value), sizeof(value)); } else { char tmp_buffer[sizeof(value)]; reverseMemcpy(tmp_buffer, &value, sizeof(value)); - h = apply(key, reinterpret_cast(tmp_buffer), sizeof(value)); + hash = apply(key, reinterpret_cast(tmp_buffer), sizeof(value)); } size_t size = vec_to.size(); if constexpr (first) - vec_to.assign(size, h); + vec_to.assign(size, hash); else { for (size_t i = 0; i < size; ++i) - vec_to[i] = combineHashes(key, vec_to[i], h); + vec_to[i] = combineHashes(key, vec_to[i], hash); } } else @@ -1191,11 +1192,11 @@ private: for (size_t i = 0, size = column->size(); i < size; ++i) { StringRef bytes = column->getDataAt(i); - const ToType h = apply(key, bytes.data, bytes.size); + const ToType hash = apply(key, bytes.data, bytes.size); if constexpr (first) - vec_to[i] = h; + vec_to[i] = hash; else - vec_to[i] = combineHashes(key, vec_to[i], h); + vec_to[i] = combineHashes(key, vec_to[i], hash); } } @@ -1211,14 +1212,14 @@ private: ColumnString::Offset current_offset = 0; for (size_t i = 0; i < size; ++i) { - const ToType h = apply(key, + const ToType hash = apply(key, reinterpret_cast(&data[current_offset]), offsets[i] - current_offset - 1); if constexpr (first) - vec_to[i] = h; + vec_to[i] = hash; else - vec_to[i] = combineHashes(key, vec_to[i], h); + vec_to[i] = combineHashes(key, vec_to[i], hash); current_offset = offsets[i]; } @@ -1231,11 +1232,11 @@ private: for (size_t i = 0; i < size; ++i) { - const ToType h = apply(key, reinterpret_cast(&data[i * n]), n); + const ToType hash = apply(key, reinterpret_cast(&data[i * n]), n); if constexpr (first) - vec_to[i] = h; + vec_to[i] = hash; else - vec_to[i] = combineHashes(key, vec_to[i], h); + vec_to[i] = combineHashes(key, vec_to[i], hash); } } else if (const ColumnConst * col_from_const = checkAndGetColumnConstStringOrFixedString(column)) @@ -1283,16 +1284,16 @@ private: { ColumnArray::Offset next_offset = offsets[i]; - ToType h; + ToType hash; if constexpr (std::is_same_v) - h = IntHash64Impl::apply(next_offset - current_offset); + hash = IntHash64Impl::apply(next_offset - current_offset); else - h = IntHash32Impl::apply(next_offset - current_offset); + hash = IntHash32Impl::apply(next_offset - current_offset); if constexpr (first) - vec_to[i] = h; + vec_to[i] = hash; else - vec_to[i] = combineHashes(key, vec_to[i], h); + vec_to[i] = combineHashes(key, vec_to[i], hash); for (size_t j = current_offset; j < next_offset; ++j) vec_to[i] = combineHashes(key, vec_to[i], vec_temp[j]); From 93ba75b370c6ece34fe8480440193b4775b3d105 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 1 Jun 2023 12:23:04 +0000 Subject: [PATCH 0214/1072] Remove parentheses from single-line if/for --- src/Functions/FunctionsHashing.h | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/src/Functions/FunctionsHashing.h b/src/Functions/FunctionsHashing.h index 70adc7eba0f..9896adca7f3 100644 --- a/src/Functions/FunctionsHashing.h +++ b/src/Functions/FunctionsHashing.h @@ -1119,10 +1119,8 @@ private: if constexpr (first) vec_to.assign(size, hash); else - { for (size_t i = 0; i < size; ++i) vec_to[i] = combineHashes(key, vec_to[i], hash); - } } else throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of argument of function {}", @@ -1142,9 +1140,7 @@ private: { ToType hash; if constexpr (std::endian::native == std::endian::little) - { hash = apply(key, reinterpret_cast(&vec_from[i]), sizeof(vec_from[i])); - } else { char tmp_buffer[sizeof(vec_from[i])]; @@ -1163,9 +1159,7 @@ private: ToType hash; if constexpr (std::endian::native == std::endian::little) - { hash = apply(key, reinterpret_cast(&value), sizeof(value)); - } else { char tmp_buffer[sizeof(value)]; @@ -1176,10 +1170,8 @@ private: if constexpr (first) vec_to.assign(size, hash); else - { for (size_t i = 0; i < size; ++i) vec_to[i] = combineHashes(key, vec_to[i], hash); - } } else throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of argument of function {}", @@ -1246,16 +1238,10 @@ private: const size_t size = vec_to.size(); if constexpr (first) - { vec_to.assign(size, hash); - } else - { for (size_t i = 0; i < size; ++i) - { vec_to[i] = combineHashes(key, vec_to[i], hash); - } - } } else throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}", From 3f08e3e03f7f61a7e70a5ed89ed66f09f3002f52 Mon Sep 17 00:00:00 2001 From: Han Fei Date: Thu, 1 Jun 2023 14:53:07 +0200 Subject: [PATCH 0215/1072] make filter push down through cross join --- .../Optimizations/filterPushDown.cpp | 6 ++--- .../01763_filter_push_down_bugs.reference | 22 +++++++++++++++++++ .../01763_filter_push_down_bugs.sql | 16 ++++++++++++++ 3 files changed, 41 insertions(+), 3 deletions(-) diff --git a/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp b/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp index 37bc894339f..db29038999b 100644 --- a/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp +++ b/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp @@ -272,7 +272,7 @@ size_t tryPushDownFilter(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes { /// If totals step has HAVING expression, skip it for now. /// TODO: - /// We can merge HAVING expression with current filer. + /// We can merge HAVING expression with current filter. /// Also, we can push down part of HAVING which depend only on aggregation keys. if (totals_having->getActions()) return 0; @@ -323,9 +323,9 @@ size_t tryPushDownFilter(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes { const auto & table_join = join ? join->getJoin()->getTableJoin() : filled_join->getJoin()->getTableJoin(); - /// Only inner and left(/right) join are supported. Other types may generate default values for left table keys. + /// Only inner, cross and left(/right) join are supported. Other types may generate default values for left table keys. /// So, if we push down a condition like `key != 0`, not all rows may be filtered. - if (table_join.kind() != JoinKind::Inner && table_join.kind() != kind) + if (table_join.kind() != JoinKind::Inner && table_join.kind() != JoinKind::Cross && table_join.kind() != kind) return 0; bool is_left = kind == JoinKind::Left; diff --git a/tests/queries/0_stateless/01763_filter_push_down_bugs.reference b/tests/queries/0_stateless/01763_filter_push_down_bugs.reference index 5aa2e645509..eb4e88a1f81 100644 --- a/tests/queries/0_stateless/01763_filter_push_down_bugs.reference +++ b/tests/queries/0_stateless/01763_filter_push_down_bugs.reference @@ -6,3 +6,25 @@ String1_0 String2_0 String3_0 String4_0 1 String1_0 String2_0 String3_0 String4_0 1 1 [0,1,2] 1 +Expression ((Projection + Before ORDER BY)) + Filter (WHERE) + Join (JOIN FillRightFirst) + Filter (( + Before JOIN)) + ReadFromMergeTree (default.t1) + Indexes: + PrimaryKey + Keys: + id + Condition: (id in 1-element set) + Parts: 0/19 + Granules: 0/1204 + Expression ((Joined actions + (Rename joined columns + (Projection + Before ORDER BY)))) + Filter (WHERE) + ReadFromMergeTree (default.t2) + Indexes: + PrimaryKey + Keys: + delete_time + Condition: (delete_time in [1685397601, +Inf)) + Parts: 0/1 + Granules: 0/1 diff --git a/tests/queries/0_stateless/01763_filter_push_down_bugs.sql b/tests/queries/0_stateless/01763_filter_push_down_bugs.sql index 1058bf75144..917c350dadb 100644 --- a/tests/queries/0_stateless/01763_filter_push_down_bugs.sql +++ b/tests/queries/0_stateless/01763_filter_push_down_bugs.sql @@ -38,6 +38,22 @@ DROP TABLE IF EXISTS Test; select x, y from (select [0, 1, 2] as y, 1 as a, 2 as b) array join y as x where a = 1 and b = 2 and (x = 1 or x != 1) and x = 1; +DROP TABLE IF EXISTS t; create table t(a UInt8) engine=MergeTree order by a; insert into t select * from numbers(2); select a from t t1 join t t2 on t1.a = t2.a where t1.a; +DROP TABLE IF EXISTS t; + +DROP TABLE IF EXISTS t1; +DROP TABLE IF EXISTS t2; +CREATE TABLE t1 (id Int64, create_time DateTime) ENGINE = MergeTree ORDER BY id AS SELECT number, toDateTime(number + 1600000000) from numbers(10000000) settings min_insert_block_size_rows=100000; +CREATE TABLE t2 (delete_time DateTime) ENGINE = MergeTree ORDER BY delete_time AS SELECT toDateTime(number + 1610000000) from numbers(100); + +EXPLAIN indexes=1 SELECT id, delete_time FROM t1 + CROSS JOIN ( + SELECT delete_time + FROM t2 WHERE delete_time > '2023-05-30 00:00:00' +) AS d WHERE create_time < delete_time AND id IN (10000001); + +DROP TABLE IF EXISTS t1; +DROP TABLE IF EXISTS t2; From 03628bde422823922d1fdd52531d34212270edae Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 1 Jun 2023 12:54:34 +0000 Subject: [PATCH 0216/1072] Fix hashing of const integer values --- src/Functions/FunctionsHashing.h | 26 ++++++++++++++++--- .../0_stateless/02534_keyed_siphash.reference | 3 +++ .../0_stateless/02534_keyed_siphash.sql | 9 +++++++ 3 files changed, 34 insertions(+), 4 deletions(-) diff --git a/src/Functions/FunctionsHashing.h b/src/Functions/FunctionsHashing.h index 9896adca7f3..32e3fbbd4ea 100644 --- a/src/Functions/FunctionsHashing.h +++ b/src/Functions/FunctionsHashing.h @@ -1096,7 +1096,7 @@ private: value = value_reversed; } hash = apply(key, reinterpret_cast(&value), sizeof(value)); - } + } } if constexpr (first) @@ -1110,10 +1110,28 @@ private: auto value = col_from_const->template getValue(); ToType hash; - if constexpr (std::is_same_v) - hash = IntHash64Impl::apply(bit_cast(value)); + if constexpr (Impl::use_int_hash_for_pods) + { + if constexpr (std::is_same_v) + hash = IntHash64Impl::apply(bit_cast(value)); + else + hash = IntHash32Impl::apply(bit_cast(value)); + } else - hash = IntHash32Impl::apply(bit_cast(value)); + { + if constexpr (std::is_same_v) + hash = JavaHashImpl::apply(value); + else + { + if constexpr (std::endian::native == std::endian::big) + { + FromType value_reversed; + reverseMemcpy(&value_reversed, &value, sizeof(value)); + value = value_reversed; + } + hash = apply(key, reinterpret_cast(&value), sizeof(value)); + } + } size_t size = vec_to.size(); if constexpr (first) diff --git a/tests/queries/0_stateless/02534_keyed_siphash.reference b/tests/queries/0_stateless/02534_keyed_siphash.reference index 3606b9a41db..ccc514e7ea2 100644 --- a/tests/queries/0_stateless/02534_keyed_siphash.reference +++ b/tests/queries/0_stateless/02534_keyed_siphash.reference @@ -194,3 +194,6 @@ E28DBDE7FE22E41C 1 E28DBDE7FE22E41C 1 +Check bug with hashing of const integer values +11862823756610506724 +11862823756610506724 diff --git a/tests/queries/0_stateless/02534_keyed_siphash.sql b/tests/queries/0_stateless/02534_keyed_siphash.sql index 9c914f586f0..900b99f548a 100644 --- a/tests/queries/0_stateless/02534_keyed_siphash.sql +++ b/tests/queries/0_stateless/02534_keyed_siphash.sql @@ -272,3 +272,12 @@ select hex(sipHash64()); SELECT hex(sipHash128()) = hex(reverse(unhex('1CE422FEE7BD8DE20000000000000000'))) or hex(sipHash128()) = '1CE422FEE7BD8DE20000000000000000'; select hex(sipHash64Keyed()); SELECT hex(sipHash128Keyed()) = hex(reverse(unhex('1CE422FEE7BD8DE20000000000000000'))) or hex(sipHash128Keyed()) = '1CE422FEE7BD8DE20000000000000000'; + +SELECT 'Check bug with hashing of const integer values'; +DROP TABLE IF EXISTS tab; +CREATE TABLE tab (key Tuple(UInt64, UInt64), val UInt64) ENGINE=Memory; +INSERT INTO tab VALUES ((2, 2), 4); +-- these two statements must produce the same result +SELECT sipHash64Keyed(key, val) FROM tab; +SELECT sipHash64Keyed(key, 4::UInt64) FROM tab; +DROP TABLE tab; From 53eb360ac21c23a7de58e2e483452846619086d7 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Wed, 31 May 2023 15:30:41 +0200 Subject: [PATCH 0217/1072] Fix merge_tree_min_rows_for_seek/merge_tree_min_bytes_for_seek for data skipping indexes Signed-off-by: Azat Khuzhin --- .../MergeTree/MergeTreeDataSelectExecutor.cpp | 4 ++-- ...ng_index_merge_tree_min_for_seek.reference | 0 ...skipping_index_merge_tree_min_for_seek.sql | 22 +++++++++++++++++++ 3 files changed, 24 insertions(+), 2 deletions(-) create mode 100644 tests/queries/0_stateless/02781_data_skipping_index_merge_tree_min_for_seek.reference create mode 100644 tests/queries/0_stateless/02781_data_skipping_index_merge_tree_min_for_seek.sql diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp index 2b16ea43179..16b27c2c820 100644 --- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp @@ -1729,7 +1729,7 @@ MarkRanges MergeTreeDataSelectExecutor::filterMarksUsingIndex( std::max(ranges[i].begin, index_mark * index_granularity), std::min(ranges[i].end, (index_mark + 1) * index_granularity)); - if (res.empty() || res.back().end - data_range.begin > min_marks_for_seek) + if (res.empty() || data_range.begin - res.back().end > min_marks_for_seek) res.push_back(data_range); else res.back().end = data_range.end; @@ -1829,7 +1829,7 @@ MarkRanges MergeTreeDataSelectExecutor::filterMarksUsingMergedIndex( std::max(range.begin, index_mark * index_granularity), std::min(range.end, (index_mark + 1) * index_granularity)); - if (res.empty() || res.back().end - data_range.begin > min_marks_for_seek) + if (res.empty() || data_range.begin - res.back().end > min_marks_for_seek) res.push_back(data_range); else res.back().end = data_range.end; diff --git a/tests/queries/0_stateless/02781_data_skipping_index_merge_tree_min_for_seek.reference b/tests/queries/0_stateless/02781_data_skipping_index_merge_tree_min_for_seek.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02781_data_skipping_index_merge_tree_min_for_seek.sql b/tests/queries/0_stateless/02781_data_skipping_index_merge_tree_min_for_seek.sql new file mode 100644 index 00000000000..4cebdde3dfe --- /dev/null +++ b/tests/queries/0_stateless/02781_data_skipping_index_merge_tree_min_for_seek.sql @@ -0,0 +1,22 @@ +-- Tags: no-random-merge-tree-settings, no-random-settings + +DROP TABLE IF EXISTS data; + +CREATE TABLE data +( + key Int, + v1 DateTime, + INDEX v1_index v1 TYPE minmax GRANULARITY 1 +) ENGINE=AggregatingMergeTree() +ORDER BY key +SETTINGS index_granularity=8192; + +SYSTEM STOP MERGES data; + +-- generate 50% of marks that cannot be skipped with v1_index +-- this will create a gap in marks +INSERT INTO data SELECT number, if(number/8192 % 2 == 0, now(), now() - INTERVAL 200 DAY) FROM numbers(1e6); +INSERT INTO data SELECT number+1e6, if(number/8192 % 2 == 0, now(), now() - INTERVAL 200 DAY) FROM numbers(1e6); + +SELECT * FROM data WHERE v1 >= now() - INTERVAL 180 DAY FORMAT Null SETTINGS max_threads=1, max_final_threads=1, force_data_skipping_indices='v1_index', merge_tree_min_rows_for_seek=0, max_rows_to_read=1999999; +SELECT * FROM data WHERE v1 >= now() - INTERVAL 180 DAY FORMAT Null SETTINGS max_threads=1, max_final_threads=1, force_data_skipping_indices='v1_index', merge_tree_min_rows_for_seek=1, max_rows_to_read=1999999; -- { serverError TOO_MANY_ROWS } From 9490cd44acf7bf8a1db59e8e0ed453b1eb85a872 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 1 Jun 2023 13:16:33 +0000 Subject: [PATCH 0218/1072] Include msan fix in protobuf --- contrib/google-protobuf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/google-protobuf b/contrib/google-protobuf index 3b3d8fe1913..c47efe2d8f6 160000 --- a/contrib/google-protobuf +++ b/contrib/google-protobuf @@ -1 +1 @@ -Subproject commit 3b3d8fe191314ea903ea6b072f0e73ef18e15faa +Subproject commit c47efe2d8f6a60022b49ecd6cc23660687c8598f From d57ffec72fb3a52b6d642270f3fc3907bcabbe0b Mon Sep 17 00:00:00 2001 From: Nikolay Degterinsky Date: Thu, 1 Jun 2023 13:45:00 +0000 Subject: [PATCH 0219/1072] Add signal handler for SIGQUIT --- src/Client/ClientBase.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/Client/ClientBase.cpp b/src/Client/ClientBase.cpp index 77a93a25e9b..29b2eb5ce1e 100644 --- a/src/Client/ClientBase.cpp +++ b/src/Client/ClientBase.cpp @@ -286,7 +286,7 @@ public: static Int32 cancelled_status() { return exit_after_signals.load(); } }; -/// This signal handler is set only for SIGINT. +/// This signal handler is set for SIGINT and SIGQUIT. void interruptSignalHandler(int signum) { if (QueryInterruptHandler::try_stop()) @@ -325,6 +325,9 @@ void ClientBase::setupSignalHandler() if (sigaction(SIGINT, &new_act, nullptr)) throwFromErrno("Cannot set signal handler.", ErrorCodes::CANNOT_SET_SIGNAL_HANDLER); + + if (sigaction(SIGQUIT, &new_act, nullptr)) + throwFromErrno("Cannot set signal handler.", ErrorCodes::CANNOT_SET_SIGNAL_HANDLER); } From 238b0927a918b8a36db60a81dc322b204bb387c5 Mon Sep 17 00:00:00 2001 From: Han Fei Date: Thu, 1 Jun 2023 16:15:42 +0200 Subject: [PATCH 0220/1072] make test easier --- .../01763_filter_push_down_bugs.reference | 21 ++++++++----------- .../01763_filter_push_down_bugs.sql | 11 ++++++---- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/tests/queries/0_stateless/01763_filter_push_down_bugs.reference b/tests/queries/0_stateless/01763_filter_push_down_bugs.reference index eb4e88a1f81..7df35e2948d 100644 --- a/tests/queries/0_stateless/01763_filter_push_down_bugs.reference +++ b/tests/queries/0_stateless/01763_filter_push_down_bugs.reference @@ -15,16 +15,13 @@ Expression ((Projection + Before ORDER BY)) PrimaryKey Keys: id - Condition: (id in 1-element set) - Parts: 0/19 - Granules: 0/1204 + Condition: (id in [101, 101]) + Parts: 1/1 + Granules: 1/1 Expression ((Joined actions + (Rename joined columns + (Projection + Before ORDER BY)))) - Filter (WHERE) - ReadFromMergeTree (default.t2) - Indexes: - PrimaryKey - Keys: - delete_time - Condition: (delete_time in [1685397601, +Inf)) - Parts: 0/1 - Granules: 0/1 + ReadFromMergeTree (default.t2) + Indexes: + PrimaryKey + Condition: true + Parts: 1/1 + Granules: 1/1 diff --git a/tests/queries/0_stateless/01763_filter_push_down_bugs.sql b/tests/queries/0_stateless/01763_filter_push_down_bugs.sql index 917c350dadb..2ee249b5ce7 100644 --- a/tests/queries/0_stateless/01763_filter_push_down_bugs.sql +++ b/tests/queries/0_stateless/01763_filter_push_down_bugs.sql @@ -46,14 +46,17 @@ DROP TABLE IF EXISTS t; DROP TABLE IF EXISTS t1; DROP TABLE IF EXISTS t2; -CREATE TABLE t1 (id Int64, create_time DateTime) ENGINE = MergeTree ORDER BY id AS SELECT number, toDateTime(number + 1600000000) from numbers(10000000) settings min_insert_block_size_rows=100000; -CREATE TABLE t2 (delete_time DateTime) ENGINE = MergeTree ORDER BY delete_time AS SELECT toDateTime(number + 1610000000) from numbers(100); +CREATE TABLE t1 (id Int64, create_time DateTime) ENGINE = MergeTree ORDER BY id; +CREATE TABLE t2 (delete_time DateTime) ENGINE = MergeTree ORDER BY delete_time; + +insert into t1 values (101, '2023-05-28 00:00:00'), (102, '2023-05-28 00:00:00'); +insert into t2 values ('2023-05-31 00:00:00'); EXPLAIN indexes=1 SELECT id, delete_time FROM t1 CROSS JOIN ( SELECT delete_time - FROM t2 WHERE delete_time > '2023-05-30 00:00:00' -) AS d WHERE create_time < delete_time AND id IN (10000001); + FROM t2 +) AS d WHERE create_time < delete_time AND id = 101; DROP TABLE IF EXISTS t1; DROP TABLE IF EXISTS t2; From a75598ea655e0340ede8a6d46368147378a4072e Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Thu, 1 Jun 2023 16:16:39 +0200 Subject: [PATCH 0221/1072] fix test --- .../test_memory_bound_aggregation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_backward_compatibility/test_memory_bound_aggregation.py b/tests/integration/test_backward_compatibility/test_memory_bound_aggregation.py index 94c788f8f91..d76c4eba409 100644 --- a/tests/integration/test_backward_compatibility/test_memory_bound_aggregation.py +++ b/tests/integration/test_backward_compatibility/test_memory_bound_aggregation.py @@ -74,7 +74,7 @@ def test_backward_compatability(start_cluster): from remote('node{1,2,3}', default, t) group by a limit 1 offset 12345 - settings optimize_aggregation_in_order = 1 + settings optimize_aggregation_in_order = 1, enable_memory_bound_merging_of_aggregation_results = 0 """ ) == "30\n" From 54d526c75c0934b36dd170660ff7222de24a5a13 Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Thu, 1 Jun 2023 14:22:57 +0000 Subject: [PATCH 0222/1072] Add cast type supprt to DateTimeTransformImpl --- .../functions/type-conversion-functions.md | 85 +++++++++++++++++++ src/Functions/DateTimeTransforms.h | 36 +++++++- src/Functions/FunctionsConversion.h | 18 +++- .../0_stateless/01601_accurate_cast.reference | 4 + .../0_stateless/01601_accurate_cast.sql | 10 +++ 5 files changed, 147 insertions(+), 6 deletions(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index c7c66cc771f..a6fc6cd4dfc 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -319,6 +319,49 @@ SELECT ## toDateOrNull ## toDateOrDefault +Converts an input value to [Date](/docs/en/sql-reference/data-types/date.md) data type. +If unsuccessful, returns the lower border value supported by [Date](/docs/en/sql-reference/data-types/date.md). The default value can be specified as a second argument. +Similar to [toDate](#todate). + +**Syntax** + +``` sql +toDateOrDefault(expr [, default_value]) +``` + +**Arguments** + +- `expr` — The value. [String](/docs/en/sql-reference/data-types/string.md), [Int](/docs/en/sql-reference/data-types/int-uint.md), [Date](/docs/en/sql-reference/data-types/date.md) or [DateTime](/docs/en/sql-reference/data-types/datetime.md). +- `default_value` — The default value. [Date](/docs/en/sql-reference/data-types/date.md) + +If `expr` is a number and looks like a UNIX timestamp (is greater than 65535), it is interpreted as a DateTime, then truncated to Date in the current timezone. If `expr` is a number and it is smaller than 65536, it is interpreted as the number of days since 1970-01-01. + +**Returned value** + +- A calendar date. [Date](/docs/en/sql-reference/data-types/date.md) + +**Example** + +Query: + +``` sql +SELECT + toDateOrDefault('2021-01-01', '2023-01-01'::Date), + toDateOrDefault('xx2021-01-01', '2023-01-01'::Date); +``` + +Result: + +```response +┌─toDateOrDefault('2021-01-01', CAST('2023-01-01', 'Date'))─┬─toDateOrDefault('xx2021-01-01', CAST('2023-01-01', 'Date'))─┐ +│ 2021-01-01 │ 2023-01-01 │ +└───────────────────────────────────────────────────────────┴─────────────────────────────────────────────────────────────┘ +``` + +**See Also** +- [toDate](#todate) +- [toDate32OrDefault](#todate32ordefault) + ## toDateTime @@ -327,6 +370,48 @@ SELECT ## toDateTimeOrNull ## toDateTimeOrDefault +Converts an input value to [DateTime](/docs/en/sql-reference/data-types/datetime.md) data type. +If unsuccessful, returns the lower border value supported by [DateTime](/docs/en/sql-reference/data-types/datetime.md). The default value can be specified as a third argument. +Similar to [toDateTime](#todatetime). + +**Syntax** + +``` sql +toDateTimeOrDefault(expr, [, time_zone [, default_value]]) +``` + +**Arguments** + +- `expr` — The value. [String](/docs/en/sql-reference/data-types/string.md), [Int](/docs/en/sql-reference/data-types/int-uint.md), [Date](/docs/en/sql-reference/data-types/date.md) or [DateTime](/docs/en/sql-reference/data-types/datetime.md). +- `time_zone` — Time zone. +- `default_value` — The default value. [DateTime](/docs/en/sql-reference/data-types/datetime.md) + +If `expr` is a number, it is interpreted as the number of seconds since the beginning of the Unix Epoch (as Unix timestamp). + +**Returned value** + +- A date time. [DateTime](/docs/en/sql-reference/data-types/datetime.md) + +**Example** + +Query: + +``` sql +SELECT + toDateTimeOrDefault('2021-01-01', 'UTC', '2023-01-01'::DateTime('UTC')), + toDateTimeOrDefault('xx2021-01-01', 'UTC', '2023-01-01'::DateTime('UTC')); +``` + +Result: + +```response +┌─toDateTimeOrDefault('2021-01-01', 'UTC', CAST('2023-01-01', 'DateTime(\'UTC\')'))─┬─toDateTimeOrDefault('xx2021-01-01', 'UTC', CAST('2023-01-01', 'DateTime(\'UTC\')'))─┐ +│ 2021-01-01 00:00:00 │ 2023-01-01 00:00:00 │ +└───────────────────────────────────────────────────────────────────────────────────┴─────────────────────────────────────────────────────────────────────────────────────┘ +``` + +**See Also** +- [toDateTime](#todatetime) ## toDate32 diff --git a/src/Functions/DateTimeTransforms.h b/src/Functions/DateTimeTransforms.h index f179d9fbe60..81b1ec2e356 100644 --- a/src/Functions/DateTimeTransforms.h +++ b/src/Functions/DateTimeTransforms.h @@ -21,6 +21,7 @@ namespace ErrorCodes { extern const int ILLEGAL_TYPE_OF_ARGUMENT; extern const int ILLEGAL_COLUMN; + extern const int CANNOT_CONVERT_TYPE; } /** Transformations. @@ -1425,8 +1426,10 @@ struct ToDateTimeComponentsImpl using FactorTransform = ZeroTransform; }; +struct DateTimeAccurateConvertStrategyAdditions {}; +struct DateTimeAccurateOrNullConvertStrategyAdditions {}; -template +template struct Transformer { template @@ -1438,6 +1441,33 @@ struct Transformer for (size_t i = 0; i < size; ++i) { + constexpr bool transformHasExtraCheck = requires(const Transform& t) + { + t.ExtraCheck(vec_from[i], time_zone); + }; + + if constexpr (transformHasExtraCheck) + { + // if constexpr (std::is_same_v + // || std::is_same_v) + { + bool checked = transform.ExtraCheck(vec_from[i], time_zone); + if (!checked) + { + if (std::is_same_v) + { + // vec_to[i] = 0; + // (*vec_null_map_to)[i] = true; + } + else + { + throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, "Value in column {} cannot be safely converted into type {}", + TypeName, TypeName); + } + } + } + } + if constexpr (is_extended_result) vec_to[i] = static_cast(transform.executeExtendedResult(vec_from[i], time_zone)); else @@ -1446,14 +1476,14 @@ struct Transformer } }; - template struct DateTimeTransformImpl { + template static ColumnPtr execute( const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t /*input_rows_count*/, const Transform & transform = {}) { - using Op = Transformer; + using Op = Transformer; const ColumnPtr source_col = arguments[0].column; if (const auto * sources = checkAndGetColumn(source_col.get())) diff --git a/src/Functions/FunctionsConversion.h b/src/Functions/FunctionsConversion.h index d3676349318..d3ccbb82721 100644 --- a/src/Functions/FunctionsConversion.h +++ b/src/Functions/FunctionsConversion.h @@ -400,7 +400,11 @@ template struct ToDateTransform8Or16Signed { static constexpr auto name = "toDate"; - + static NO_SANITIZE_UNDEFINED bool ExtraCheck(const FromType & from, const DateLUTImpl &) + { + return from >= 0; + } + static NO_SANITIZE_UNDEFINED ToType execute(const FromType & from, const DateLUTImpl &) { if (from < 0) @@ -2884,8 +2888,16 @@ private: if constexpr (IsDataTypeNumber && IsDataTypeDateOrDateTime) { - result_column = ConvertImpl::execute( - arguments, result_type, input_rows_count); + if (wrapper_cast_type == CastType::accurate) + { + result_column = ConvertImpl::template execute( + arguments, result_type, input_rows_count); + } + else + { + result_column = ConvertImpl::template execute( + arguments, result_type, input_rows_count); + } return true; } diff --git a/tests/queries/0_stateless/01601_accurate_cast.reference b/tests/queries/0_stateless/01601_accurate_cast.reference index c1e7feffbe6..b662319d263 100644 --- a/tests/queries/0_stateless/01601_accurate_cast.reference +++ b/tests/queries/0_stateless/01601_accurate_cast.reference @@ -6,3 +6,7 @@ 5 1 12 +2023-05-30 14:38:20 +1970-01-01 00:00:19 +2023-05-30 +1970-01-20 \ No newline at end of file diff --git a/tests/queries/0_stateless/01601_accurate_cast.sql b/tests/queries/0_stateless/01601_accurate_cast.sql index b5fd4fb04a4..1ab98e26d1a 100644 --- a/tests/queries/0_stateless/01601_accurate_cast.sql +++ b/tests/queries/0_stateless/01601_accurate_cast.sql @@ -22,3 +22,13 @@ SELECT accurateCast(-10, 'Decimal32(9)'); -- { serverError 407 } SELECT accurateCast('123', 'FixedString(2)'); -- { serverError 131 } SELECT accurateCast('12', 'FixedString(2)'); + +SELECT accurateCast(-1, 'DateTime'); -- { serverError 70 } +SELECT accurateCast('1xxx', 'DateTime'); -- { serverError 41 } +SELECT accurateCast('2023-05-30 14:38:20', 'DateTime'); +SELECT accurateCast(19, 'DateTime'); + +SELECT accurateCast(-1, 'Date'); -- { serverError 70 } +SELECT accurateCast('1xxx', 'Date'); -- { serverError 70 } +SELECT accurateCast('2023-05-30', 'Date'); +SELECT accurateCast(19, 'Date'); From a4cb82127dfe488fb1e1ad90aebaccf469ad742e Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Mon, 22 May 2023 13:54:33 +0200 Subject: [PATCH 0223/1072] Analyzer: WIP on distributed queries --- src/Analyzer/ColumnNode.cpp | 5 ++++- src/Analyzer/TableNode.cpp | 2 ++ src/Planner/PlannerContext.cpp | 14 +++++++------- src/Planner/PlannerJoinTree.cpp | 3 ++- src/Planner/Utils.cpp | 2 +- src/Storages/StorageDistributed.cpp | 6 ++++-- 6 files changed, 20 insertions(+), 12 deletions(-) diff --git a/src/Analyzer/ColumnNode.cpp b/src/Analyzer/ColumnNode.cpp index a9d47f8287d..f020040ea78 100644 --- a/src/Analyzer/ColumnNode.cpp +++ b/src/Analyzer/ColumnNode.cpp @@ -117,7 +117,10 @@ ASTPtr ColumnNode::toASTImpl(const ConvertToASTOptions & options) const else { const auto & table_storage_id = table_node->getStorageID(); - column_identifier_parts = { table_storage_id.getDatabaseName(), table_storage_id.getTableName() }; + if (table_storage_id.hasDatabase()) + column_identifier_parts = { table_storage_id.getDatabaseName(), table_storage_id.getTableName() }; + else + column_identifier_parts = { table_storage_id.getTableName() }; } } } diff --git a/src/Analyzer/TableNode.cpp b/src/Analyzer/TableNode.cpp index c86cbcd5a80..17d12bd6afa 100644 --- a/src/Analyzer/TableNode.cpp +++ b/src/Analyzer/TableNode.cpp @@ -91,6 +91,8 @@ ASTPtr TableNode::toASTImpl(const ConvertToASTOptions & /* options */) const if (!temporary_table_name.empty()) return std::make_shared(temporary_table_name); + if (!storage_id.hasDatabase()) + return std::make_shared(storage_id.getTableName()); return std::make_shared(storage_id.getDatabaseName(), storage_id.getTableName()); } diff --git a/src/Planner/PlannerContext.cpp b/src/Planner/PlannerContext.cpp index 346cc6d2080..7ab8180eb9c 100644 --- a/src/Planner/PlannerContext.cpp +++ b/src/Planner/PlannerContext.cpp @@ -19,17 +19,17 @@ const ColumnIdentifier & GlobalPlannerContext::createColumnIdentifier(const Quer return createColumnIdentifier(column_node_typed.getColumn(), column_source_node); } -const ColumnIdentifier & GlobalPlannerContext::createColumnIdentifier(const NameAndTypePair & column, const QueryTreeNodePtr & column_source_node) +const ColumnIdentifier & GlobalPlannerContext::createColumnIdentifier(const NameAndTypePair & column, const QueryTreeNodePtr & /*column_source_node*/) { std::string column_identifier; - if (column_source_node->hasAlias()) - column_identifier += column_source_node->getAlias(); - else if (const auto * table_source_node = column_source_node->as()) - column_identifier += table_source_node->getStorageID().getFullNameNotQuoted(); + // if (column_source_node->hasAlias()) + // column_identifier += column_source_node->getAlias(); + // else if (const auto * table_source_node = column_source_node->as()) + // column_identifier += table_source_node->getStorageID().getFullNameNotQuoted(); - if (!column_identifier.empty()) - column_identifier += '.'; + // if (!column_identifier.empty()) + // column_identifier += '.'; column_identifier += column.name; column_identifier += '_' + std::to_string(column_identifiers.size()); diff --git a/src/Planner/PlannerJoinTree.cpp b/src/Planner/PlannerJoinTree.cpp index 4f091f73187..4a1708f96d3 100644 --- a/src/Planner/PlannerJoinTree.cpp +++ b/src/Planner/PlannerJoinTree.cpp @@ -106,7 +106,8 @@ void checkAccessRights(const TableNode & table_node, const Names & column_names, storage_id.getFullTableName()); } - query_context->checkAccess(AccessType::SELECT, storage_id, column_names); + if (storage_id.hasDatabase()) + query_context->checkAccess(AccessType::SELECT, storage_id, column_names); } NameAndTypePair chooseSmallestColumnToReadFromStorage(const StoragePtr & storage, const StorageSnapshotPtr & storage_snapshot) diff --git a/src/Planner/Utils.cpp b/src/Planner/Utils.cpp index 5c61b2fc2c7..cd4fb9182e9 100644 --- a/src/Planner/Utils.cpp +++ b/src/Planner/Utils.cpp @@ -107,7 +107,7 @@ Block buildCommonHeaderForUnion(const Blocks & queries_headers, SelectUnionMode ASTPtr queryNodeToSelectQuery(const QueryTreeNodePtr & query_node) { auto & query_node_typed = query_node->as(); - auto result_ast = query_node_typed.toAST(); + auto result_ast = query_node_typed.toAST({ .fully_qualified_identifiers = false }); while (true) { diff --git a/src/Storages/StorageDistributed.cpp b/src/Storages/StorageDistributed.cpp index bcf6f68d00d..e0bb14c62fd 100644 --- a/src/Storages/StorageDistributed.cpp +++ b/src/Storages/StorageDistributed.cpp @@ -943,7 +943,9 @@ QueryTreeNodePtr buildQueryTreeDistributed(SelectQueryInfo & query_info, } else { - auto resolved_remote_storage_id = query_context->resolveStorageID(remote_storage_id); + auto resolved_remote_storage_id = remote_storage_id; + if (remote_storage_id.hasDatabase()) + resolved_remote_storage_id = query_context->resolveStorageID(remote_storage_id); auto storage = std::make_shared(resolved_remote_storage_id, distributed_storage_snapshot->metadata->getColumns()); auto table_node = std::make_shared(std::move(storage), query_context); @@ -1059,7 +1061,7 @@ void StorageDistributed::read( remote_table_function_ptr); query_ast = queryNodeToSelectQuery(query_tree_distributed); - header = InterpreterSelectQueryAnalyzer::getSampleBlock(query_ast, local_context, SelectQueryOptions(processed_stage).analyze()); + header = InterpreterSelectQueryAnalyzer::getSampleBlock(query_tree_distributed, local_context, SelectQueryOptions(processed_stage).analyze()); } else { From 70c1b1de57252ace05fe69b3a791b3fe4f025273 Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Mon, 22 May 2023 15:43:30 +0000 Subject: [PATCH 0224/1072] Remove code --- src/Planner/PlannerContext.cpp | 8 -- ...ns_optimize_read_in_window_order.reference | 10 +- .../02227_union_match_by_name.reference | 4 +- .../0_stateless/02303_query_kind.reference | 16 +-- .../02381_join_dup_columns_in_plan.reference | 44 +++--- .../0_stateless/02421_explain_subquery.sql | 2 +- .../02451_order_by_monotonic.reference | 32 ++--- .../02481_aggregation_in_order_plan.reference | 2 +- .../02514_analyzer_drop_join_on.reference | 134 +++++++++--------- ...dicate_push_down_filled_join_fix.reference | 30 ++-- ...n_merge_tree_prewhere_row_policy.reference | 8 +- 11 files changed, 141 insertions(+), 149 deletions(-) diff --git a/src/Planner/PlannerContext.cpp b/src/Planner/PlannerContext.cpp index 7ab8180eb9c..a788a6cbc3c 100644 --- a/src/Planner/PlannerContext.cpp +++ b/src/Planner/PlannerContext.cpp @@ -23,14 +23,6 @@ const ColumnIdentifier & GlobalPlannerContext::createColumnIdentifier(const Name { std::string column_identifier; - // if (column_source_node->hasAlias()) - // column_identifier += column_source_node->getAlias(); - // else if (const auto * table_source_node = column_source_node->as()) - // column_identifier += table_source_node->getStorageID().getFullNameNotQuoted(); - - // if (!column_identifier.empty()) - // column_identifier += '.'; - column_identifier += column.name; column_identifier += '_' + std::to_string(column_identifiers.size()); diff --git a/tests/queries/0_stateless/01655_plan_optimizations_optimize_read_in_window_order.reference b/tests/queries/0_stateless/01655_plan_optimizations_optimize_read_in_window_order.reference index f23cf03913b..8a33df9fad2 100644 --- a/tests/queries/0_stateless/01655_plan_optimizations_optimize_read_in_window_order.reference +++ b/tests/queries/0_stateless/01655_plan_optimizations_optimize_read_in_window_order.reference @@ -7,19 +7,19 @@ Partial sorting plan Prefix sort description: n ASC Result sort description: n ASC, x ASC optimize_read_in_window_order=1, allow_experimental_analyzer=1 - Prefix sort description: default.test_01655_plan_optimizations_optimize_read_in_window_order_n.n_0 ASC - Result sort description: default.test_01655_plan_optimizations_optimize_read_in_window_order_n.n_0 ASC, default.test_01655_plan_optimizations_optimize_read_in_window_order_n.x_1 ASC + Prefix sort description: n_0 ASC + Result sort description: n_0 ASC, x_1 ASC No sorting plan optimize_read_in_window_order=0 Sort description: n ASC, x ASC optimize_read_in_window_order=0, allow_experimental_analyzer=1 - Sort description: default.test_01655_plan_optimizations_optimize_read_in_window_order_n_x.n_0 ASC, default.test_01655_plan_optimizations_optimize_read_in_window_order_n_x.x_1 ASC + Sort description: n_0 ASC, x_1 ASC optimize_read_in_window_order=1 Prefix sort description: n ASC, x ASC Result sort description: n ASC, x ASC optimize_read_in_window_order=1, allow_experimental_analyzer=1 - Prefix sort description: default.test_01655_plan_optimizations_optimize_read_in_window_order_n_x.n_0 ASC, default.test_01655_plan_optimizations_optimize_read_in_window_order_n_x.x_1 ASC - Result sort description: default.test_01655_plan_optimizations_optimize_read_in_window_order_n_x.n_0 ASC, default.test_01655_plan_optimizations_optimize_read_in_window_order_n_x.x_1 ASC + Prefix sort description: n_0 ASC, x_1 ASC + Result sort description: n_0 ASC, x_1 ASC Complex ORDER BY optimize_read_in_window_order=0 3 3 1 diff --git a/tests/queries/0_stateless/02227_union_match_by_name.reference b/tests/queries/0_stateless/02227_union_match_by_name.reference index e51ea983f7f..685b3c83b05 100644 --- a/tests/queries/0_stateless/02227_union_match_by_name.reference +++ b/tests/queries/0_stateless/02227_union_match_by_name.reference @@ -30,7 +30,7 @@ Header: avgWeighted(x, y) Nullable(Float64) 255_UInt8 UInt8 1_UInt8 UInt8 Expression (Change column names to column identifiers) - Header: system.one.dummy_0 UInt8 + Header: dummy_0 UInt8 ReadFromStorage (SystemOne) Header: dummy UInt8 Expression (Conversion before UNION) @@ -46,7 +46,7 @@ Header: avgWeighted(x, y) Nullable(Float64) NULL_Nullable(Nothing) Nullable(Nothing) 1_UInt8 UInt8 Expression (Change column names to column identifiers) - Header: system.one.dummy_0 UInt8 + Header: dummy_0 UInt8 ReadFromStorage (SystemOne) Header: dummy UInt8 SELECT avgWeighted(x, y) FROM (SELECT NULL, 255 AS x, 1 AS y UNION ALL SELECT y, NULL AS x, 1 AS y); diff --git a/tests/queries/0_stateless/02303_query_kind.reference b/tests/queries/0_stateless/02303_query_kind.reference index 5af8c2b743f..8d119fb22b2 100644 --- a/tests/queries/0_stateless/02303_query_kind.reference +++ b/tests/queries/0_stateless/02303_query_kind.reference @@ -2,35 +2,35 @@ clickhouse-client --allow_experimental_analyzer=1 --query_kind secondary_query - Expression ((Project names + Projection)) Header: dummy String Aggregating - Header: toString(system.one.dummy_0) String + Header: toString(dummy_0) String Expression ((Before GROUP BY + Change column names to column identifiers)) - Header: toString(system.one.dummy_0) String + Header: toString(dummy_0) String ReadFromStorage (SystemOne) Header: dummy UInt8 clickhouse-local --allow_experimental_analyzer=1 --query_kind secondary_query -q explain plan header=1 select toString(dummy) as dummy from system.one group by dummy Expression ((Project names + Projection)) Header: dummy String Aggregating - Header: toString(system.one.dummy_0) String + Header: toString(dummy_0) String Expression ((Before GROUP BY + Change column names to column identifiers)) - Header: toString(system.one.dummy_0) String + Header: toString(dummy_0) String ReadFromStorage (SystemOne) Header: dummy UInt8 clickhouse-client --allow_experimental_analyzer=1 --query_kind initial_query -q explain plan header=1 select toString(dummy) as dummy from system.one group by dummy Expression ((Project names + Projection)) Header: dummy String Aggregating - Header: toString(system.one.dummy_0) String + Header: toString(dummy_0) String Expression ((Before GROUP BY + Change column names to column identifiers)) - Header: toString(system.one.dummy_0) String + Header: toString(dummy_0) String ReadFromStorage (SystemOne) Header: dummy UInt8 clickhouse-local --allow_experimental_analyzer=1 --query_kind initial_query -q explain plan header=1 select toString(dummy) as dummy from system.one group by dummy Expression ((Project names + Projection)) Header: dummy String Aggregating - Header: toString(system.one.dummy_0) String + Header: toString(dummy_0) String Expression ((Before GROUP BY + Change column names to column identifiers)) - Header: toString(system.one.dummy_0) String + Header: toString(dummy_0) String ReadFromStorage (SystemOne) Header: dummy UInt8 diff --git a/tests/queries/0_stateless/02381_join_dup_columns_in_plan.reference b/tests/queries/0_stateless/02381_join_dup_columns_in_plan.reference index 31a37862663..dd5c9d4616e 100644 --- a/tests/queries/0_stateless/02381_join_dup_columns_in_plan.reference +++ b/tests/queries/0_stateless/02381_join_dup_columns_in_plan.reference @@ -2,51 +2,51 @@ Expression Header: key String value String Join - Header: s1.key_0 String - s2.value_1 String + Header: key_0 String + value_1 String Expression - Header: s1.key_0 String + Header: key_0 String ReadFromStorage Header: dummy UInt8 Union - Header: s2.key_2 String - s2.value_1 String + Header: key_2 String + value_1 String Expression - Header: s2.key_2 String - s2.value_1 String + Header: key_2 String + value_1 String ReadFromStorage Header: dummy UInt8 Expression - Header: s2.key_2 String - s2.value_1 String + Header: key_2 String + value_1 String ReadFromStorage Header: dummy UInt8 Expression Header: key String value String Join - Header: s1.key_0 String - s2.key_2 String - s2.value_1 String + Header: key_0 String + key_2 String + value_1 String Sorting - Header: s1.key_0 String + Header: key_0 String Expression - Header: s1.key_0 String + Header: key_0 String ReadFromStorage Header: dummy UInt8 Sorting - Header: s2.key_2 String - s2.value_1 String + Header: key_2 String + value_1 String Union - Header: s2.key_2 String - s2.value_1 String + Header: key_2 String + value_1 String Expression - Header: s2.key_2 String - s2.value_1 String + Header: key_2 String + value_1 String ReadFromStorage Header: dummy UInt8 Expression - Header: s2.key_2 String - s2.value_1 String + Header: key_2 String + value_1 String ReadFromStorage Header: dummy UInt8 diff --git a/tests/queries/0_stateless/02421_explain_subquery.sql b/tests/queries/0_stateless/02421_explain_subquery.sql index 32631b54d0c..4b970f81219 100644 --- a/tests/queries/0_stateless/02421_explain_subquery.sql +++ b/tests/queries/0_stateless/02421_explain_subquery.sql @@ -34,7 +34,7 @@ DROP TABLE t1; SET allow_experimental_analyzer = 1; -SELECT count() > 3 FROM (EXPLAIN PIPELINE header = 1 SELECT * FROM system.numbers ORDER BY number DESC) WHERE explain LIKE '%Header: system.numbers.number__ UInt64%'; +SELECT count() > 3 FROM (EXPLAIN PIPELINE header = 1 SELECT * FROM system.numbers ORDER BY number DESC) WHERE explain LIKE '%Header: number__ UInt64%'; SELECT count() > 0 FROM (EXPLAIN PLAN SELECT * FROM system.numbers ORDER BY number DESC) WHERE explain ILIKE '%Sort%'; SELECT count() > 0 FROM (EXPLAIN SELECT * FROM system.numbers ORDER BY number DESC) WHERE explain ILIKE '%Sort%'; SELECT count() > 0 FROM (EXPLAIN CURRENT TRANSACTION); diff --git a/tests/queries/0_stateless/02451_order_by_monotonic.reference b/tests/queries/0_stateless/02451_order_by_monotonic.reference index f9f0ef38be1..05f20a9bad8 100644 --- a/tests/queries/0_stateless/02451_order_by_monotonic.reference +++ b/tests/queries/0_stateless/02451_order_by_monotonic.reference @@ -4,19 +4,19 @@ 2022-09-09 12:00:00 0x 2022-09-09 12:00:00 1 2022-09-09 12:00:00 1x - Prefix sort description: toStartOfMinute(test.t_0) ASC - Result sort description: toStartOfMinute(test.t_0) ASC, test.c1_1 ASC - Prefix sort description: toStartOfMinute(test.t_0) ASC - Result sort description: toStartOfMinute(test.t_0) ASC - Prefix sort description: negate(test.a_0) ASC - Result sort description: negate(test.a_0) ASC - Prefix sort description: negate(test.a_0) ASC, negate(test.b_1) ASC - Result sort description: negate(test.a_0) ASC, negate(test.b_1) ASC - Prefix sort description: test.a_0 DESC, negate(test.b_1) ASC - Result sort description: test.a_0 DESC, negate(test.b_1) ASC - Prefix sort description: negate(test.a_0) ASC, test.b_1 DESC - Result sort description: negate(test.a_0) ASC, test.b_1 DESC - Prefix sort description: negate(test.a_0) ASC - Result sort description: negate(test.a_0) ASC, test.b_1 ASC - Prefix sort description: test.a_0 ASC - Result sort description: test.a_0 ASC, negate(test.b_1) ASC + Prefix sort description: toStartOfMinute(t_0) ASC + Result sort description: toStartOfMinute(t_0) ASC, c1_1 ASC + Prefix sort description: toStartOfMinute(t_0) ASC + Result sort description: toStartOfMinute(t_0) ASC + Prefix sort description: negate(a_0) ASC + Result sort description: negate(a_0) ASC + Prefix sort description: negate(a_0) ASC, negate(b_1) ASC + Result sort description: negate(a_0) ASC, negate(b_1) ASC + Prefix sort description: a_0 DESC, negate(b_1) ASC + Result sort description: a_0 DESC, negate(b_1) ASC + Prefix sort description: negate(a_0) ASC, b_1 DESC + Result sort description: negate(a_0) ASC, b_1 DESC + Prefix sort description: negate(a_0) ASC + Result sort description: negate(a_0) ASC, b_1 ASC + Prefix sort description: a_0 ASC + Result sort description: a_0 ASC, negate(b_1) ASC diff --git a/tests/queries/0_stateless/02481_aggregation_in_order_plan.reference b/tests/queries/0_stateless/02481_aggregation_in_order_plan.reference index bb4eb4ddd75..b11f3e3a1d3 100644 --- a/tests/queries/0_stateless/02481_aggregation_in_order_plan.reference +++ b/tests/queries/0_stateless/02481_aggregation_in_order_plan.reference @@ -6,5 +6,5 @@ Order: a ASC, c ASC ReadFromMergeTree (default.tab) Aggregating - Order: default.tab.a_0 ASC, default.tab.c_2 ASC + Order: a_0 ASC, c_2 ASC ReadFromMergeTree (default.tab) diff --git a/tests/queries/0_stateless/02514_analyzer_drop_join_on.reference b/tests/queries/0_stateless/02514_analyzer_drop_join_on.reference index 8b4cafc3260..0037ab85c07 100644 --- a/tests/queries/0_stateless/02514_analyzer_drop_join_on.reference +++ b/tests/queries/0_stateless/02514_analyzer_drop_join_on.reference @@ -6,43 +6,43 @@ SELECT count() FROM a JOIN b ON b.b1 = a.a1 JOIN c ON c.c1 = b.b1 JOIN d ON d.d1 Expression ((Project names + Projection)) Header: count() UInt64 Aggregating - Header: default.a.a2_4 String + Header: a2_4 String count() UInt64 Expression ((Before GROUP BY + DROP unused columns after JOIN)) - Header: default.a.a2_4 String + Header: a2_4 String Join (JOIN FillRightFirst) - Header: default.a.a2_4 String - default.c.c1_2 UInt64 + Header: a2_4 String + c1_2 UInt64 Expression ((JOIN actions + DROP unused columns after JOIN)) - Header: default.a.a2_4 String - default.c.c1_2 UInt64 + Header: a2_4 String + c1_2 UInt64 Join (JOIN FillRightFirst) - Header: default.a.a2_4 String - default.b.b1_0 UInt64 - default.c.c1_2 UInt64 + Header: a2_4 String + b1_0 UInt64 + c1_2 UInt64 Expression ((JOIN actions + DROP unused columns after JOIN)) - Header: default.a.a2_4 String - default.b.b1_0 UInt64 + Header: a2_4 String + b1_0 UInt64 Join (JOIN FillRightFirst) - Header: default.a.a1_1 UInt64 - default.a.a2_4 String - default.b.b1_0 UInt64 + Header: a1_1 UInt64 + a2_4 String + b1_0 UInt64 Expression ((JOIN actions + Change column names to column identifiers)) - Header: default.a.a1_1 UInt64 - default.a.a2_4 String + Header: a1_1 UInt64 + a2_4 String ReadFromMemoryStorage Header: a1 UInt64 a2 String Expression ((JOIN actions + Change column names to column identifiers)) - Header: default.b.b1_0 UInt64 + Header: b1_0 UInt64 ReadFromMemoryStorage Header: b1 UInt64 Expression ((JOIN actions + Change column names to column identifiers)) - Header: default.c.c1_2 UInt64 + Header: c1_2 UInt64 ReadFromMemoryStorage Header: c1 UInt64 Expression ((JOIN actions + Change column names to column identifiers)) - Header: default.d.d1_3 UInt64 + Header: d1_3 UInt64 ReadFromMemoryStorage Header: d1 UInt64 EXPLAIN PLAN header = 1 @@ -52,38 +52,38 @@ Expression ((Project names + (Projection + DROP unused columns after JOIN))) Header: a2 String d2 String Join (JOIN FillRightFirst) - Header: default.a.a2_0 String - default.a.k_2 UInt64 - default.d.d2_1 String + Header: a2_0 String + k_2 UInt64 + d2_1 String Expression (DROP unused columns after JOIN) - Header: default.a.a2_0 String - default.a.k_2 UInt64 + Header: a2_0 String + k_2 UInt64 Join (JOIN FillRightFirst) - Header: default.a.a2_0 String - default.a.k_2 UInt64 + Header: a2_0 String + k_2 UInt64 Expression (DROP unused columns after JOIN) - Header: default.a.a2_0 String - default.a.k_2 UInt64 + Header: a2_0 String + k_2 UInt64 Join (JOIN FillRightFirst) - Header: default.a.a2_0 String - default.a.k_2 UInt64 + Header: a2_0 String + k_2 UInt64 Expression (Change column names to column identifiers) - Header: default.a.a2_0 String - default.a.k_2 UInt64 + Header: a2_0 String + k_2 UInt64 ReadFromMemoryStorage Header: a2 String k UInt64 Expression (Change column names to column identifiers) - Header: default.b.k_3 UInt64 + Header: k_3 UInt64 ReadFromMemoryStorage Header: k UInt64 Expression (Change column names to column identifiers) - Header: default.c.k_4 UInt64 + Header: k_4 UInt64 ReadFromMemoryStorage Header: k UInt64 Expression (Change column names to column identifiers) - Header: default.d.d2_1 String - default.d.k_5 UInt64 + Header: d2_1 String + k_5 UInt64 ReadFromMemoryStorage Header: d2 String k UInt64 @@ -97,55 +97,55 @@ WHERE c.c2 != '' ORDER BY a.a2 Expression (Project names) Header: bx String Sorting (Sorting for ORDER BY) - Header: default.a.a2_6 String - b.bx_0 String + Header: a2_6 String + bx_0 String Expression ((Before ORDER BY + (Projection + ))) - Header: default.a.a2_6 String - b.bx_0 String + Header: a2_6 String + bx_0 String Join (JOIN FillRightFirst) - Header: default.a.a2_6 String - b.bx_0 String - default.c.c2_5 String - default.c.c1_3 UInt64 + Header: a2_6 String + bx_0 String + c2_5 String + c1_3 UInt64 Filter (( + (JOIN actions + DROP unused columns after JOIN))) - Header: default.a.a2_6 String - b.bx_0 String - default.c.c2_5 String - default.c.c1_3 UInt64 + Header: a2_6 String + bx_0 String + c2_5 String + c1_3 UInt64 Join (JOIN FillRightFirst) - Header: default.a.a2_6 String - b.bx_0 String - b.b1_1 UInt64 - default.c.c2_5 String - default.c.c1_3 UInt64 + Header: a2_6 String + bx_0 String + b1_1 UInt64 + c2_5 String + c1_3 UInt64 Expression ((JOIN actions + DROP unused columns after JOIN)) - Header: default.a.a2_6 String - b.bx_0 String - b.b1_1 UInt64 + Header: a2_6 String + bx_0 String + b1_1 UInt64 Join (JOIN FillRightFirst) - Header: default.a.a1_2 UInt64 - default.a.a2_6 String - b.bx_0 String - b.b1_1 UInt64 + Header: a1_2 UInt64 + a2_6 String + bx_0 String + b1_1 UInt64 Expression ((JOIN actions + Change column names to column identifiers)) - Header: default.a.a1_2 UInt64 - default.a.a2_6 String + Header: a1_2 UInt64 + a2_6 String ReadFromMemoryStorage Header: a1 UInt64 a2 String Expression ((JOIN actions + (Change column names to column identifiers + (Project names + (Projection + Change column names to column identifiers))))) - Header: b.b1_1 UInt64 - b.bx_0 String + Header: b1_1 UInt64 + bx_0 String ReadFromMemoryStorage Header: b1 UInt64 b2 String Expression ((JOIN actions + Change column names to column identifiers)) - Header: default.c.c1_3 UInt64 - default.c.c2_5 String + Header: c1_3 UInt64 + c2_5 String ReadFromMemoryStorage Header: c1 UInt64 c2 String Expression ((JOIN actions + (Change column names to column identifiers + (Project names + (Projection + Change column names to column identifiers))))) - Header: d.d1_4 UInt64 + Header: d1_4 UInt64 ReadFromStorage (SystemNumbers) Header: number UInt64 diff --git a/tests/queries/0_stateless/02675_predicate_push_down_filled_join_fix.reference b/tests/queries/0_stateless/02675_predicate_push_down_filled_join_fix.reference index ecdb62c5cb5..986ecffcdf8 100644 --- a/tests/queries/0_stateless/02675_predicate_push_down_filled_join_fix.reference +++ b/tests/queries/0_stateless/02675_predicate_push_down_filled_join_fix.reference @@ -2,27 +2,27 @@ Expression ((Project names + (Projection + ))) Header: t1.id UInt64 t1.value String t2.value String -Actions: INPUT : 0 -> t1.id_0 UInt64 : 0 - INPUT : 1 -> t1.value_1 String : 1 - INPUT : 2 -> t2.value_2 String : 2 - ALIAS t1.id_0 :: 0 -> t1.id UInt64 : 3 - ALIAS t1.value_1 :: 1 -> t1.value String : 0 - ALIAS t2.value_2 :: 2 -> t2.value String : 1 +Actions: INPUT : 0 -> id_0 UInt64 : 0 + INPUT : 1 -> value_1 String : 1 + INPUT : 2 -> value_2 String : 2 + ALIAS id_0 :: 0 -> t1.id UInt64 : 3 + ALIAS value_1 :: 1 -> t1.value String : 0 + ALIAS value_2 :: 2 -> t2.value String : 1 Positions: 3 0 1 FilledJoin (Filled JOIN) - Header: t1.id_0 UInt64 - t1.value_1 String - t2.value_2 String + Header: id_0 UInt64 + value_1 String + value_2 String Filter (( + (JOIN actions + Change column names to column identifiers))) - Header: t1.id_0 UInt64 - t1.value_1 String - Filter column: equals(t1.id_0, 0_UInt8) (removed) + Header: id_0 UInt64 + value_1 String + Filter column: equals(id_0, 0_UInt8) (removed) Actions: INPUT : 0 -> id UInt64 : 0 INPUT : 1 -> value String : 1 COLUMN Const(UInt8) -> 0_UInt8 UInt8 : 2 - ALIAS id :: 0 -> t1.id_0 UInt64 : 3 - ALIAS value :: 1 -> t1.value_1 String : 0 - FUNCTION equals(t1.id_0 : 3, 0_UInt8 :: 2) -> equals(t1.id_0, 0_UInt8) UInt8 : 1 + ALIAS id :: 0 -> id_0 UInt64 : 3 + ALIAS value :: 1 -> value_1 String : 0 + FUNCTION equals(id_0 : 3, 0_UInt8 :: 2) -> equals(id_0, 0_UInt8) UInt8 : 1 Positions: 1 3 0 ReadFromMergeTree (default.test_table) Header: id UInt64 diff --git a/tests/queries/0_stateless/02679_explain_merge_tree_prewhere_row_policy.reference b/tests/queries/0_stateless/02679_explain_merge_tree_prewhere_row_policy.reference index 2fe98ea1682..cc16a1fce02 100644 --- a/tests/queries/0_stateless/02679_explain_merge_tree_prewhere_row_policy.reference +++ b/tests/queries/0_stateless/02679_explain_merge_tree_prewhere_row_policy.reference @@ -29,10 +29,10 @@ Header: id UInt64 value String Actions: INPUT : 0 -> id UInt64 : 0 INPUT : 1 -> value String : 1 - ALIAS id :: 0 -> default.test_table.id_0 UInt64 : 2 - ALIAS value :: 1 -> default.test_table.value_1 String : 0 - ALIAS default.test_table.id_0 :: 2 -> id UInt64 : 1 - ALIAS default.test_table.value_1 :: 0 -> value String : 2 + ALIAS id :: 0 -> id_0 UInt64 : 2 + ALIAS value :: 1 -> value_1 String : 0 + ALIAS id_0 :: 2 -> id UInt64 : 1 + ALIAS value_1 :: 0 -> value String : 2 Positions: 1 2 ReadFromMergeTree (default.test_table) Header: id UInt64 From 2c878581bf0642b7e7d8b3c63ac483f6f102639d Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Mon, 22 May 2023 17:20:53 +0000 Subject: [PATCH 0225/1072] Update 02377_optimize_sorting_by_input_stream_properties_explain reference --- ..._input_stream_properties_explain.reference | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/tests/queries/0_stateless/02377_optimize_sorting_by_input_stream_properties_explain.reference b/tests/queries/0_stateless/02377_optimize_sorting_by_input_stream_properties_explain.reference index 69c325c21a9..5c9e39805b7 100644 --- a/tests/queries/0_stateless/02377_optimize_sorting_by_input_stream_properties_explain.reference +++ b/tests/queries/0_stateless/02377_optimize_sorting_by_input_stream_properties_explain.reference @@ -8,7 +8,7 @@ Sorting (None) -- QUERY (analyzer): set optimize_read_in_order=1;set max_threads=3;set query_plan_remove_redundant_sorting=0;EXPLAIN PLAN actions=1, header=1, sorting=1 SELECT a FROM optimize_sorting ORDER BY a Sorting (Global): a ASC Sorting (Sorting for ORDER BY) -Sorting (Global): default.optimize_sorting.a_0 ASC +Sorting (Global): a_0 ASC Sorting (None) Sorting (None) -- disable optimization -> sorting order is NOT propagated from subquery -> full sort @@ -36,8 +36,8 @@ Sorting (Stream): a ASC -- QUERY (analyzer): set optimize_read_in_order=1;set max_threads=3;set query_plan_remove_redundant_sorting=0;EXPLAIN PLAN actions=1, header=1, sorting=1 SELECT a FROM optimize_sorting ORDER BY a Sorting (Global): a ASC Sorting (Sorting for ORDER BY) -Sorting (Global): default.optimize_sorting.a_0 ASC -Sorting (Stream): default.optimize_sorting.a_0 ASC +Sorting (Global): a_0 ASC +Sorting (Stream): a_0 ASC Sorting (Stream): a ASC -- QUERY: set optimize_read_in_order=1;set max_threads=3;set query_plan_remove_redundant_sorting=0;EXPLAIN PLAN actions=1, header=1, sorting=1 SELECT a FROM optimize_sorting ORDER BY a+1 Sorting (None) @@ -48,8 +48,8 @@ Sorting (Chunk): a ASC -- QUERY (analyzer): set optimize_read_in_order=1;set max_threads=3;set query_plan_remove_redundant_sorting=0;EXPLAIN PLAN actions=1, header=1, sorting=1 SELECT a FROM optimize_sorting ORDER BY a+1 Sorting (None) Sorting (Sorting for ORDER BY) -Sorting (Global): plus(default.optimize_sorting.a_0, 1_UInt8) ASC -Sorting (Chunk): default.optimize_sorting.a_0 ASC +Sorting (Global): plus(a_0, 1_UInt8) ASC +Sorting (Chunk): a_0 ASC Sorting (Chunk): a ASC -- ExpressionStep breaks sort mode -- QUERY: set optimize_read_in_order=1;set max_threads=3;set query_plan_remove_redundant_sorting=0;EXPLAIN PLAN actions=1, header=1, sorting=1 SELECT a+1 FROM optimize_sorting ORDER BY a+1 @@ -61,7 +61,7 @@ Sorting (Chunk): a ASC -- QUERY (analyzer): set optimize_read_in_order=1;set max_threads=3;set query_plan_remove_redundant_sorting=0;EXPLAIN PLAN actions=1, header=1, sorting=1 SELECT a+1 FROM optimize_sorting ORDER BY a+1 Sorting (Global): plus(a, 1) ASC Sorting (Sorting for ORDER BY) -Sorting (Global): plus(default.optimize_sorting.a_0, 1_UInt8) ASC +Sorting (Global): plus(a_0, 1_UInt8) ASC Sorting (None) Sorting (Chunk): a ASC -- FilterStep preserves sort mode @@ -71,7 +71,7 @@ Sorting (Chunk): a ASC Sorting (Chunk): a ASC -- QUERY (analyzer): set optimize_read_in_order=1;set max_threads=3;set query_plan_remove_redundant_sorting=0;EXPLAIN PLAN actions=1, header=1, sorting=1 SELECT a FROM optimize_sorting WHERE a > 0 Sorting (Chunk): a ASC -Sorting (Chunk): default.optimize_sorting.a_0 ASC +Sorting (Chunk): a_0 ASC Sorting (Chunk): a ASC -- QUERY: set optimize_read_in_order=1;set max_threads=3;set query_plan_remove_redundant_sorting=0;EXPLAIN PLAN actions=1, header=1, sorting=1 SELECT a FROM optimize_sorting WHERE a+1 > 0 Sorting (Chunk): a ASC @@ -79,7 +79,7 @@ Sorting (Chunk): a ASC Sorting (Chunk): a ASC -- QUERY (analyzer): set optimize_read_in_order=1;set max_threads=3;set query_plan_remove_redundant_sorting=0;EXPLAIN PLAN actions=1, header=1, sorting=1 SELECT a FROM optimize_sorting WHERE a+1 > 0 Sorting (Chunk): a ASC -Sorting (Chunk): default.optimize_sorting.a_0 ASC +Sorting (Chunk): a_0 ASC Sorting (Chunk): a ASC -- QUERY: set optimize_read_in_order=1;set max_threads=3;set query_plan_remove_redundant_sorting=0;EXPLAIN PLAN actions=1, header=1, sorting=1 SELECT a, a+1 FROM optimize_sorting WHERE a+1 > 0 Sorting (Chunk): a ASC @@ -87,7 +87,7 @@ Sorting (Chunk): a ASC Sorting (Chunk): a ASC -- QUERY (analyzer): set optimize_read_in_order=1;set max_threads=3;set query_plan_remove_redundant_sorting=0;EXPLAIN PLAN actions=1, header=1, sorting=1 SELECT a, a+1 FROM optimize_sorting WHERE a+1 > 0 Sorting (Chunk): a ASC -Sorting (Chunk): default.optimize_sorting.a_0 ASC +Sorting (Chunk): a_0 ASC Sorting (Chunk): a ASC -- FilterStep breaks sort mode -- QUERY: set optimize_read_in_order=1;set max_threads=3;set query_plan_remove_redundant_sorting=0;EXPLAIN PLAN actions=1, header=1, sorting=1 SELECT a > 0 FROM optimize_sorting WHERE a > 0 @@ -122,8 +122,8 @@ Sorting (Sorting for ORDER BY) Sorting (Global): a_0 ASC Sorting (None) Sorting (Sorting for ORDER BY) -Sorting (Global): default.optimize_sorting.a_2 ASC -Sorting (Stream): default.optimize_sorting.a_2 ASC +Sorting (Global): a_2 ASC +Sorting (Stream): a_2 ASC Sorting (Stream): a ASC -- aliases DONT break sorting order -- QUERY: set optimize_read_in_order=1;set max_threads=3;set query_plan_remove_redundant_sorting=0;EXPLAIN PLAN actions=1, header=1, sorting=1 SELECT a, b FROM (SELECT x AS a, y AS b FROM (SELECT a AS x, b AS y FROM optimize_sorting) ORDER BY x, y) @@ -152,10 +152,10 @@ Sorting (Chunk): a ASC Sorting (None) Sorting (Sorting for ORDER BY) Sorting (Global): plus(a_0, 1_UInt8) ASC -Sorting (Global): plus(default.optimize_sorting.a_3, 1_UInt8) ASC +Sorting (Global): plus(a_3, 1_UInt8) ASC Sorting (Sorting for ORDER BY) -Sorting (Global): plus(default.optimize_sorting.a_3, 1_UInt8) ASC -Sorting (Chunk): default.optimize_sorting.a_3 ASC +Sorting (Global): plus(a_3, 1_UInt8) ASC +Sorting (Chunk): a_3 ASC Sorting (Chunk): a ASC -- check that correct sorting info is provided in case of only prefix of sorting key is in ORDER BY clause but all sorting key columns returned by query -- QUERY: set optimize_read_in_order=1;set max_threads=3;set query_plan_remove_redundant_sorting=0;EXPLAIN PLAN sorting=1 SELECT a, b FROM optimize_sorting ORDER BY a @@ -167,6 +167,6 @@ Sorting (Stream): a ASC -- QUERY (analyzer): set optimize_read_in_order=1;set max_threads=3;set query_plan_remove_redundant_sorting=0;EXPLAIN PLAN sorting=1 SELECT a, b FROM optimize_sorting ORDER BY a Sorting (Global): a ASC Sorting (Sorting for ORDER BY) -Sorting (Global): default.optimize_sorting.a_0 ASC -Sorting (Stream): default.optimize_sorting.a_0 ASC +Sorting (Global): a_0 ASC +Sorting (Stream): a_0 ASC Sorting (Stream): a ASC From d79bd5694ad6345e41aa640afb4d839c46da716d Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Wed, 24 May 2023 00:05:27 +0200 Subject: [PATCH 0226/1072] Fix LambdaNode::cloneImpl --- src/Analyzer/LambdaNode.cpp | 5 +++-- src/Analyzer/LambdaNode.h | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/Analyzer/LambdaNode.cpp b/src/Analyzer/LambdaNode.cpp index 0d15c4d42e6..4be4d69c190 100644 --- a/src/Analyzer/LambdaNode.cpp +++ b/src/Analyzer/LambdaNode.cpp @@ -10,9 +10,10 @@ namespace DB { -LambdaNode::LambdaNode(Names argument_names_, QueryTreeNodePtr expression_) +LambdaNode::LambdaNode(Names argument_names_, QueryTreeNodePtr expression_, DataTypePtr result_type_) : IQueryTreeNode(children_size) , argument_names(std::move(argument_names_)) + , result_type(std::move(result_type_)) { auto arguments_list_node = std::make_shared(); auto & nodes = arguments_list_node->getNodes(); @@ -63,7 +64,7 @@ void LambdaNode::updateTreeHashImpl(HashState & state) const QueryTreeNodePtr LambdaNode::cloneImpl() const { - return std::make_shared(argument_names, getExpression()); + return std::make_shared(argument_names, getExpression(), result_type); } ASTPtr LambdaNode::toASTImpl(const ConvertToASTOptions & options) const diff --git a/src/Analyzer/LambdaNode.h b/src/Analyzer/LambdaNode.h index 355ed77cc6a..ea44a7e8187 100644 --- a/src/Analyzer/LambdaNode.h +++ b/src/Analyzer/LambdaNode.h @@ -35,7 +35,7 @@ class LambdaNode final : public IQueryTreeNode { public: /// Initialize lambda with argument names and lambda body expression - explicit LambdaNode(Names argument_names_, QueryTreeNodePtr expression_); + explicit LambdaNode(Names argument_names_, QueryTreeNodePtr expression_, DataTypePtr result_type_ = {}); /// Get argument names const Names & getArgumentNames() const From b86516131bef352b114e67c17b77706496bb0fd9 Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Wed, 24 May 2023 16:22:58 +0000 Subject: [PATCH 0227/1072] Attempt to fix global JOINs and INs --- src/Storages/StorageDistributed.cpp | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/src/Storages/StorageDistributed.cpp b/src/Storages/StorageDistributed.cpp index e0bb14c62fd..05f9821f6cb 100644 --- a/src/Storages/StorageDistributed.cpp +++ b/src/Storages/StorageDistributed.cpp @@ -30,6 +30,8 @@ #include #include #include +#include "Analyzer/IQueryTreeNode.h" +#include "Analyzer/MatcherNode.h" #include #include @@ -1003,7 +1005,17 @@ QueryTreeNodePtr buildQueryTreeDistributed(SelectQueryInfo & query_info, planner_context->getMutableQueryContext(), global_in_or_join_node.subquery_depth); temporary_table_expression_node->setAlias(join_right_table_expression->getAlias()); - replacement_map.emplace(join_right_table_expression.get(), std::move(temporary_table_expression_node)); + + auto in_second_argument_query_node = std::make_shared(Context::createCopy(query_context)); + in_second_argument_query_node->setIsSubquery(true); + in_second_argument_query_node->getProjectionNode() = std::make_shared(); + in_second_argument_query_node->getProjection().getNodes() = { std::make_shared() }; + in_second_argument_query_node->getJoinTree() = std::move(temporary_table_expression_node); + + QueryAnalysisPass query_analysis_pass; + query_analysis_pass.run(in_second_argument_query_node, query_context); + + replacement_map.emplace(join_right_table_expression.get(), std::move(in_second_argument_query_node)); continue; } else if (auto * in_function_node = global_in_or_join_node.query_node->as()) @@ -1016,7 +1028,17 @@ QueryTreeNodePtr buildQueryTreeDistributed(SelectQueryInfo & query_info, auto temporary_table_expression_node = executeSubqueryNode(in_function_subquery_node, planner_context->getMutableQueryContext(), global_in_or_join_node.subquery_depth); - in_function_subquery_node = std::move(temporary_table_expression_node); + + auto in_second_argument_query_node = std::make_shared(Context::createCopy(query_context)); + in_second_argument_query_node->setIsSubquery(true); + in_second_argument_query_node->getProjectionNode() = std::make_shared(); + in_second_argument_query_node->getProjection().getNodes() = { std::make_shared() }; + in_second_argument_query_node->getJoinTree() = std::move(temporary_table_expression_node); + + QueryAnalysisPass query_analysis_pass; + query_analysis_pass.run(in_second_argument_query_node, query_context); + + in_function_subquery_node = std::move(in_second_argument_query_node); } else { @@ -1059,9 +1081,8 @@ void StorageDistributed::read( storage_snapshot, remote_storage_id, remote_table_function_ptr); - - query_ast = queryNodeToSelectQuery(query_tree_distributed); header = InterpreterSelectQueryAnalyzer::getSampleBlock(query_tree_distributed, local_context, SelectQueryOptions(processed_stage).analyze()); + query_ast = queryNodeToSelectQuery(query_tree_distributed); } else { From 85e5ed79e5b60f00df2cf7d8c41b249485c02547 Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Thu, 25 May 2023 15:58:08 +0000 Subject: [PATCH 0228/1072] Fix distributed JOINs --- src/Storages/StorageDistributed.cpp | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/src/Storages/StorageDistributed.cpp b/src/Storages/StorageDistributed.cpp index 05f9821f6cb..98c92541f67 100644 --- a/src/Storages/StorageDistributed.cpp +++ b/src/Storages/StorageDistributed.cpp @@ -1006,16 +1006,7 @@ QueryTreeNodePtr buildQueryTreeDistributed(SelectQueryInfo & query_info, global_in_or_join_node.subquery_depth); temporary_table_expression_node->setAlias(join_right_table_expression->getAlias()); - auto in_second_argument_query_node = std::make_shared(Context::createCopy(query_context)); - in_second_argument_query_node->setIsSubquery(true); - in_second_argument_query_node->getProjectionNode() = std::make_shared(); - in_second_argument_query_node->getProjection().getNodes() = { std::make_shared() }; - in_second_argument_query_node->getJoinTree() = std::move(temporary_table_expression_node); - - QueryAnalysisPass query_analysis_pass; - query_analysis_pass.run(in_second_argument_query_node, query_context); - - replacement_map.emplace(join_right_table_expression.get(), std::move(in_second_argument_query_node)); + replacement_map.emplace(join_right_table_expression.get(), std::move(temporary_table_expression_node)); continue; } else if (auto * in_function_node = global_in_or_join_node.query_node->as()) From bc7b7f2cd2e1c56aacecd2855feb37bac10590c6 Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Thu, 25 May 2023 16:00:13 +0000 Subject: [PATCH 0229/1072] Always add source alias in ColumnNode::toASTImpl --- src/Analyzer/ColumnNode.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/Analyzer/ColumnNode.cpp b/src/Analyzer/ColumnNode.cpp index f020040ea78..76f5697afb0 100644 --- a/src/Analyzer/ColumnNode.cpp +++ b/src/Analyzer/ColumnNode.cpp @@ -96,7 +96,7 @@ ASTPtr ColumnNode::toASTImpl(const ConvertToASTOptions & options) const std::vector column_identifier_parts; auto column_source = getColumnSourceOrNull(); - if (column_source && options.fully_qualified_identifiers) + if (column_source) { auto node_type = column_source->getNodeType(); if (node_type == QueryTreeNodeType::TABLE || @@ -108,7 +108,8 @@ ASTPtr ColumnNode::toASTImpl(const ConvertToASTOptions & options) const { column_identifier_parts = {column_source->getAlias()}; } - else if (auto * table_node = column_source->as()) + else if (auto * table_node = column_source->as(); + table_node && options.fully_qualified_identifiers) { if (!table_node->getTemporaryTableName().empty()) { From c6dcb69b853528aacda070d00b3a873179470f82 Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Tue, 30 May 2023 14:33:35 +0000 Subject: [PATCH 0230/1072] Fix GLOBAL IN --- src/Analyzer/Passes/QueryAnalysisPass.cpp | 9 +++--- src/Planner/CollectSets.cpp | 3 +- src/Planner/Planner.cpp | 35 +++++++++++++++++++++-- src/Planner/PlannerContext.cpp | 3 +- src/Storages/StorageDistributed.cpp | 14 ++------- 5 files changed, 45 insertions(+), 19 deletions(-) diff --git a/src/Analyzer/Passes/QueryAnalysisPass.cpp b/src/Analyzer/Passes/QueryAnalysisPass.cpp index b2bfa648435..c454ad9f84f 100644 --- a/src/Analyzer/Passes/QueryAnalysisPass.cpp +++ b/src/Analyzer/Passes/QueryAnalysisPass.cpp @@ -4767,13 +4767,14 @@ ProjectionNames QueryAnalyzer::resolveFunction(QueryTreeNodePtr & node, Identifi auto * table_node = in_second_argument->as(); auto * table_function_node = in_second_argument->as(); - if (table_node && dynamic_cast(table_node->getStorage().get()) != nullptr) + if (table_node) { - /// If table is already prepared set, we do not replace it with subquery + /// If table is already prepared set, we do not replace it with subquery. + /// If table is not a StorageSet, we'll create plan to build set in the Planner. } - else if (table_node || table_function_node) + else if (table_function_node) { - const auto & storage_snapshot = table_node ? table_node->getStorageSnapshot() : table_function_node->getStorageSnapshot(); + const auto & storage_snapshot = table_function_node->getStorageSnapshot(); auto columns_to_select = storage_snapshot->getColumns(GetColumnsOptions(GetColumnsOptions::Ordinary)); size_t columns_to_select_size = columns_to_select.size(); diff --git a/src/Planner/CollectSets.cpp b/src/Planner/CollectSets.cpp index 02069aad292..eb2b02c7ccb 100644 --- a/src/Planner/CollectSets.cpp +++ b/src/Planner/CollectSets.cpp @@ -67,7 +67,8 @@ public: planner_context.registerSet(set_key, PlannerSet(FutureSet(std::move(set)))); } else if (in_second_argument_node_type == QueryTreeNodeType::QUERY || - in_second_argument_node_type == QueryTreeNodeType::UNION) + in_second_argument_node_type == QueryTreeNodeType::UNION || + in_second_argument_node_type == QueryTreeNodeType::TABLE) { planner_context.registerSet(set_key, PlannerSet(in_second_argument)); } diff --git a/src/Planner/Planner.cpp b/src/Planner/Planner.cpp index 7292f73e21c..5abf3ec5a80 100644 --- a/src/Planner/Planner.cpp +++ b/src/Planner/Planner.cpp @@ -43,6 +43,7 @@ #include #include +#include #include #include #include @@ -909,12 +910,42 @@ void addBuildSubqueriesForSetsStepIfNeeded(QueryPlan & query_plan, if (!planner_set) continue; - if (planner_set->getSet().isCreated() || !planner_set->getSubqueryNode()) + auto subquery_to_execute = planner_set->getSubqueryNode(); + + if (planner_set->getSet().isCreated() || !subquery_to_execute) continue; + if (auto * table_node = subquery_to_execute->as()) + { + auto storage_snapshot = table_node->getStorageSnapshot(); + auto columns_to_select = storage_snapshot->getColumns(GetColumnsOptions(GetColumnsOptions::Ordinary)); + + size_t columns_to_select_size = columns_to_select.size(); + + auto column_nodes_to_select = std::make_shared(); + column_nodes_to_select->getNodes().reserve(columns_to_select_size); + + NamesAndTypes projection_columns; + projection_columns.reserve(columns_to_select_size); + + for (auto & column : columns_to_select) + { + column_nodes_to_select->getNodes().emplace_back(std::make_shared(column, subquery_to_execute)); + projection_columns.emplace_back(column.name, column.type); + } + + auto subquery_for_table = std::make_shared(Context::createCopy(planner_context->getQueryContext())); + subquery_for_table->setIsSubquery(true); + subquery_for_table->getProjectionNode() = std::move(column_nodes_to_select); + subquery_for_table->getJoinTree() = std::move(subquery_to_execute); + subquery_for_table->resolveProjectionColumns(std::move(projection_columns)); + + subquery_to_execute = std::move(subquery_for_table); + } + auto subquery_options = select_query_options.subquery(); Planner subquery_planner( - planner_set->getSubqueryNode(), + subquery_to_execute, subquery_options, planner_context->getGlobalPlannerContext()); subquery_planner.buildQueryPlanIfNeeded(); diff --git a/src/Planner/PlannerContext.cpp b/src/Planner/PlannerContext.cpp index a788a6cbc3c..708dab04d02 100644 --- a/src/Planner/PlannerContext.cpp +++ b/src/Planner/PlannerContext.cpp @@ -129,7 +129,8 @@ void PlannerContext::registerSet(const SetKey & key, PlannerSet planner_set) auto node_type = subquery_node->getNodeType(); if (node_type != QueryTreeNodeType::QUERY && - node_type != QueryTreeNodeType::UNION) + node_type != QueryTreeNodeType::UNION && + node_type != QueryTreeNodeType::TABLE) throw Exception(ErrorCodes::LOGICAL_ERROR, "Invalid node for set table expression. Expected query or union. Actual {}", subquery_node->formatASTForErrorMessage()); diff --git a/src/Storages/StorageDistributed.cpp b/src/Storages/StorageDistributed.cpp index 98c92541f67..f1fb4bb0c65 100644 --- a/src/Storages/StorageDistributed.cpp +++ b/src/Storages/StorageDistributed.cpp @@ -849,7 +849,7 @@ private: /** Execute subquery node and put result in mutable context temporary table. * Returns table node that is initialized with temporary table storage. */ -QueryTreeNodePtr executeSubqueryNode(const QueryTreeNodePtr & subquery_node, +TableNodePtr executeSubqueryNode(const QueryTreeNodePtr & subquery_node, ContextMutablePtr & mutable_context, size_t subquery_depth) { @@ -1019,17 +1019,9 @@ QueryTreeNodePtr buildQueryTreeDistributed(SelectQueryInfo & query_info, auto temporary_table_expression_node = executeSubqueryNode(in_function_subquery_node, planner_context->getMutableQueryContext(), global_in_or_join_node.subquery_depth); + temporary_table_expression_node->setAlias(temporary_table_expression_node->getTemporaryTableName()); - auto in_second_argument_query_node = std::make_shared(Context::createCopy(query_context)); - in_second_argument_query_node->setIsSubquery(true); - in_second_argument_query_node->getProjectionNode() = std::make_shared(); - in_second_argument_query_node->getProjection().getNodes() = { std::make_shared() }; - in_second_argument_query_node->getJoinTree() = std::move(temporary_table_expression_node); - - QueryAnalysisPass query_analysis_pass; - query_analysis_pass.run(in_second_argument_query_node, query_context); - - in_function_subquery_node = std::move(in_second_argument_query_node); + in_function_subquery_node = std::move(temporary_table_expression_node); } else { From eb7ae91d0144a895ef53c862c79d930fe2cbdbab Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Wed, 31 May 2023 15:00:11 +0000 Subject: [PATCH 0231/1072] Do not add alias to a temporary table --- src/Storages/StorageDistributed.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Storages/StorageDistributed.cpp b/src/Storages/StorageDistributed.cpp index f1fb4bb0c65..43b1333413e 100644 --- a/src/Storages/StorageDistributed.cpp +++ b/src/Storages/StorageDistributed.cpp @@ -1019,7 +1019,6 @@ QueryTreeNodePtr buildQueryTreeDistributed(SelectQueryInfo & query_info, auto temporary_table_expression_node = executeSubqueryNode(in_function_subquery_node, planner_context->getMutableQueryContext(), global_in_or_join_node.subquery_depth); - temporary_table_expression_node->setAlias(temporary_table_expression_node->getTemporaryTableName()); in_function_subquery_node = std::move(temporary_table_expression_node); } From 99b35eca0789cd9e3a055697486532bc1d4403d1 Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Wed, 31 May 2023 15:05:48 +0000 Subject: [PATCH 0232/1072] Update reference files --- .../01561_clickhouse_client_stage.reference | 2 +- .../0_stateless/01591_window_functions.reference | 14 +++++++------- .../02048_clickhouse_local_stage.reference | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/queries/0_stateless/01561_clickhouse_client_stage.reference b/tests/queries/0_stateless/01561_clickhouse_client_stage.reference index 00e0f4ddb2e..8a34751b071 100644 --- a/tests/queries/0_stateless/01561_clickhouse_client_stage.reference +++ b/tests/queries/0_stateless/01561_clickhouse_client_stage.reference @@ -2,7 +2,7 @@ execute: --allow_experimental_analyzer=1 "foo" 1 execute: --allow_experimental_analyzer=1 --stage fetch_columns -"system.one.dummy_0" +"dummy_0" 0 execute: --allow_experimental_analyzer=1 --stage with_mergeable_state "1_UInt8" diff --git a/tests/queries/0_stateless/01591_window_functions.reference b/tests/queries/0_stateless/01591_window_functions.reference index b981a46b4fd..8939ea1111d 100644 --- a/tests/queries/0_stateless/01591_window_functions.reference +++ b/tests/queries/0_stateless/01591_window_functions.reference @@ -917,9 +917,9 @@ from ; Expression ((Project names + Projection)) Window (Window step for window \'\') - Window (Window step for window \'PARTITION BY t.p_0\') - Window (Window step for window \'PARTITION BY t.p_0 ORDER BY t.o_1 ASC\') - Sorting (Sorting for window \'PARTITION BY t.p_0 ORDER BY t.o_1 ASC\') + Window (Window step for window \'PARTITION BY p_0\') + Window (Window step for window \'PARTITION BY p_0 ORDER BY o_1 ASC\') + Sorting (Sorting for window \'PARTITION BY p_0 ORDER BY o_1 ASC\') Expression ((Before WINDOW + (Change column names to column identifiers + (Project names + (Projection + Change column names to column identifiers))))) ReadFromStorage (SystemNumbers) explain select @@ -930,11 +930,11 @@ from from numbers(16)) t ; Expression ((Project names + Projection)) - Window (Window step for window \'ORDER BY t.o_0 ASC, t.number_1 ASC\') - Sorting (Sorting for window \'ORDER BY t.o_0 ASC, t.number_1 ASC\') - Window (Window step for window \'ORDER BY t.number_1 ASC\') + Window (Window step for window \'ORDER BY o_0 ASC, number_1 ASC\') + Sorting (Sorting for window \'ORDER BY o_0 ASC, number_1 ASC\') + Window (Window step for window \'ORDER BY number_1 ASC\') Expression ((Before WINDOW + (Change column names to column identifiers + (Project names + (Projection + Change column names to column identifiers)))) [lifted up part]) - Sorting (Sorting for window \'ORDER BY t.number_1 ASC\') + Sorting (Sorting for window \'ORDER BY number_1 ASC\') Expression ((Before WINDOW + (Change column names to column identifiers + (Project names + (Projection + Change column names to column identifiers))))) ReadFromStorage (SystemNumbers) -- A test case for the sort comparator found by fuzzer. diff --git a/tests/queries/0_stateless/02048_clickhouse_local_stage.reference b/tests/queries/0_stateless/02048_clickhouse_local_stage.reference index 00e0f4ddb2e..8a34751b071 100644 --- a/tests/queries/0_stateless/02048_clickhouse_local_stage.reference +++ b/tests/queries/0_stateless/02048_clickhouse_local_stage.reference @@ -2,7 +2,7 @@ execute: --allow_experimental_analyzer=1 "foo" 1 execute: --allow_experimental_analyzer=1 --stage fetch_columns -"system.one.dummy_0" +"dummy_0" 0 execute: --allow_experimental_analyzer=1 --stage with_mergeable_state "1_UInt8" From 883350d5c221c22bef476a62857ae7e8f692dcbf Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 1 Jun 2023 14:51:03 +0000 Subject: [PATCH 0233/1072] Fix tests --- src/Formats/CapnProtoSchema.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Formats/CapnProtoSchema.cpp b/src/Formats/CapnProtoSchema.cpp index f9ab88d39ed..559047a6f8d 100644 --- a/src/Formats/CapnProtoSchema.cpp +++ b/src/Formats/CapnProtoSchema.cpp @@ -43,7 +43,7 @@ capnp::StructSchema CapnProtoSchemaParser::getMessageSchema(const FormatSchemaIn /// That's not good to determine the type of error by its description, but /// this is the only way to do it here, because kj doesn't specify the type of error. auto description = std::string_view(e.getDescription().cStr()); - if (description.find("No such file or directory") != String::npos || description.find("no such directory") != String::npos) + if (description.find("No such file or directory") != String::npos || description.find("no such directory") != String::npos || description.find("no such file") != String::npos) throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "Cannot open CapnProto schema, file {} doesn't exists", schema_info.absoluteSchemaPath()); if (description.find("Parse error") != String::npos) From f99a7366da356f145ee3ff124c4be80b4f5e903b Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Thu, 1 Jun 2023 16:56:18 +0200 Subject: [PATCH 0234/1072] Fix tests --- src/Storages/HDFS/StorageHDFS.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Storages/HDFS/StorageHDFS.cpp b/src/Storages/HDFS/StorageHDFS.cpp index 08114ed3cba..79b7b65adb4 100644 --- a/src/Storages/HDFS/StorageHDFS.cpp +++ b/src/Storages/HDFS/StorageHDFS.cpp @@ -233,6 +233,7 @@ ColumnsDescription StorageHDFS::getTableStructureFromData( if (ctx->getSettingsRef().hdfs_skip_empty_files && path_with_info.info && path_with_info.info->size == 0) return read_buffer_iterator(columns); + first = false; auto compression = chooseCompressionMethod(path_with_info.path, compression_method); auto impl = std::make_unique(my_uri_without_path, path_with_info.path, ctx->getGlobalContext()->getConfigRef(), ctx->getReadSettings()); const Int64 zstd_window_log_max = ctx->getSettingsRef().zstd_window_log_max; From d9113a3b757841b9b956b1d38b479af61be37b72 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Thu, 1 Jun 2023 16:57:43 +0200 Subject: [PATCH 0235/1072] Style --- src/Storages/StorageS3.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index f3cad4de31a..29f0a747372 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -424,7 +424,6 @@ public: , bucket(bucket_) , query(query_) , virtual_header(virtual_header_) - { Strings all_keys = keys_; From 480db8622df10993a2df31e24255f96f58ef0094 Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Thu, 1 Jun 2023 15:21:55 +0000 Subject: [PATCH 0236/1072] Always add table name while converting ColumnNode to AST --- src/Analyzer/ColumnNode.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/Analyzer/ColumnNode.cpp b/src/Analyzer/ColumnNode.cpp index 76f5697afb0..dd41522ac7d 100644 --- a/src/Analyzer/ColumnNode.cpp +++ b/src/Analyzer/ColumnNode.cpp @@ -108,8 +108,7 @@ ASTPtr ColumnNode::toASTImpl(const ConvertToASTOptions & options) const { column_identifier_parts = {column_source->getAlias()}; } - else if (auto * table_node = column_source->as(); - table_node && options.fully_qualified_identifiers) + else if (auto * table_node = column_source->as()) { if (!table_node->getTemporaryTableName().empty()) { @@ -118,7 +117,7 @@ ASTPtr ColumnNode::toASTImpl(const ConvertToASTOptions & options) const else { const auto & table_storage_id = table_node->getStorageID(); - if (table_storage_id.hasDatabase()) + if (table_storage_id.hasDatabase() && options.fully_qualified_identifiers) column_identifier_parts = { table_storage_id.getDatabaseName(), table_storage_id.getTableName() }; else column_identifier_parts = { table_storage_id.getTableName() }; From bdb192cf2742d6f3059f621068bbc59d78124229 Mon Sep 17 00:00:00 2001 From: vdimir Date: Thu, 1 Jun 2023 15:43:37 +0000 Subject: [PATCH 0237/1072] Test right join in 02725_any_join_single_row, style code changes --- src/Interpreters/HashJoin.cpp | 14 ++++---- .../02725_any_join_single_row.reference | 3 ++ .../0_stateless/02725_any_join_single_row.sql | 33 ++++++++++++++----- 3 files changed, 34 insertions(+), 16 deletions(-) diff --git a/src/Interpreters/HashJoin.cpp b/src/Interpreters/HashJoin.cpp index 436ecd382cd..9306c9b99eb 100644 --- a/src/Interpreters/HashJoin.cpp +++ b/src/Interpreters/HashJoin.cpp @@ -556,7 +556,7 @@ namespace return false; } - static ALWAYS_INLINE bool insertAll(const HashJoin &, Map & map, KeyGetter & key_getter, Block * stored_block, size_t i, Arena & pool) + static ALWAYS_INLINE void insertAll(const HashJoin &, Map & map, KeyGetter & key_getter, Block * stored_block, size_t i, Arena & pool) { auto emplace_result = key_getter.emplaceKey(map, i, pool); @@ -567,10 +567,9 @@ namespace /// The first element of the list is stored in the value of the hash table, the rest in the pool. emplace_result.getMapped().insert({stored_block, i}, pool); } - return true; } - static ALWAYS_INLINE bool insertAsof(HashJoin & join, Map & map, KeyGetter & key_getter, Block * stored_block, size_t i, Arena & pool, + static ALWAYS_INLINE void insertAsof(HashJoin & join, Map & map, KeyGetter & key_getter, Block * stored_block, size_t i, Arena & pool, const IColumn & asof_column) { auto emplace_result = key_getter.emplaceKey(map, i, pool); @@ -580,7 +579,6 @@ namespace if (emplace_result.isInserted()) time_series_map = new (time_series_map) typename Map::mapped_type(createAsofRowRef(asof_type, join.getAsofInequality())); (*time_series_map)->insert(asof_column, stored_block, i); - return true; } }; @@ -599,7 +597,9 @@ namespace auto key_getter = createKeyGetter(key_columns, key_sizes); - is_inserted = false; + /// For ALL and ASOF join always insert values + is_inserted = !mapped_one || is_asof_join; + for (size_t i = 0; i < rows; ++i) { if (has_null_map && (*null_map)[i]) @@ -615,11 +615,11 @@ namespace continue; if constexpr (is_asof_join) - is_inserted |= Inserter::insertAsof(join, map, key_getter, stored_block, i, pool, *asof_column); + Inserter::insertAsof(join, map, key_getter, stored_block, i, pool, *asof_column); else if constexpr (mapped_one) is_inserted |= Inserter::insertOne(join, map, key_getter, stored_block, i, pool); else - is_inserted |= Inserter::insertAll(join, map, key_getter, stored_block, i, pool); + Inserter::insertAll(join, map, key_getter, stored_block, i, pool); } return map.getBufferSizeInCells(); } diff --git a/tests/queries/0_stateless/02725_any_join_single_row.reference b/tests/queries/0_stateless/02725_any_join_single_row.reference index 5d748fc6dbb..1e940bdc71e 100644 --- a/tests/queries/0_stateless/02725_any_join_single_row.reference +++ b/tests/queries/0_stateless/02725_any_join_single_row.reference @@ -1,3 +1,6 @@ Join(ANY, LEFT, key) 0 1 Join(ANY, LEFT, key) 1 1 Join(ANY, LEFT, key) 1 1 +1 +1 +1 diff --git a/tests/queries/0_stateless/02725_any_join_single_row.sql b/tests/queries/0_stateless/02725_any_join_single_row.sql index 5e5c959c278..f7ddd2f402b 100644 --- a/tests/queries/0_stateless/02725_any_join_single_row.sql +++ b/tests/queries/0_stateless/02725_any_join_single_row.sql @@ -1,26 +1,41 @@ -CREATE TABLE join_test -( - `key` UInt64, - `value` UInt64 -) -ENGINE = Join(ANY, LEFT, key); +DROP TABLE IF EXISTS join_test; +DROP TABLE IF EXISTS join_test_right; +CREATE TABLE join_test ( `key` UInt64, `value` UInt64 ) ENGINE = Join(ANY, LEFT, key); + +-- Save table size before inserting any rows CREATE TEMPORARY TABLE initial_table_size AS SELECT engine_full, total_rows, total_bytes FROM system.tables WHERE (name = 'join_test') AND (database = currentDatabase()); +-- Check that table size is less than 100K SELECT engine_full, total_rows, total_bytes < 100_000 FROM initial_table_size; INSERT INTO join_test (key, value) SELECT 1, number FROM numbers(1); - +-- Save table size after inserting one row CREATE TEMPORARY TABLE one_row_table_size AS SELECT engine_full, total_rows, total_bytes FROM system.tables WHERE (name = 'join_test') AND (database = currentDatabase()); +-- Check that table size is less than 2x after inserting one row SELECT engine_full, total_rows, total_bytes < 2 * (SELECT total_bytes FROM initial_table_size) FROM one_row_table_size; -INSERT INTO join_test (key, value) SELECT 1, number FROM numbers(1); -INSERT INTO join_test (key, value) SELECT 1, number FROM numbers(1); +-- Insert some more rows with the same key INSERT INTO join_test (key, value) SELECT 1, number FROM numbers(1); INSERT INTO join_test (key, value) SELECT 1, number FROM numbers(10_000); +-- Check that rows with the same key are not duplicated SELECT engine_full, total_rows, total_bytes == (SELECT total_bytes FROM one_row_table_size) FROM system.tables WHERE (name = 'join_test') AND (database = currentDatabase()); + +-- For RIGHT join we save all rows from the right table +CREATE TABLE join_test_right ( `key` UInt64, `value` UInt64 ) ENGINE = Join(ANY, RIGHT, key); + +INSERT INTO join_test_right (key, value) SELECT 1, number FROM numbers(1); +INSERT INTO join_test_right (key, value) SELECT 1, number FROM numbers(1); +INSERT INTO join_test_right (key, value) SELECT 1, number FROM numbers(1); +SELECT count() == 3 FROM (SELECT 1 as key) t1 ANY RIGHT JOIN join_test_right ON t1.key = join_test_right.key; +INSERT INTO join_test_right (key, value) SELECT 1, number FROM numbers(7); +SELECT count() == 10 FROM (SELECT 1 as key) t1 ANY RIGHT JOIN join_test_right ON t1.key = join_test_right.key; +SELECT count() == 10 FROM (SELECT 2 as key) t1 ANY RIGHT JOIN join_test_right ON t1.key = join_test_right.key; + +DROP TABLE IF EXISTS join_test; +DROP TABLE IF EXISTS join_test_right; From 4d65be4dbc30aaa764ddd8c888dca79fdc60bf07 Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Thu, 1 Jun 2023 18:34:35 +0200 Subject: [PATCH 0238/1072] Ignore QEMU logging regarding IFA --- .../0_stateless/01103_check_cpu_instructions_at_startup.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/01103_check_cpu_instructions_at_startup.sh b/tests/queries/0_stateless/01103_check_cpu_instructions_at_startup.sh index 9b6e1e05f2d..01047aeb9ab 100755 --- a/tests/queries/0_stateless/01103_check_cpu_instructions_at_startup.sh +++ b/tests/queries/0_stateless/01103_check_cpu_instructions_at_startup.sh @@ -19,7 +19,9 @@ fi function run_with_cpu() { - qemu-x86_64-static -cpu "$@" "$command" --query "SELECT 1" 2>&1 | grep -v -F "warning: TCG doesn't support requested feature" ||: + qemu-x86_64-static -cpu "$@" "$command" --query "SELECT 1" 2>&1 | \ + grep -v -F "warning: TCG doesn't support requested feature" | \ + grep -v -F 'Unknown host IFA type' ||: } run_with_cpu qemu64 From 02e986a9e7c1c33bf8818411de538c58af8a5198 Mon Sep 17 00:00:00 2001 From: Sema Checherinda Date: Thu, 1 Jun 2023 19:06:11 +0200 Subject: [PATCH 0239/1072] Revert "Revert "less logs in WriteBufferFromS3" (#50390)" This reverts commit 4d4112ff536f819514973dfd0cb8274cf044bb3e. --- src/IO/WriteBufferFromS3.cpp | 8 -------- src/IO/WriteBufferFromS3TaskTracker.cpp | 11 ----------- 2 files changed, 19 deletions(-) diff --git a/src/IO/WriteBufferFromS3.cpp b/src/IO/WriteBufferFromS3.cpp index 462cf2674c3..6992c3ea4ac 100644 --- a/src/IO/WriteBufferFromS3.cpp +++ b/src/IO/WriteBufferFromS3.cpp @@ -195,18 +195,14 @@ void WriteBufferFromS3::finalizeImpl() if (request_settings.check_objects_after_upload) { - LOG_TRACE(log, "Checking object {} exists after upload", key); S3::checkObjectExists(*client_ptr, bucket, key, {}, request_settings, /* for_disk_s3= */ write_settings.for_object_storage, "Immediately after upload"); - LOG_TRACE(log, "Checking object {} has size as expected {}", key, total_size); size_t actual_size = S3::getObjectSize(*client_ptr, bucket, key, {}, request_settings, /* for_disk_s3= */ write_settings.for_object_storage); if (actual_size != total_size) throw Exception( ErrorCodes::S3_ERROR, "Object {} from bucket {} has unexpected size {} after upload, expected size {}, it's a bug in S3 or S3 API.", key, bucket, actual_size, total_size); - - LOG_TRACE(log, "Object {} exists after upload", key); } } @@ -292,8 +288,6 @@ void WriteBufferFromS3::reallocateFirstBuffer() WriteBuffer::set(memory.data() + hidden_size, memory.size() - hidden_size); chassert(offset() == 0); - - LOG_TRACE(log, "Reallocated first buffer with size {}. {}", memory.size(), getLogDetails()); } void WriteBufferFromS3::detachBuffer() @@ -316,8 +310,6 @@ void WriteBufferFromS3::allocateFirstBuffer() const auto size = std::min(size_t(DBMS_DEFAULT_BUFFER_SIZE), max_first_buffer); memory = Memory(size); WriteBuffer::set(memory.data(), memory.size()); - - LOG_TRACE(log, "Allocated first buffer with size {}. {}", memory.size(), getLogDetails()); } void WriteBufferFromS3::allocateBuffer() diff --git a/src/IO/WriteBufferFromS3TaskTracker.cpp b/src/IO/WriteBufferFromS3TaskTracker.cpp index 7ae31044012..c10af5d0672 100644 --- a/src/IO/WriteBufferFromS3TaskTracker.cpp +++ b/src/IO/WriteBufferFromS3TaskTracker.cpp @@ -36,8 +36,6 @@ ThreadPoolCallbackRunner WriteBufferFromS3::TaskTracker::syncRunner() void WriteBufferFromS3::TaskTracker::waitAll() { - LOG_TEST(log, "waitAll, in queue {}", futures.size()); - /// Exceptions are propagated for (auto & future : futures) { @@ -51,8 +49,6 @@ void WriteBufferFromS3::TaskTracker::waitAll() void WriteBufferFromS3::TaskTracker::safeWaitAll() { - LOG_TEST(log, "safeWaitAll, wait in queue {}", futures.size()); - for (auto & future : futures) { if (future.valid()) @@ -76,7 +72,6 @@ void WriteBufferFromS3::TaskTracker::safeWaitAll() void WriteBufferFromS3::TaskTracker::waitIfAny() { - LOG_TEST(log, "waitIfAny, in queue {}", futures.size()); if (futures.empty()) return; @@ -101,8 +96,6 @@ void WriteBufferFromS3::TaskTracker::waitIfAny() watch.stop(); ProfileEvents::increment(ProfileEvents::WriteBufferFromS3WaitInflightLimitMicroseconds, watch.elapsedMicroseconds()); - - LOG_TEST(log, "waitIfAny ended, in queue {}", futures.size()); } void WriteBufferFromS3::TaskTracker::add(Callback && func) @@ -147,8 +140,6 @@ void WriteBufferFromS3::TaskTracker::waitTilInflightShrink() if (!max_tasks_inflight) return; - LOG_TEST(log, "waitTilInflightShrink, in queue {}", futures.size()); - Stopwatch watch; /// Alternative approach is to wait until at least futures.size() - max_tasks_inflight element are finished @@ -171,8 +162,6 @@ void WriteBufferFromS3::TaskTracker::waitTilInflightShrink() watch.stop(); ProfileEvents::increment(ProfileEvents::WriteBufferFromS3WaitInflightLimitMicroseconds, watch.elapsedMicroseconds()); - - LOG_TEST(log, "waitTilInflightShrink ended, in queue {}", futures.size()); } bool WriteBufferFromS3::TaskTracker::isAsync() const From bd047ed9e0b580fc417c56239fa93340f6089388 Mon Sep 17 00:00:00 2001 From: Dan Roscigno Date: Thu, 1 Jun 2023 14:16:49 -0400 Subject: [PATCH 0240/1072] Update order-by.md add a note that ORDER BY sorting is case sensitive. closes https://github.com/ClickHouse/clickhouse-docs/issues/81 --- docs/en/sql-reference/statements/select/order-by.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/statements/select/order-by.md b/docs/en/sql-reference/statements/select/order-by.md index 712395a0357..1da6c1d8468 100644 --- a/docs/en/sql-reference/statements/select/order-by.md +++ b/docs/en/sql-reference/statements/select/order-by.md @@ -5,7 +5,7 @@ sidebar_label: ORDER BY # ORDER BY Clause -The `ORDER BY` clause contains a list of expressions, which can each be attributed with `DESC` (descending) or `ASC` (ascending) modifier which determine the sorting direction. If the direction is not specified, `ASC` is assumed, so it’s usually omitted. The sorting direction applies to a single expression, not to the entire list. Example: `ORDER BY Visits DESC, SearchPhrase`. +The `ORDER BY` clause contains a list of expressions, which can each be attributed with `DESC` (descending) or `ASC` (ascending) modifier which determine the sorting direction. If the direction is not specified, `ASC` is assumed, so it’s usually omitted. The sorting direction applies to a single expression, not to the entire list. Example: `ORDER BY Visits DESC, SearchPhrase`. Sorting is case-sensitive. If you want to sort by column numbers instead of column names, enable the setting [enable_positional_arguments](../../../operations/settings/settings.md#enable-positional-arguments). From 985cd8fc8a0fdbe09a95132f1fb549825ece636f Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Wed, 26 Apr 2023 21:20:19 +0200 Subject: [PATCH 0241/1072] Improve events logging --- tests/ci/terminate_runner_lambda/app.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/ci/terminate_runner_lambda/app.py b/tests/ci/terminate_runner_lambda/app.py index c9192417575..5a78c8c8e39 100644 --- a/tests/ci/terminate_runner_lambda/app.py +++ b/tests/ci/terminate_runner_lambda/app.py @@ -160,7 +160,7 @@ def get_candidates_to_be_killed(event_data: dict) -> Dict[str, List[str]]: def main(access_token: str, event: dict) -> Dict[str, List[str]]: - print("Got event", json.dumps(event, sort_keys=True, indent=4)) + print("Got event", json.dumps(event, sort_keys=True).replace("\n", "")) to_kill_by_zone = how_many_instances_to_kill(event) instances_by_zone = get_candidates_to_be_killed(event) @@ -177,7 +177,8 @@ def main(access_token: str, event: dict) -> Dict[str, List[str]]: total_to_kill += num_to_kill if num_to_kill > len(candidates): raise Exception( - f"Required to kill {num_to_kill}, but have only {len(candidates)} candidates in AV {zone}" + f"Required to kill {num_to_kill}, but have only {len(candidates)}" + f" candidates in AV {zone}" ) delete_for_av = [] # type: RunnerDescriptions @@ -207,7 +208,8 @@ def main(access_token: str, event: dict) -> Dict[str, List[str]]: if len(delete_for_av) < num_to_kill: print( - f"Checked all candidates for av {zone}, get to delete {len(delete_for_av)}, but still cannot get required {num_to_kill}" + f"Checked all candidates for av {zone}, get to delete " + f"{len(delete_for_av)}, but still cannot get required {num_to_kill}" ) instances_to_kill += [runner.name for runner in delete_for_av] From 7bf9089dcd34e0c29b1f951066136fec1d990372 Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Thu, 27 Apr 2023 12:00:22 +0200 Subject: [PATCH 0242/1072] Increase access_token cached time --- tests/ci/terminate_runner_lambda/app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ci/terminate_runner_lambda/app.py b/tests/ci/terminate_runner_lambda/app.py index 5a78c8c8e39..390375a34e4 100644 --- a/tests/ci/terminate_runner_lambda/app.py +++ b/tests/ci/terminate_runner_lambda/app.py @@ -64,7 +64,7 @@ cached_token = CachedToken(0, "") def get_cached_access_token() -> str: - if time.time() - 500 < cached_token.time: + if time.time() - 550 < cached_token.time: return cached_token.value private_key, app_id = get_key_and_app_from_aws() payload = { From bf9b563e0b2c3b1933405aef34d904afc7ade57f Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Thu, 27 Apr 2023 16:54:37 +0200 Subject: [PATCH 0243/1072] Improve caching mechanism for token, add cached instances --- tests/ci/terminate_runner_lambda/app.py | 40 ++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/tests/ci/terminate_runner_lambda/app.py b/tests/ci/terminate_runner_lambda/app.py index 390375a34e4..bf883880c8d 100644 --- a/tests/ci/terminate_runner_lambda/app.py +++ b/tests/ci/terminate_runner_lambda/app.py @@ -58,14 +58,19 @@ def get_access_token(jwt_token: str, installation_id: int) -> str: class CachedToken: time: int value: str + updating: bool = False cached_token = CachedToken(0, "") def get_cached_access_token() -> str: - if time.time() - 550 < cached_token.time: + if time.time() - 550 < cached_token.time or cached_token.updating: return cached_token.value + # Indicate that the value is updating now, so the cached value can be + # used. The first setting and close-to-ttl are not counted as update + if cached_token.time != 0 or time.time() - 590 < cached_token.time: + cached_token.updating = True private_key, app_id = get_key_and_app_from_aws() payload = { "iat": int(time.time()) - 60, @@ -77,9 +82,42 @@ def get_cached_access_token() -> str: installation_id = get_installation_id(encoded_jwt) cached_token.time = int(time.time()) cached_token.value = get_access_token(encoded_jwt, installation_id) + cached_token.updating = False return cached_token.value +@dataclass +class CachedInstances: + time: int + value: dict + updating: bool = False + + +cached_instances = CachedInstances(0, {}) + + +def get_cached_instances() -> dict: + """return cached instances description with updating it once per five minutes""" + if time.time() - 250 < cached_instances.time or cached_instances.updating: + return cached_instances.value + # Indicate that the value is updating now, so the cached value can be + # used. The first setting and close-to-ttl are not counted as update + if cached_instances.time != 0 or time.time() - 300 < cached_instances.time: + cached_instances.updating = True + ec2_client = boto3.client("ec2") + instances_response = ec2_client.describe_instances( + Filters=[{"Name": "instance-state-name", "Values": ["running"]}] + ) + cached_instances.time = int(time.time()) + cached_instances.value = { + instance["InstanceId"]: instance + for reservation in instances_response["Reservations"] + for instance in reservation["Instances"] + } + cached_instances.updating = False + return cached_instances.value + + RunnerDescription = namedtuple( "RunnerDescription", ["id", "name", "tags", "offline", "busy"] ) From 855afb56f913d3d7ba6dcc7f992b59cfaf5cb02e Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Thu, 27 Apr 2023 16:56:17 +0200 Subject: [PATCH 0244/1072] Get instances for the region to not kill a fresh one --- tests/ci/terminate_runner_lambda/app.py | 53 ++++++++++++++++++------- 1 file changed, 39 insertions(+), 14 deletions(-) diff --git a/tests/ci/terminate_runner_lambda/app.py b/tests/ci/terminate_runner_lambda/app.py index bf883880c8d..e0164bc58c0 100644 --- a/tests/ci/terminate_runner_lambda/app.py +++ b/tests/ci/terminate_runner_lambda/app.py @@ -198,11 +198,37 @@ def get_candidates_to_be_killed(event_data: dict) -> Dict[str, List[str]]: def main(access_token: str, event: dict) -> Dict[str, List[str]]: + start = time.time() print("Got event", json.dumps(event, sort_keys=True).replace("\n", "")) to_kill_by_zone = how_many_instances_to_kill(event) instances_by_zone = get_candidates_to_be_killed(event) + # Getting ASG and instances' descriptions from the API + # We don't kill instances that alive for less than 10 minutes, since they + # could be not in the GH active runners yet + print(f"Check other hosts from the same ASG {event['AutoScalingGroupName']}") + asg_client = boto3.client("autoscaling") + as_groups_response = asg_client.describe_auto_scaling_groups( + AutoScalingGroupNames=[event["AutoScalingGroupName"]] + ) + assert len(as_groups_response["AutoScalingGroups"]) == 1 + asg = as_groups_response["AutoScalingGroups"][0] + asg_instance_ids = [instance["InstanceId"] for instance in asg["Instances"]] + instance_descriptions = get_cached_instances() + # The instances launched less than 10 minutes ago + immune_ids = [ + instance["InstanceId"] + for instance in instance_descriptions.values() + if start - instance["LaunchTime"].timestamp() < 600 + ] + # if the ASG's instance ID not in instance_descriptions, it's most probably + # is not cached yet, so we must mark it as immuned + immune_ids.extend( + iid for iid in asg_instance_ids if iid not in instance_descriptions + ) + print("Time spent on the requests to AWS: ", time.time() - start) runners = list_runners(access_token) + runner_ids = set(runner.name for runner in runners) # We used to delete potential hosts to terminate from GitHub runners pool, # but the documentation states: # --- Returning an instance first in the response data does not guarantee its termination @@ -221,13 +247,17 @@ def main(access_token: str, event: dict) -> Dict[str, List[str]]: delete_for_av = [] # type: RunnerDescriptions for candidate in candidates: - if candidate not in set(runner.name for runner in runners): + if candidate in immune_ids: + print( + f"Candidate {candidate} started less than 10 minutes ago, won't touch a child" + ) + break + if candidate not in runner_ids: print( f"Candidate {candidate} was not in runners list, simply delete it" ) instances_to_kill.append(candidate) - - for candidate in candidates: + break if len(delete_for_av) + len(instances_to_kill) == num_to_kill: break if candidate in instances_to_kill: @@ -253,16 +283,11 @@ def main(access_token: str, event: dict) -> Dict[str, List[str]]: instances_to_kill += [runner.name for runner in delete_for_av] if len(instances_to_kill) < total_to_kill: - print(f"Check other hosts from the same ASG {event['AutoScalingGroupName']}") - client = boto3.client("autoscaling") - as_groups = client.describe_auto_scaling_groups( - AutoScalingGroupNames=[event["AutoScalingGroupName"]] - ) - assert len(as_groups["AutoScalingGroups"]) == 1 - asg = as_groups["AutoScalingGroups"][0] - for instance in asg["Instances"]: + for instance in asg_instance_ids: + if instance in immune_ids: + continue for runner in runners: - if runner.name == instance["InstanceId"] and not runner.busy: + if runner.name == instance and not runner.busy: print(f"Runner {runner.name} is not busy and can be deleted") instances_to_kill.append(runner.name) @@ -270,9 +295,9 @@ def main(access_token: str, event: dict) -> Dict[str, List[str]]: print("Got enough instances to kill") break - print("Got instances to kill: ", ", ".join(instances_to_kill)) response = {"InstanceIDs": instances_to_kill} - print(response) + print("Got instances to kill: ", response) + print("Time spent on the request: ", time.time() - start) return response From db029384110be42a55e8420e6a13091cd2dca164 Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Thu, 4 May 2023 12:20:57 +0200 Subject: [PATCH 0245/1072] Do not count unfinished tasks with conclusion=None --- tests/ci/workflow_jobs_lambda/app.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/ci/workflow_jobs_lambda/app.py b/tests/ci/workflow_jobs_lambda/app.py index bc8e1212be5..c4ce68c3f8e 100644 --- a/tests/ci/workflow_jobs_lambda/app.py +++ b/tests/ci/workflow_jobs_lambda/app.py @@ -257,6 +257,7 @@ def handler(event: dict, context: Any) -> dict: else: event_data = json.loads(event["body"]) + logging.info("Got the next raw event from the github hook: %s", event_data) repo = event_data["repository"] try: wf_job = event_data["workflow_job"] @@ -265,6 +266,9 @@ def handler(event: dict, context: Any) -> dict: logging.error("The event data: %s", event) logging.error("The context data: %s", context) + # We record only finished steps + steps = len([step for step in wf_job["steps"] if step["conclusion"] is not None]) + workflow_job = WorkflowJob( wf_job["id"], wf_job["run_id"], @@ -281,7 +285,7 @@ def handler(event: dict, context: Any) -> dict: wf_job["started_at"], wf_job["completed_at"] or "1970-01-01T00:00:00", # nullable date wf_job["name"], - len(wf_job["steps"]), + steps, wf_job["check_run_url"], wf_job["labels"], wf_job["runner_id"] or 0, # nullable From 27941b4d2603a1308978bdc32c6db4d6ed0da7ef Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Mon, 22 May 2023 17:57:12 +0200 Subject: [PATCH 0246/1072] Decrease the time window for autoscale_runners_lambda --- tests/ci/autoscale_runners_lambda/app.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/ci/autoscale_runners_lambda/app.py b/tests/ci/autoscale_runners_lambda/app.py index cbc9f4f8901..3fbab0d13dc 100644 --- a/tests/ci/autoscale_runners_lambda/app.py +++ b/tests/ci/autoscale_runners_lambda/app.py @@ -22,10 +22,13 @@ RUNNER_TYPE_LABELS = [ "style-checker-aarch64", ] +### Update comment on the change ### # 4 HOUR - is a balance to get the most precise values # - Our longest possible running check is around 5h on the worst scenario # - The long queue won't be wiped out and replaced, so the measurmenet is fine # - If the data is spoiled by something, we are from the bills perspective +# Changed it to 3 HOUR: in average we have 1h tasks, but p90 is around 2h. +# With 4h we have too much wasted computing time in case of issues with DB QUEUE_QUERY = f"""SELECT last_status AS status, toUInt32(count()) AS length, @@ -40,7 +43,7 @@ FROM FROM default.workflow_jobs WHERE has(labels, 'self-hosted') AND hasAny({RUNNER_TYPE_LABELS}, labels) - AND started_at > now() - INTERVAL 4 HOUR + AND started_at > now() - INTERVAL 3 HOUR GROUP BY ALL HAVING last_status IN ('in_progress', 'queued') ) From 484c91c47e3aa8365eec5ae29d03fb66a8372bb0 Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Mon, 22 May 2023 20:39:22 +0200 Subject: [PATCH 0247/1072] Add DRY_RUN and configurable PY_VERSION to lambda deployment --- .../ci/team_keys_lambda/build_and_deploy_archive.sh | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tests/ci/team_keys_lambda/build_and_deploy_archive.sh b/tests/ci/team_keys_lambda/build_and_deploy_archive.sh index 4aee85c588a..f35d6456cd3 100644 --- a/tests/ci/team_keys_lambda/build_and_deploy_archive.sh +++ b/tests/ci/team_keys_lambda/build_and_deploy_archive.sh @@ -5,11 +5,17 @@ WORKDIR=$(dirname "$0") WORKDIR=$(readlink -f "${WORKDIR}") cd "$WORKDIR" -PY_VERSION=3.10 +# Do not deploy the lambda to AWS +DRY_RUN=${DRY_RUN:-} +# Python runtime to install dependencies +PY_VERSION=${PY_VERSION:-3.10} PY_EXEC="python${PY_VERSION}" +# Image to build the lambda zip package DOCKER_IMAGE="python:${PY_VERSION}-slim" LAMBDA_NAME=$(basename "$WORKDIR") +# Rename the_lambda_name directory to the-lambda-name lambda in AWS LAMBDA_NAME=${LAMBDA_NAME//_/-} +# The name of directory with lambda code PACKAGE=lambda-package rm -rf "$PACKAGE" "$PACKAGE".zip mkdir "$PACKAGE" @@ -28,4 +34,6 @@ if [ -f requirements.txt ]; then fi ( cd "$PACKAGE" && zip -9 -r ../"$PACKAGE".zip . ) -aws lambda update-function-code --function-name "$LAMBDA_NAME" --zip-file fileb://"$PACKAGE".zip +if [ -z "$DRY_RUN" ]; then + aws lambda update-function-code --function-name "$LAMBDA_NAME" --zip-file fileb://"$PACKAGE".zip +fi From 7f08f218d9ee71ba52d011cabce2faea7492c5ca Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Mon, 22 May 2023 23:07:35 +0200 Subject: [PATCH 0248/1072] Create lambda_shared package for lambdas --- .github/workflows/pull_request.yml | 5 ++ tests/ci/autoscale_runners_lambda/app.py | 77 +++---------------- .../autoscale_runners_lambda_test.py | 2 +- .../ci/autoscale_runners_lambda/lambda_shared | 1 + .../autoscale_runners_lambda/requirements.txt | 2 +- tests/ci/lambda_shared_package/.gitignore | 2 + .../lambda_shared/__init__.py | 74 ++++++++++++++++++ tests/ci/lambda_shared_package/pyproject.toml | 13 ++++ tests/ci/lambda_shared_package/setup.cfg | 8 ++ .../build_and_deploy_archive.sh | 11 +-- 10 files changed, 121 insertions(+), 74 deletions(-) rename tests/ci/{ => autoscale_runners_lambda}/autoscale_runners_lambda_test.py (98%) create mode 120000 tests/ci/autoscale_runners_lambda/lambda_shared create mode 100644 tests/ci/lambda_shared_package/.gitignore create mode 100644 tests/ci/lambda_shared_package/lambda_shared/__init__.py create mode 100644 tests/ci/lambda_shared_package/pyproject.toml create mode 100644 tests/ci/lambda_shared_package/setup.cfg diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml index 506ed451b6d..afc08f3e637 100644 --- a/.github/workflows/pull_request.yml +++ b/.github/workflows/pull_request.yml @@ -46,7 +46,12 @@ jobs: - name: Python unit tests run: | cd "$GITHUB_WORKSPACE/tests/ci" + echo "Testing the main ci directory" python3 -m unittest discover -s . -p '*_test.py' + for dir in *_lambda/; do + echo "Testing $dir" + python3 -m unittest discover -s "$dir" -p '*_test.py' + done DockerHubPushAarch64: needs: CheckLabels runs-on: [self-hosted, style-checker-aarch64] diff --git a/tests/ci/autoscale_runners_lambda/app.py b/tests/ci/autoscale_runners_lambda/app.py index 3fbab0d13dc..ab09afb3aa8 100644 --- a/tests/ci/autoscale_runners_lambda/app.py +++ b/tests/ci/autoscale_runners_lambda/app.py @@ -2,25 +2,19 @@ """The lambda to decrease/increase ASG desired capacity based on current queue""" -import json import logging -import time from dataclasses import dataclass from pprint import pformat from typing import Any, List, Literal, Optional, Tuple import boto3 # type: ignore -import requests # type: ignore -RUNNER_TYPE_LABELS = [ - "builder", - "func-tester", - "func-tester-aarch64", - "fuzzer-unit-tester", - "stress-tester", - "style-checker", - "style-checker-aarch64", -] +from lambda_shared import ( + CHException, + ClickHouseHelper, + RUNNER_TYPE_LABELS, + get_parameter_from_ssm, +) ### Update comment on the change ### # 4 HOUR - is a balance to get the most precise values @@ -74,61 +68,7 @@ def get_scales(runner_type: str) -> Tuple[int, int]: return scale_down, scale_up -### VENDORING -def get_parameter_from_ssm(name, decrypt=True, client=None): - if not client: - client = boto3.client("ssm", region_name="us-east-1") - return client.get_parameter(Name=name, WithDecryption=decrypt)["Parameter"]["Value"] - - -class CHException(Exception): - pass - - -class ClickHouseHelper: - def __init__( - self, - url: Optional[str] = None, - user: Optional[str] = None, - password: Optional[str] = None, - ): - self.url = url - self.auth = {} - if user: - self.auth["X-ClickHouse-User"] = user - if password: - self.auth["X-ClickHouse-Key"] = password - - def _select_and_get_json_each_row(self, db, query): - params = { - "database": db, - "query": query, - "default_format": "JSONEachRow", - } - for i in range(5): - response = None - try: - response = requests.get(self.url, params=params, headers=self.auth) - response.raise_for_status() - return response.text - except Exception as ex: - logging.warning("Cannot fetch data with exception %s", str(ex)) - if response: - logging.warning("Reponse text %s", response.text) - time.sleep(0.1 * i) - - raise CHException("Cannot fetch data from clickhouse") - - def select_json_each_row(self, db, query): - text = self._select_and_get_json_each_row(db, query) - result = [] - for line in text.split("\n"): - if line: - result.append(json.loads(line)) - return result - - -CH_CLIENT = ClickHouseHelper(get_parameter_from_ssm("clickhouse-test-stat-url"), "play") +CH_CLIENT = None # type: Optional[ClickHouseHelper] def set_capacity( @@ -222,6 +162,9 @@ def main(dry_run: bool = True) -> None: asg_client = boto3.client("autoscaling") try: global CH_CLIENT + CH_CLIENT = CH_CLIENT or ClickHouseHelper( + get_parameter_from_ssm("clickhouse-test-stat-url"), "play" + ) queues = CH_CLIENT.select_json_each_row("default", QUEUE_QUERY) except CHException as ex: logging.exception( diff --git a/tests/ci/autoscale_runners_lambda_test.py b/tests/ci/autoscale_runners_lambda/autoscale_runners_lambda_test.py similarity index 98% rename from tests/ci/autoscale_runners_lambda_test.py rename to tests/ci/autoscale_runners_lambda/autoscale_runners_lambda_test.py index 8e3828f51c0..6772e33374c 100644 --- a/tests/ci/autoscale_runners_lambda_test.py +++ b/tests/ci/autoscale_runners_lambda/autoscale_runners_lambda_test.py @@ -4,7 +4,7 @@ import unittest from dataclasses import dataclass from typing import Any, List -from autoscale_runners_lambda.app import set_capacity, Queue +from app import set_capacity, Queue @dataclass diff --git a/tests/ci/autoscale_runners_lambda/lambda_shared b/tests/ci/autoscale_runners_lambda/lambda_shared new file mode 120000 index 00000000000..ba86e090f6c --- /dev/null +++ b/tests/ci/autoscale_runners_lambda/lambda_shared @@ -0,0 +1 @@ +../lambda_shared_package/lambda_shared \ No newline at end of file diff --git a/tests/ci/autoscale_runners_lambda/requirements.txt b/tests/ci/autoscale_runners_lambda/requirements.txt index 3bcbe2dfd07..098e04a9798 100644 --- a/tests/ci/autoscale_runners_lambda/requirements.txt +++ b/tests/ci/autoscale_runners_lambda/requirements.txt @@ -1 +1 @@ -requests<2.30 +../lambda_shared_package diff --git a/tests/ci/lambda_shared_package/.gitignore b/tests/ci/lambda_shared_package/.gitignore new file mode 100644 index 00000000000..59d52651e06 --- /dev/null +++ b/tests/ci/lambda_shared_package/.gitignore @@ -0,0 +1,2 @@ +build +*.egg-info diff --git a/tests/ci/lambda_shared_package/lambda_shared/__init__.py b/tests/ci/lambda_shared_package/lambda_shared/__init__.py new file mode 100644 index 00000000000..c5ae4df9e17 --- /dev/null +++ b/tests/ci/lambda_shared_package/lambda_shared/__init__.py @@ -0,0 +1,74 @@ +"""The shared code and types for all our CI lambdas +It exists as __init__.py and lambda_shared/__init__.py to work both in local and venv""" + +import json +import logging +import time +from typing import List, Optional + +import boto3 # type: ignore +import requests # type: ignore + +RUNNER_TYPE_LABELS = [ + "builder", + "func-tester", + "func-tester-aarch64", + "fuzzer-unit-tester", + "stress-tester", + "style-checker", + "style-checker-aarch64", +] + + +### VENDORING +def get_parameter_from_ssm(name, decrypt=True, client=None): + if not client: + client = boto3.client("ssm", region_name="us-east-1") + return client.get_parameter(Name=name, WithDecryption=decrypt)["Parameter"]["Value"] + + +class CHException(Exception): + pass + + +class ClickHouseHelper: + def __init__( + self, + url: Optional[str] = None, + user: Optional[str] = None, + password: Optional[str] = None, + ): + self.url = url + self.auth = {} + if user: + self.auth["X-ClickHouse-User"] = user + if password: + self.auth["X-ClickHouse-Key"] = password + + def _select_and_get_json_each_row(self, db: str, query: str) -> str: + params = { + "database": db, + "query": query, + "default_format": "JSONEachRow", + } + for i in range(5): + response = None + try: + response = requests.get(self.url, params=params, headers=self.auth) + response.raise_for_status() + return response.text # type: ignore + except Exception as ex: + logging.warning("Cannot fetch data with exception %s", str(ex)) + if response: + logging.warning("Reponse text %s", response.text) + time.sleep(0.1 * i) + + raise CHException("Cannot fetch data from clickhouse") + + def select_json_each_row(self, db: str, query: str) -> List[dict]: # type: ignore + text = self._select_and_get_json_each_row(db, query) + result = [] + for line in text.split("\n"): + if line: + result.append(json.loads(line)) + return result diff --git a/tests/ci/lambda_shared_package/pyproject.toml b/tests/ci/lambda_shared_package/pyproject.toml new file mode 100644 index 00000000000..8b4b0a80948 --- /dev/null +++ b/tests/ci/lambda_shared_package/pyproject.toml @@ -0,0 +1,13 @@ +[build-system] +requires = ["setuptools"] +build-backend = "setuptools.build_meta" + +[project] +name = "lambda_shared" +version = "0.0.1" +dependencies = [ + "requests < 2.30", +] + +[tool.distutils.bdist_wheel] +universal = true diff --git a/tests/ci/lambda_shared_package/setup.cfg b/tests/ci/lambda_shared_package/setup.cfg new file mode 100644 index 00000000000..744280ae41b --- /dev/null +++ b/tests/ci/lambda_shared_package/setup.cfg @@ -0,0 +1,8 @@ +### This file exists for clear builds in docker ### +# without it the `build` directory wouldn't be # +# updated on the fly and will require manual clean # +[build] +build_base = /tmp/lambda_shared + +[egg_info] +egg_base = /tmp/ diff --git a/tests/ci/team_keys_lambda/build_and_deploy_archive.sh b/tests/ci/team_keys_lambda/build_and_deploy_archive.sh index f35d6456cd3..89a2d514965 100644 --- a/tests/ci/team_keys_lambda/build_and_deploy_archive.sh +++ b/tests/ci/team_keys_lambda/build_and_deploy_archive.sh @@ -3,6 +3,7 @@ set -xeo pipefail WORKDIR=$(dirname "$0") WORKDIR=$(readlink -f "${WORKDIR}") +DIR_NAME=$(basename "$WORKDIR") cd "$WORKDIR" # Do not deploy the lambda to AWS @@ -12,9 +13,8 @@ PY_VERSION=${PY_VERSION:-3.10} PY_EXEC="python${PY_VERSION}" # Image to build the lambda zip package DOCKER_IMAGE="python:${PY_VERSION}-slim" -LAMBDA_NAME=$(basename "$WORKDIR") # Rename the_lambda_name directory to the-lambda-name lambda in AWS -LAMBDA_NAME=${LAMBDA_NAME//_/-} +LAMBDA_NAME=${DIR_NAME//_/-} # The name of directory with lambda code PACKAGE=lambda-package rm -rf "$PACKAGE" "$PACKAGE".zip @@ -23,8 +23,9 @@ cp app.py "$PACKAGE" if [ -f requirements.txt ]; then VENV=lambda-venv rm -rf "$VENV" lambda-package.zip - docker run --rm --user="${UID}" --volume="${WORKDIR}:/lambda" --workdir="/lambda" "${DOCKER_IMAGE}" \ - /bin/bash -c " + docker run --rm --user="${UID}" -e HOME=/tmp \ + --volume="${WORKDIR}/..:/ci" --workdir="/ci/${DIR_NAME}" "${DOCKER_IMAGE}" \ + /bin/bash -exc " '$PY_EXEC' -m venv '$VENV' && source '$VENV/bin/activate' && pip install -r requirements.txt @@ -35,5 +36,5 @@ fi ( cd "$PACKAGE" && zip -9 -r ../"$PACKAGE".zip . ) if [ -z "$DRY_RUN" ]; then - aws lambda update-function-code --function-name "$LAMBDA_NAME" --zip-file fileb://"$PACKAGE".zip + aws lambda update-function-code --function-name "$LAMBDA_NAME" --zip-file fileb://"$WORKDIR/$PACKAGE".zip fi From 0fa6a8416148a9f47f8d5c4e6a6bdb67992d2cd4 Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Tue, 23 May 2023 12:55:23 +0200 Subject: [PATCH 0249/1072] Move the stuff related to runners to lambda_shared --- tests/ci/ci_runners_metrics_lambda/app.py | 68 ++----------------- .../ci_runners_metrics_lambda/lambda_shared | 1 + .../requirements.txt | 2 +- .../lambda_shared/__init__.py | 67 ++++++++++++++++-- tests/ci/terminate_runner_lambda/app.py | 55 +-------------- .../ci/terminate_runner_lambda/lambda_shared | 1 + .../terminate_runner_lambda/requirements.txt | 2 +- 7 files changed, 76 insertions(+), 120 deletions(-) create mode 120000 tests/ci/ci_runners_metrics_lambda/lambda_shared create mode 120000 tests/ci/terminate_runner_lambda/lambda_shared diff --git a/tests/ci/ci_runners_metrics_lambda/app.py b/tests/ci/ci_runners_metrics_lambda/app.py index 341e1b674ec..a12143752a1 100644 --- a/tests/ci/ci_runners_metrics_lambda/app.py +++ b/tests/ci/ci_runners_metrics_lambda/app.py @@ -10,7 +10,6 @@ import argparse import sys import json import time -from collections import namedtuple from datetime import datetime from typing import Dict, List, Tuple @@ -19,21 +18,14 @@ import requests # type: ignore import boto3 # type: ignore from botocore.exceptions import ClientError # type: ignore -UNIVERSAL_LABEL = "universal" -RUNNER_TYPE_LABELS = [ - "builder", - "func-tester", - "func-tester-aarch64", - "fuzzer-unit-tester", - "stress-tester", - "style-checker", - "style-checker-aarch64", -] - -RunnerDescription = namedtuple( - "RunnerDescription", ["id", "name", "tags", "offline", "busy"] +from lambda_shared import ( + RUNNER_TYPE_LABELS, + RunnerDescription, + RunnerDescriptions, + list_runners, ) -RunnerDescriptions = List[RunnerDescription] + +UNIVERSAL_LABEL = "universal" def get_dead_runners_in_ec2(runners: RunnerDescriptions) -> RunnerDescriptions: @@ -193,52 +185,6 @@ def get_access_token(jwt_token: str, installation_id: int) -> str: return data["token"] # type: ignore -def list_runners(access_token: str) -> RunnerDescriptions: - headers = { - "Authorization": f"token {access_token}", - "Accept": "application/vnd.github.v3+json", - } - per_page = 100 - response = requests.get( - f"https://api.github.com/orgs/ClickHouse/actions/runners?per_page={per_page}", - headers=headers, - ) - response.raise_for_status() - data = response.json() - total_runners = data["total_count"] - print("Expected total runners", total_runners) - runners = data["runners"] - - # round to 0 for 0, 1 for 1..100, but to 2 for 101..200 - total_pages = (total_runners - 1) // per_page + 1 - - print("Total pages", total_pages) - for i in range(2, total_pages + 1): - response = requests.get( - "https://api.github.com/orgs/ClickHouse/actions/runners" - f"?page={i}&per_page={per_page}", - headers=headers, - ) - response.raise_for_status() - data = response.json() - runners += data["runners"] - - print("Total runners", len(runners)) - result = [] - for runner in runners: - tags = [tag["name"] for tag in runner["labels"]] - desc = RunnerDescription( - id=runner["id"], - name=runner["name"], - tags=tags, - offline=runner["status"] == "offline", - busy=runner["busy"], - ) - result.append(desc) - - return result - - def group_runners_by_tag( listed_runners: RunnerDescriptions, ) -> Dict[str, RunnerDescriptions]: diff --git a/tests/ci/ci_runners_metrics_lambda/lambda_shared b/tests/ci/ci_runners_metrics_lambda/lambda_shared new file mode 120000 index 00000000000..ba86e090f6c --- /dev/null +++ b/tests/ci/ci_runners_metrics_lambda/lambda_shared @@ -0,0 +1 @@ +../lambda_shared_package/lambda_shared \ No newline at end of file diff --git a/tests/ci/ci_runners_metrics_lambda/requirements.txt b/tests/ci/ci_runners_metrics_lambda/requirements.txt index 98be09ab232..e99dee1743c 100644 --- a/tests/ci/ci_runners_metrics_lambda/requirements.txt +++ b/tests/ci/ci_runners_metrics_lambda/requirements.txt @@ -1,3 +1,3 @@ -requests<2.30 +../lambda_shared_package PyJWT cryptography<38 diff --git a/tests/ci/lambda_shared_package/lambda_shared/__init__.py b/tests/ci/lambda_shared_package/lambda_shared/__init__.py index c5ae4df9e17..fe52f98d5f6 100644 --- a/tests/ci/lambda_shared_package/lambda_shared/__init__.py +++ b/tests/ci/lambda_shared_package/lambda_shared/__init__.py @@ -4,7 +4,8 @@ It exists as __init__.py and lambda_shared/__init__.py to work both in local and import json import logging import time -from typing import List, Optional +from collections import namedtuple +from typing import Any, List, Optional import boto3 # type: ignore import requests # type: ignore @@ -21,10 +22,14 @@ RUNNER_TYPE_LABELS = [ ### VENDORING -def get_parameter_from_ssm(name, decrypt=True, client=None): +def get_parameter_from_ssm( + name: str, decrypt: bool = True, client: Optional[Any] = None +) -> str: if not client: client = boto3.client("ssm", region_name="us-east-1") - return client.get_parameter(Name=name, WithDecryption=decrypt)["Parameter"]["Value"] + return client.get_parameter(Name=name, WithDecryption=decrypt)[ # type: ignore + "Parameter" + ]["Value"] class CHException(Exception): @@ -65,10 +70,64 @@ class ClickHouseHelper: raise CHException("Cannot fetch data from clickhouse") - def select_json_each_row(self, db: str, query: str) -> List[dict]: # type: ignore + def select_json_each_row(self, db: str, query: str) -> List[dict]: text = self._select_and_get_json_each_row(db, query) result = [] for line in text.split("\n"): if line: result.append(json.loads(line)) return result + + +### Runners + +RunnerDescription = namedtuple( + "RunnerDescription", ["id", "name", "tags", "offline", "busy"] +) +RunnerDescriptions = List[RunnerDescription] + + +def list_runners(access_token: str) -> RunnerDescriptions: + headers = { + "Authorization": f"token {access_token}", + "Accept": "application/vnd.github.v3+json", + } + per_page = 100 + response = requests.get( + f"https://api.github.com/orgs/ClickHouse/actions/runners?per_page={per_page}", + headers=headers, + ) + response.raise_for_status() + data = response.json() + total_runners = data["total_count"] + print("Expected total runners", total_runners) + runners = data["runners"] + + # round to 0 for 0, 1 for 1..100, but to 2 for 101..200 + total_pages = (total_runners - 1) // per_page + 1 + + print("Total pages", total_pages) + for i in range(2, total_pages + 1): + response = requests.get( + "https://api.github.com/orgs/ClickHouse/actions/runners" + f"?page={i}&per_page={per_page}", + headers=headers, + ) + response.raise_for_status() + data = response.json() + runners += data["runners"] + + print("Total runners", len(runners)) + result = [] + for runner in runners: + tags = [tag["name"] for tag in runner["labels"]] + desc = RunnerDescription( + id=runner["id"], + name=runner["name"], + tags=tags, + offline=runner["status"] == "offline", + busy=runner["busy"], + ) + result.append(desc) + + return result diff --git a/tests/ci/terminate_runner_lambda/app.py b/tests/ci/terminate_runner_lambda/app.py index e0164bc58c0..5799a498d5a 100644 --- a/tests/ci/terminate_runner_lambda/app.py +++ b/tests/ci/terminate_runner_lambda/app.py @@ -4,7 +4,6 @@ import argparse import json import sys import time -from collections import namedtuple from dataclasses import dataclass from typing import Any, Dict, List, Tuple @@ -12,6 +11,8 @@ import boto3 # type: ignore import requests # type: ignore import jwt +from lambda_shared import RunnerDescriptions, list_runners + def get_key_and_app_from_aws() -> Tuple[str, int]: secret_name = "clickhouse_github_secret_key" @@ -118,58 +119,6 @@ def get_cached_instances() -> dict: return cached_instances.value -RunnerDescription = namedtuple( - "RunnerDescription", ["id", "name", "tags", "offline", "busy"] -) -RunnerDescriptions = List[RunnerDescription] - - -def list_runners(access_token: str) -> RunnerDescriptions: - headers = { - "Authorization": f"token {access_token}", - "Accept": "application/vnd.github.v3+json", - } - per_page = 100 - response = requests.get( - f"https://api.github.com/orgs/ClickHouse/actions/runners?per_page={per_page}", - headers=headers, - ) - response.raise_for_status() - data = response.json() - total_runners = data["total_count"] - print("Expected total runners", total_runners) - runners = data["runners"] - - # round to 0 for 0, 1 for 1..100, but to 2 for 101..200 - total_pages = (total_runners - 1) // per_page + 1 - - print("Total pages", total_pages) - for i in range(2, total_pages + 1): - response = requests.get( - "https://api.github.com/orgs/ClickHouse/actions/runners" - f"?page={i}&per_page={per_page}", - headers=headers, - ) - response.raise_for_status() - data = response.json() - runners += data["runners"] - - print("Total runners", len(runners)) - result = [] - for runner in runners: - tags = [tag["name"] for tag in runner["labels"]] - desc = RunnerDescription( - id=runner["id"], - name=runner["name"], - tags=tags, - offline=runner["status"] == "offline", - busy=runner["busy"], - ) - result.append(desc) - - return result - - def how_many_instances_to_kill(event_data: dict) -> Dict[str, int]: data_array = event_data["CapacityToTerminate"] to_kill_by_zone = {} # type: Dict[str, int] diff --git a/tests/ci/terminate_runner_lambda/lambda_shared b/tests/ci/terminate_runner_lambda/lambda_shared new file mode 120000 index 00000000000..ba86e090f6c --- /dev/null +++ b/tests/ci/terminate_runner_lambda/lambda_shared @@ -0,0 +1 @@ +../lambda_shared_package/lambda_shared \ No newline at end of file diff --git a/tests/ci/terminate_runner_lambda/requirements.txt b/tests/ci/terminate_runner_lambda/requirements.txt index 98be09ab232..e99dee1743c 100644 --- a/tests/ci/terminate_runner_lambda/requirements.txt +++ b/tests/ci/terminate_runner_lambda/requirements.txt @@ -1,3 +1,3 @@ -requests<2.30 +../lambda_shared_package PyJWT cryptography<38 From acb9531ebf829a5c11bbeff6661e8d2122334ee6 Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Tue, 23 May 2023 18:47:19 +0200 Subject: [PATCH 0250/1072] Unify and put GH access token to the lambda_shared --- .../cancel_and_rerun_workflow_lambda/app.py | 176 +---------------- .../lambda_shared | 1 + .../requirements.txt | 4 +- tests/ci/ci_runners_metrics_lambda/app.py | 70 ++----- .../requirements.txt | 3 +- tests/ci/lambda_shared_package/__init__.py | 0 .../lambda_shared_package/lambda_shared/pr.py | 184 ++++++++++++++++++ .../lambda_shared/token.py | 90 +++++++++ tests/ci/lambda_shared_package/pyproject.toml | 10 + tests/ci/run_check.py | 8 +- tests/ci/runner_token_rotation_lambda/app.py | 61 +----- .../lambda_shared | 1 + .../requirements.txt | 4 +- tests/ci/team_keys_lambda/app.py | 4 +- tests/ci/team_keys_lambda/lambda_shared | 1 + tests/ci/team_keys_lambda/requirements.txt | 2 +- tests/ci/terminate_runner_lambda/app.py | 92 +-------- .../terminate_runner_lambda/requirements.txt | 4 +- tests/ci/workflow_approve_rerun_lambda/app.py | 122 +----------- .../lambda_shared | 1 + .../requirements.txt | 4 +- tests/ci/workflow_jobs_lambda/lambda_shared | 1 + 22 files changed, 332 insertions(+), 511 deletions(-) create mode 120000 tests/ci/cancel_and_rerun_workflow_lambda/lambda_shared create mode 100644 tests/ci/lambda_shared_package/__init__.py create mode 100644 tests/ci/lambda_shared_package/lambda_shared/pr.py create mode 100644 tests/ci/lambda_shared_package/lambda_shared/token.py create mode 120000 tests/ci/runner_token_rotation_lambda/lambda_shared create mode 120000 tests/ci/team_keys_lambda/lambda_shared create mode 120000 tests/ci/workflow_approve_rerun_lambda/lambda_shared create mode 120000 tests/ci/workflow_jobs_lambda/lambda_shared diff --git a/tests/ci/cancel_and_rerun_workflow_lambda/app.py b/tests/ci/cancel_and_rerun_workflow_lambda/app.py index 54c87fbcfa5..250655ddeb2 100644 --- a/tests/ci/cancel_and_rerun_workflow_lambda/app.py +++ b/tests/ci/cancel_and_rerun_workflow_lambda/app.py @@ -9,9 +9,10 @@ import json import re import time -import jwt import requests # type: ignore -import boto3 # type: ignore + +from lambda_shared.pr import CATEGORY_TO_LABEL, check_pr_description +from lambda_shared.token import get_cached_access_token NEED_RERUN_ON_EDITED = { @@ -27,123 +28,6 @@ MAX_RETRY = 5 DEBUG_INFO = {} # type: Dict[str, Any] -# Descriptions are used in .github/PULL_REQUEST_TEMPLATE.md, keep comments there -# updated accordingly -# The following lists are append only, try to avoid editing them -# They still could be cleaned out after the decent time though. -LABELS = { - "pr-backward-incompatible": ["Backward Incompatible Change"], - "pr-bugfix": [ - "Bug Fix", - "Bug Fix (user-visible misbehavior in an official stable release)", - "Bug Fix (user-visible misbehaviour in official stable or prestable release)", - "Bug Fix (user-visible misbehavior in official stable or prestable release)", - ], - "pr-build": [ - "Build/Testing/Packaging Improvement", - "Build Improvement", - "Build/Testing Improvement", - "Build", - "Packaging Improvement", - ], - "pr-documentation": [ - "Documentation (changelog entry is not required)", - "Documentation", - ], - "pr-feature": ["New Feature"], - "pr-improvement": ["Improvement"], - "pr-not-for-changelog": [ - "Not for changelog (changelog entry is not required)", - "Not for changelog", - ], - "pr-performance": ["Performance Improvement"], -} - -CATEGORY_TO_LABEL = {c: lb for lb, categories in LABELS.items() for c in categories} - - -def check_pr_description(pr_body: str) -> Tuple[str, str]: - """The function checks the body to being properly formatted according to - .github/PULL_REQUEST_TEMPLATE.md, if the first returned string is not empty, - then there is an error.""" - lines = list(map(lambda x: x.strip(), pr_body.split("\n") if pr_body else [])) - lines = [re.sub(r"\s+", " ", line) for line in lines] - - # Check if body contains "Reverts ClickHouse/ClickHouse#36337" - if [ - True - for line in lines - if re.match(r"\AReverts {GITHUB_REPOSITORY}#[\d]+\Z", line) - ]: - return "", LABELS["pr-not-for-changelog"][0] - - category = "" - entry = "" - description_error = "" - - i = 0 - while i < len(lines): - if re.match(r"(?i)^[#>*_ ]*change\s*log\s*category", lines[i]): - i += 1 - if i >= len(lines): - break - # Can have one empty line between header and the category - # itself. Filter it out. - if not lines[i]: - i += 1 - if i >= len(lines): - break - category = re.sub(r"^[-*\s]*", "", lines[i]) - i += 1 - - # Should not have more than one category. Require empty line - # after the first found category. - if i >= len(lines): - break - if lines[i]: - second_category = re.sub(r"^[-*\s]*", "", lines[i]) - description_error = ( - "More than one changelog category specified: " - f"'{category}', '{second_category}'" - ) - return description_error, category - - elif re.match( - r"(?i)^[#>*_ ]*(short\s*description|change\s*log\s*entry)", lines[i] - ): - i += 1 - # Can have one empty line between header and the entry itself. - # Filter it out. - if i < len(lines) and not lines[i]: - i += 1 - # All following lines until empty one are the changelog entry. - entry_lines = [] - while i < len(lines) and lines[i]: - entry_lines.append(lines[i]) - i += 1 - entry = " ".join(entry_lines) - # Don't accept changelog entries like '...'. - entry = re.sub(r"[#>*_.\- ]", "", entry) - # Don't accept changelog entries like 'Close #12345'. - entry = re.sub(r"^[\w\-\s]{0,10}#?\d{5,6}\.?$", "", entry) - else: - i += 1 - - if not category: - description_error = "Changelog category is empty" - # Filter out the PR categories that are not for changelog. - elif re.match( - r"(?i)doc|((non|in|not|un)[-\s]*significant)|(not[ ]*for[ ]*changelog)", - category, - ): - pass # to not check the rest of the conditions - elif category not in CATEGORY_TO_LABEL: - description_error, category = f"Category '{category}' is not valid", "" - elif not entry: - description_error = f"Changelog entry required for category '{category}'" - - return description_error, category - class Worker(Thread): def __init__( @@ -166,58 +50,6 @@ class Worker(Thread): self.queue.task_done() -def get_installation_id(jwt_token): - headers = { - "Authorization": f"Bearer {jwt_token}", - "Accept": "application/vnd.github.v3+json", - } - response = requests.get("https://api.github.com/app/installations", headers=headers) - response.raise_for_status() - data = response.json() - for installation in data: - if installation["account"]["login"] == "ClickHouse": - installation_id = installation["id"] - return installation_id - - -def get_access_token(jwt_token, installation_id): - headers = { - "Authorization": f"Bearer {jwt_token}", - "Accept": "application/vnd.github.v3+json", - } - response = requests.post( - f"https://api.github.com/app/installations/{installation_id}/access_tokens", - headers=headers, - ) - response.raise_for_status() - data = response.json() - return data["token"] - - -def get_key_and_app_from_aws(): - secret_name = "clickhouse_github_secret_key" - session = boto3.session.Session() - client = session.client( - service_name="secretsmanager", - ) - get_secret_value_response = client.get_secret_value(SecretId=secret_name) - data = json.loads(get_secret_value_response["SecretString"]) - return data["clickhouse-app-key"], int(data["clickhouse-app-id"]) - - -def get_token_from_aws(): - private_key, app_id = get_key_and_app_from_aws() - payload = { - "iat": int(time.time()) - 60, - "exp": int(time.time()) + (10 * 60), - "iss": app_id, - } - - encoded_jwt = jwt.encode(payload, private_key, algorithm="RS256") - installation_id = get_installation_id(encoded_jwt) - return get_access_token(encoded_jwt, installation_id) - - def _exec_get_with_retry(url: str, token: str) -> dict: headers = {"Authorization": f"token {token}"} for i in range(MAX_RETRY): @@ -407,7 +239,7 @@ def exec_workflow_url(urls_to_post, token): def main(event): - token = get_token_from_aws() + token = get_cached_access_token() DEBUG_INFO["event"] = event if event["isBase64Encoded"]: event_data = json.loads(b64decode(event["body"])) diff --git a/tests/ci/cancel_and_rerun_workflow_lambda/lambda_shared b/tests/ci/cancel_and_rerun_workflow_lambda/lambda_shared new file mode 120000 index 00000000000..ba86e090f6c --- /dev/null +++ b/tests/ci/cancel_and_rerun_workflow_lambda/lambda_shared @@ -0,0 +1 @@ +../lambda_shared_package/lambda_shared \ No newline at end of file diff --git a/tests/ci/cancel_and_rerun_workflow_lambda/requirements.txt b/tests/ci/cancel_and_rerun_workflow_lambda/requirements.txt index 98be09ab232..4cb3fba0f7b 100644 --- a/tests/ci/cancel_and_rerun_workflow_lambda/requirements.txt +++ b/tests/ci/cancel_and_rerun_workflow_lambda/requirements.txt @@ -1,3 +1 @@ -requests<2.30 -PyJWT -cryptography<38 +../lambda_shared_package[token] diff --git a/tests/ci/ci_runners_metrics_lambda/app.py b/tests/ci/ci_runners_metrics_lambda/app.py index a12143752a1..dc128dea739 100644 --- a/tests/ci/ci_runners_metrics_lambda/app.py +++ b/tests/ci/ci_runners_metrics_lambda/app.py @@ -8,12 +8,9 @@ Lambda function to: import argparse import sys -import json -import time from datetime import datetime -from typing import Dict, List, Tuple +from typing import Dict, List -import jwt import requests # type: ignore import boto3 # type: ignore from botocore.exceptions import ClientError # type: ignore @@ -24,6 +21,11 @@ from lambda_shared import ( RunnerDescriptions, list_runners, ) +from lambda_shared.token import ( + get_cached_access_token, + get_key_and_app_from_aws, + get_access_token_by_key_app, +) UNIVERSAL_LABEL = "universal" @@ -139,50 +141,8 @@ def get_lost_ec2_instances(runners: RunnerDescriptions) -> List[dict]: return lost_instances -def get_key_and_app_from_aws() -> Tuple[str, int]: - secret_name = "clickhouse_github_secret_key" - session = boto3.session.Session() - client = session.client( - service_name="secretsmanager", - ) - get_secret_value_response = client.get_secret_value(SecretId=secret_name) - data = json.loads(get_secret_value_response["SecretString"]) - return data["clickhouse-app-key"], int(data["clickhouse-app-id"]) - - def handler(event, context): - private_key, app_id = get_key_and_app_from_aws() - main(private_key, app_id, True, True) - - -def get_installation_id(jwt_token: str) -> int: - headers = { - "Authorization": f"Bearer {jwt_token}", - "Accept": "application/vnd.github.v3+json", - } - response = requests.get("https://api.github.com/app/installations", headers=headers) - response.raise_for_status() - data = response.json() - for installation in data: - if installation["account"]["login"] == "ClickHouse": - installation_id = installation["id"] - break - - return installation_id # type: ignore - - -def get_access_token(jwt_token: str, installation_id: int) -> str: - headers = { - "Authorization": f"Bearer {jwt_token}", - "Accept": "application/vnd.github.v3+json", - } - response = requests.post( - f"https://api.github.com/app/installations/{installation_id}/access_tokens", - headers=headers, - ) - response.raise_for_status() - data = response.json() - return data["token"] # type: ignore + main(get_cached_access_token(), True, True) def group_runners_by_tag( @@ -273,20 +233,10 @@ def delete_runner(access_token: str, runner: RunnerDescription) -> bool: def main( - github_secret_key: str, - github_app_id: int, + access_token: str, push_to_cloudwatch: bool, delete_offline_runners: bool, ) -> None: - payload = { - "iat": int(time.time()) - 60, - "exp": int(time.time()) + (10 * 60), - "iss": github_app_id, - } - - encoded_jwt = jwt.encode(payload, github_secret_key, algorithm="RS256") - installation_id = get_installation_id(encoded_jwt) - access_token = get_access_token(encoded_jwt, installation_id) gh_runners = list_runners(access_token) grouped_runners = group_runners_by_tag(gh_runners) for group, group_runners in grouped_runners.items(): @@ -354,4 +304,6 @@ if __name__ == "__main__": print("Attempt to get key and id from AWS secret manager") private_key, args.app_id = get_key_and_app_from_aws() - main(private_key, args.app_id, args.push_to_cloudwatch, args.delete_offline) + token = get_access_token_by_key_app(private_key, args.app_id) + + main(token, args.push_to_cloudwatch, args.delete_offline) diff --git a/tests/ci/ci_runners_metrics_lambda/requirements.txt b/tests/ci/ci_runners_metrics_lambda/requirements.txt index e99dee1743c..e2b16067a93 100644 --- a/tests/ci/ci_runners_metrics_lambda/requirements.txt +++ b/tests/ci/ci_runners_metrics_lambda/requirements.txt @@ -1,3 +1,2 @@ ../lambda_shared_package -PyJWT -cryptography<38 +../lambda_shared_package[token] diff --git a/tests/ci/lambda_shared_package/__init__.py b/tests/ci/lambda_shared_package/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/ci/lambda_shared_package/lambda_shared/pr.py b/tests/ci/lambda_shared_package/lambda_shared/pr.py new file mode 100644 index 00000000000..ef47eacc082 --- /dev/null +++ b/tests/ci/lambda_shared_package/lambda_shared/pr.py @@ -0,0 +1,184 @@ +#!/usr/bin/env python + +import re +from typing import Tuple + +# Individual trusted contirbutors who are not in any trusted organization. +# Can be changed in runtime: we will append users that we learned to be in +# a trusted org, to save GitHub API calls. +TRUSTED_CONTRIBUTORS = { + e.lower() + for e in [ + "achimbab", + "adevyatova ", # DOCSUP + "Algunenano", # Raúl Marín, Tinybird + "amosbird", + "AnaUvarova", # DOCSUP + "anauvarova", # technical writer, Yandex + "annvsh", # technical writer, Yandex + "atereh", # DOCSUP + "azat", + "bharatnc", # Newbie, but already with many contributions. + "bobrik", # Seasoned contributor, CloudFlare + "BohuTANG", + "codyrobert", # Flickerbox engineer + "cwurm", # Employee + "damozhaeva", # DOCSUP + "den-crane", + "flickerbox-tom", # Flickerbox + "gyuton", # DOCSUP + "hagen1778", # Roman Khavronenko, seasoned contributor + "hczhcz", + "hexiaoting", # Seasoned contributor + "ildus", # adjust, ex-pgpro + "javisantana", # a Spanish ClickHouse enthusiast, ex-Carto + "ka1bi4", # DOCSUP + "kirillikoff", # DOCSUP + "kreuzerkrieg", + "lehasm", # DOCSUP + "michon470", # DOCSUP + "nikvas0", + "nvartolomei", + "olgarev", # DOCSUP + "otrazhenia", # Yandex docs contractor + "pdv-ru", # DOCSUP + "podshumok", # cmake expert from QRator Labs + "s-mx", # Maxim Sabyanin, former employee, present contributor + "sevirov", # technical writer, Yandex + "spongedu", # Seasoned contributor + "taiyang-li", + "ucasFL", # Amos Bird's friend + "vdimir", # Employee + "vzakaznikov", + "YiuRULE", + "zlobober", # Developer of YT + "ilejn", # Arenadata, responsible for Kerberized Kafka + "thomoco", # ClickHouse + "BoloniniD", # Seasoned contributor, HSE + "tonickkozlov", # Cloudflare + "tylerhannan", # ClickHouse Employee + "myrrc", # Mike Kot, DoubleCloud + "thevar1able", # ClickHouse Employee + "aalexfvk", + "MikhailBurdukov", + "tsolodov", # ClickHouse Employee + "kitaisreal", + ] +} + +# Descriptions are used in .github/PULL_REQUEST_TEMPLATE.md, keep comments there +# updated accordingly +# The following lists are append only, try to avoid editing them +# They still could be cleaned out after the decent time though. +LABELS = { + "pr-backward-incompatible": ["Backward Incompatible Change"], + "pr-bugfix": [ + "Bug Fix", + "Bug Fix (user-visible misbehavior in an official stable release)", + "Bug Fix (user-visible misbehaviour in official stable or prestable release)", + "Bug Fix (user-visible misbehavior in official stable or prestable release)", + ], + "pr-build": [ + "Build/Testing/Packaging Improvement", + "Build Improvement", + "Build/Testing Improvement", + "Build", + "Packaging Improvement", + ], + "pr-documentation": [ + "Documentation (changelog entry is not required)", + "Documentation", + ], + "pr-feature": ["New Feature"], + "pr-improvement": ["Improvement"], + "pr-not-for-changelog": [ + "Not for changelog (changelog entry is not required)", + "Not for changelog", + ], + "pr-performance": ["Performance Improvement"], +} + +CATEGORY_TO_LABEL = {c: lb for lb, categories in LABELS.items() for c in categories} + + +def check_pr_description(pr_body: str) -> Tuple[str, str]: + """The function checks the body to being properly formatted according to + .github/PULL_REQUEST_TEMPLATE.md, if the first returned string is not empty, + then there is an error.""" + lines = list(map(lambda x: x.strip(), pr_body.split("\n") if pr_body else [])) + lines = [re.sub(r"\s+", " ", line) for line in lines] + + # Check if body contains "Reverts ClickHouse/ClickHouse#36337" + if [ + True + for line in lines + if re.match(r"\AReverts {GITHUB_REPOSITORY}#[\d]+\Z", line) + ]: + return "", LABELS["pr-not-for-changelog"][0] + + category = "" + entry = "" + description_error = "" + + i = 0 + while i < len(lines): + if re.match(r"(?i)^[#>*_ ]*change\s*log\s*category", lines[i]): + i += 1 + if i >= len(lines): + break + # Can have one empty line between header and the category + # itself. Filter it out. + if not lines[i]: + i += 1 + if i >= len(lines): + break + category = re.sub(r"^[-*\s]*", "", lines[i]) + i += 1 + + # Should not have more than one category. Require empty line + # after the first found category. + if i >= len(lines): + break + if lines[i]: + second_category = re.sub(r"^[-*\s]*", "", lines[i]) + description_error = ( + "More than one changelog category specified: " + f"'{category}', '{second_category}'" + ) + return description_error, category + + elif re.match( + r"(?i)^[#>*_ ]*(short\s*description|change\s*log\s*entry)", lines[i] + ): + i += 1 + # Can have one empty line between header and the entry itself. + # Filter it out. + if i < len(lines) and not lines[i]: + i += 1 + # All following lines until empty one are the changelog entry. + entry_lines = [] + while i < len(lines) and lines[i]: + entry_lines.append(lines[i]) + i += 1 + entry = " ".join(entry_lines) + # Don't accept changelog entries like '...'. + entry = re.sub(r"[#>*_.\- ]", "", entry) + # Don't accept changelog entries like 'Close #12345'. + entry = re.sub(r"^[\w\-\s]{0,10}#?\d{5,6}\.?$", "", entry) + else: + i += 1 + + if not category: + description_error = "Changelog category is empty" + # Filter out the PR categories that are not for changelog. + elif re.match( + r"(?i)doc|((non|in|not|un)[-\s]*significant)|(not[ ]*for[ ]*changelog)", + category, + ): + pass # to not check the rest of the conditions + elif category not in CATEGORY_TO_LABEL: + description_error, category = f"Category '{category}' is not valid", "" + elif not entry: + description_error = f"Changelog entry required for category '{category}'" + + return description_error, category diff --git a/tests/ci/lambda_shared_package/lambda_shared/token.py b/tests/ci/lambda_shared_package/lambda_shared/token.py new file mode 100644 index 00000000000..174ea4625a3 --- /dev/null +++ b/tests/ci/lambda_shared_package/lambda_shared/token.py @@ -0,0 +1,90 @@ +"""Module to get the token for GitHub""" +from dataclasses import dataclass +import json +import time +from typing import Tuple + +import boto3 # type: ignore +import jwt +import requests # type: ignore + + +def get_key_and_app_from_aws() -> Tuple[str, int]: + secret_name = "clickhouse_github_secret_key" + session = boto3.session.Session() + client = session.client( + service_name="secretsmanager", + ) + get_secret_value_response = client.get_secret_value(SecretId=secret_name) + data = json.loads(get_secret_value_response["SecretString"]) + return data["clickhouse-app-key"], int(data["clickhouse-app-id"]) + + +def get_installation_id(jwt_token: str) -> int: + headers = { + "Authorization": f"Bearer {jwt_token}", + "Accept": "application/vnd.github.v3+json", + } + response = requests.get("https://api.github.com/app/installations", headers=headers) + response.raise_for_status() + data = response.json() + for installation in data: + if installation["account"]["login"] == "ClickHouse": + installation_id = installation["id"] + + return installation_id # type: ignore + + +def get_access_token_by_jwt(jwt_token: str, installation_id: int) -> str: + headers = { + "Authorization": f"Bearer {jwt_token}", + "Accept": "application/vnd.github.v3+json", + } + response = requests.post( + f"https://api.github.com/app/installations/{installation_id}/access_tokens", + headers=headers, + ) + response.raise_for_status() + data = response.json() + return data["token"] # type: ignore + + +def get_token_from_aws() -> str: + private_key, app_id = get_key_and_app_from_aws() + return get_access_token_by_key_app(private_key, app_id) + + +def get_access_token_by_key_app(private_key: str, app_id: int) -> str: + payload = { + "iat": int(time.time()) - 60, + "exp": int(time.time()) + (10 * 60), + "iss": app_id, + } + + encoded_jwt = jwt.encode(payload, private_key, algorithm="RS256") + installation_id = get_installation_id(encoded_jwt) + return get_access_token_by_jwt(encoded_jwt, installation_id) + + +@dataclass +class CachedToken: + time: int + value: str + updating: bool = False + + +_cached_token = CachedToken(0, "") + + +def get_cached_access_token() -> str: + if time.time() - 550 < _cached_token.time or _cached_token.updating: + return _cached_token.value + # Indicate that the value is updating now, so the cached value can be + # used. The first setting and close-to-ttl are not counted as update + if _cached_token.time != 0 or time.time() - 590 < _cached_token.time: + _cached_token.updating = True + private_key, app_id = get_key_and_app_from_aws() + _cached_token.time = int(time.time()) + _cached_token.value = get_access_token_by_key_app(private_key, app_id) + _cached_token.updating = False + return _cached_token.value diff --git a/tests/ci/lambda_shared_package/pyproject.toml b/tests/ci/lambda_shared_package/pyproject.toml index 8b4b0a80948..bbf74cc0649 100644 --- a/tests/ci/lambda_shared_package/pyproject.toml +++ b/tests/ci/lambda_shared_package/pyproject.toml @@ -9,5 +9,15 @@ dependencies = [ "requests < 2.30", ] +[project.optional-dependencies] +token = [ + "PyJWT", + "cryptography<38", +] +dev = [ + "boto3", + "lambda_shared[token]", +] + [tool.distutils.bdist_wheel] universal = true diff --git a/tests/ci/run_check.py b/tests/ci/run_check.py index 9849f19a1e4..330a1309016 100644 --- a/tests/ci/run_check.py +++ b/tests/ci/run_check.py @@ -20,9 +20,11 @@ from docs_check import NAME as DOCS_NAME from env_helper import GITHUB_REPOSITORY, GITHUB_SERVER_URL from get_robot_token import get_best_robot_token from pr_info import FORCE_TESTS_LABEL, PRInfo - -from cancel_and_rerun_workflow_lambda.app import CATEGORY_TO_LABEL, check_pr_description -from workflow_approve_rerun_lambda.app import TRUSTED_CONTRIBUTORS +from lambda_shared_package.lambda_shared.pr import ( + CATEGORY_TO_LABEL, + TRUSTED_CONTRIBUTORS, + check_pr_description, +) TRUSTED_ORG_IDS = { 54801242, # clickhouse diff --git a/tests/ci/runner_token_rotation_lambda/app.py b/tests/ci/runner_token_rotation_lambda/app.py index 70ee5da01f4..6544eee9581 100644 --- a/tests/ci/runner_token_rotation_lambda/app.py +++ b/tests/ci/runner_token_rotation_lambda/app.py @@ -2,40 +2,11 @@ import argparse import sys -import json -import time import boto3 # type: ignore -import jwt import requests # type: ignore - -def get_installation_id(jwt_token): - headers = { - "Authorization": f"Bearer {jwt_token}", - "Accept": "application/vnd.github.v3+json", - } - response = requests.get("https://api.github.com/app/installations", headers=headers) - response.raise_for_status() - data = response.json() - for installation in data: - if installation["account"]["login"] == "ClickHouse": - installation_id = installation["id"] - return installation_id - - -def get_access_token(jwt_token, installation_id): - headers = { - "Authorization": f"Bearer {jwt_token}", - "Accept": "application/vnd.github.v3+json", - } - response = requests.post( - f"https://api.github.com/app/installations/{installation_id}/access_tokens", - headers=headers, - ) - response.raise_for_status() - data = response.json() - return data["token"] +from lambda_shared.token import get_cached_access_token, get_access_token_by_key_app def get_runner_registration_token(access_token): @@ -52,32 +23,10 @@ def get_runner_registration_token(access_token): return data["token"] -def get_key_and_app_from_aws(): - secret_name = "clickhouse_github_secret_key" - session = boto3.session.Session() - client = session.client( - service_name="secretsmanager", - ) - get_secret_value_response = client.get_secret_value(SecretId=secret_name) - data = json.loads(get_secret_value_response["SecretString"]) - return data["clickhouse-app-key"], int(data["clickhouse-app-id"]) - - -def main(github_secret_key, github_app_id, push_to_ssm, ssm_parameter_name): - payload = { - "iat": int(time.time()) - 60, - "exp": int(time.time()) + (10 * 60), - "iss": github_app_id, - } - - encoded_jwt = jwt.encode(payload, github_secret_key, algorithm="RS256") - installation_id = get_installation_id(encoded_jwt) - access_token = get_access_token(encoded_jwt, installation_id) +def main(access_token, push_to_ssm, ssm_parameter_name): runner_registration_token = get_runner_registration_token(access_token) if push_to_ssm: - import boto3 - print("Trying to put params into ssm manager") client = boto3.client("ssm") client.put_parameter( @@ -94,8 +43,7 @@ def main(github_secret_key, github_app_id, push_to_ssm, ssm_parameter_name): def handler(event, context): - private_key, app_id = get_key_and_app_from_aws() - main(private_key, app_id, True, "github_runner_registration_token") + main(get_cached_access_token(), True, "github_runner_registration_token") if __name__ == "__main__": @@ -140,4 +88,5 @@ if __name__ == "__main__": with open(args.private_key_path, "r") as key_file: private_key = key_file.read() - main(private_key, args.app_id, args.push_to_ssm, args.ssm_parameter_name) + token = get_access_token_by_key_app(private_key, args.app_id) + main(token, args.push_to_ssm, args.ssm_parameter_name) diff --git a/tests/ci/runner_token_rotation_lambda/lambda_shared b/tests/ci/runner_token_rotation_lambda/lambda_shared new file mode 120000 index 00000000000..ba86e090f6c --- /dev/null +++ b/tests/ci/runner_token_rotation_lambda/lambda_shared @@ -0,0 +1 @@ +../lambda_shared_package/lambda_shared \ No newline at end of file diff --git a/tests/ci/runner_token_rotation_lambda/requirements.txt b/tests/ci/runner_token_rotation_lambda/requirements.txt index 98be09ab232..4cb3fba0f7b 100644 --- a/tests/ci/runner_token_rotation_lambda/requirements.txt +++ b/tests/ci/runner_token_rotation_lambda/requirements.txt @@ -1,3 +1 @@ -requests<2.30 -PyJWT -cryptography<38 +../lambda_shared_package[token] diff --git a/tests/ci/team_keys_lambda/app.py b/tests/ci/team_keys_lambda/app.py index 870d41c441e..f562fbe101d 100644 --- a/tests/ci/team_keys_lambda/app.py +++ b/tests/ci/team_keys_lambda/app.py @@ -81,6 +81,8 @@ def get_cached_members_keys(members: set) -> Keys: def get_token_from_aws() -> str: + # We need a separate token, since the clickhouse-ci app does not have + # access to the organization members' endpoint secret_name = "clickhouse_robot_token" session = boto3.session.Session() client = session.client( @@ -130,4 +132,4 @@ if __name__ == "__main__": args = parser.parse_args() output = main(args.token, args.organization, args.team) - print(f"# Just shoing off the keys:\n{output}") + print(f"# Just showing off the keys:\n{output}") diff --git a/tests/ci/team_keys_lambda/lambda_shared b/tests/ci/team_keys_lambda/lambda_shared new file mode 120000 index 00000000000..ba86e090f6c --- /dev/null +++ b/tests/ci/team_keys_lambda/lambda_shared @@ -0,0 +1 @@ +../lambda_shared_package/lambda_shared \ No newline at end of file diff --git a/tests/ci/team_keys_lambda/requirements.txt b/tests/ci/team_keys_lambda/requirements.txt index 3bcbe2dfd07..098e04a9798 100644 --- a/tests/ci/team_keys_lambda/requirements.txt +++ b/tests/ci/team_keys_lambda/requirements.txt @@ -1 +1 @@ -requests<2.30 +../lambda_shared_package diff --git a/tests/ci/terminate_runner_lambda/app.py b/tests/ci/terminate_runner_lambda/app.py index 5799a498d5a..98b14508314 100644 --- a/tests/ci/terminate_runner_lambda/app.py +++ b/tests/ci/terminate_runner_lambda/app.py @@ -5,86 +5,12 @@ import json import sys import time from dataclasses import dataclass -from typing import Any, Dict, List, Tuple +from typing import Any, Dict, List import boto3 # type: ignore -import requests # type: ignore -import jwt from lambda_shared import RunnerDescriptions, list_runners - - -def get_key_and_app_from_aws() -> Tuple[str, int]: - secret_name = "clickhouse_github_secret_key" - session = boto3.session.Session() - client = session.client( - service_name="secretsmanager", - ) - get_secret_value_response = client.get_secret_value(SecretId=secret_name) - data = json.loads(get_secret_value_response["SecretString"]) - return data["clickhouse-app-key"], int(data["clickhouse-app-id"]) - - -def get_installation_id(jwt_token: str) -> int: - headers = { - "Authorization": f"Bearer {jwt_token}", - "Accept": "application/vnd.github.v3+json", - } - response = requests.get("https://api.github.com/app/installations", headers=headers) - response.raise_for_status() - data = response.json() - for installation in data: - if installation["account"]["login"] == "ClickHouse": - installation_id = installation["id"] - break - - return installation_id # type: ignore - - -def get_access_token(jwt_token: str, installation_id: int) -> str: - headers = { - "Authorization": f"Bearer {jwt_token}", - "Accept": "application/vnd.github.v3+json", - } - response = requests.post( - f"https://api.github.com/app/installations/{installation_id}/access_tokens", - headers=headers, - ) - response.raise_for_status() - data = response.json() - return data["token"] # type: ignore - - -@dataclass -class CachedToken: - time: int - value: str - updating: bool = False - - -cached_token = CachedToken(0, "") - - -def get_cached_access_token() -> str: - if time.time() - 550 < cached_token.time or cached_token.updating: - return cached_token.value - # Indicate that the value is updating now, so the cached value can be - # used. The first setting and close-to-ttl are not counted as update - if cached_token.time != 0 or time.time() - 590 < cached_token.time: - cached_token.updating = True - private_key, app_id = get_key_and_app_from_aws() - payload = { - "iat": int(time.time()) - 60, - "exp": int(time.time()) + (10 * 60), - "iss": app_id, - } - - encoded_jwt = jwt.encode(payload, private_key, algorithm="RS256") - installation_id = get_installation_id(encoded_jwt) - cached_token.time = int(time.time()) - cached_token.value = get_access_token(encoded_jwt, installation_id) - cached_token.updating = False - return cached_token.value +from lambda_shared.token import get_access_token_by_key_app, get_cached_access_token @dataclass @@ -284,6 +210,8 @@ if __name__ == "__main__": with open(args.private_key_path, "r") as key_file: private_key = key_file.read() + token = get_access_token_by_key_app(private_key, args.app_id) + sample_event = { "AutoScalingGroupARN": "arn:aws:autoscaling:us-east-1::autoScalingGroup:d4738357-2d40-4038-ae7e-b00ae0227003:autoScalingGroupName/my-asg", "AutoScalingGroupName": "my-asg", @@ -328,14 +256,4 @@ if __name__ == "__main__": "Cause": "SCALE_IN", } - payload = { - "iat": int(time.time()) - 60, - "exp": int(time.time()) + (10 * 60), - "iss": args.app_id, - } - - encoded_jwt = jwt.encode(payload, private_key, algorithm="RS256") - installation_id = get_installation_id(encoded_jwt) - access_token = get_access_token(encoded_jwt, args.app_id) - - main(access_token, sample_event) + main(token, sample_event) diff --git a/tests/ci/terminate_runner_lambda/requirements.txt b/tests/ci/terminate_runner_lambda/requirements.txt index e99dee1743c..4cb3fba0f7b 100644 --- a/tests/ci/terminate_runner_lambda/requirements.txt +++ b/tests/ci/terminate_runner_lambda/requirements.txt @@ -1,3 +1 @@ -../lambda_shared_package -PyJWT -cryptography<38 +../lambda_shared_package[token] diff --git a/tests/ci/workflow_approve_rerun_lambda/app.py b/tests/ci/workflow_approve_rerun_lambda/app.py index 32cba5d466b..3db62430d85 100644 --- a/tests/ci/workflow_approve_rerun_lambda/app.py +++ b/tests/ci/workflow_approve_rerun_lambda/app.py @@ -5,9 +5,10 @@ import fnmatch import json import time -import jwt import requests # type: ignore -import boto3 # type: ignore + +from lambda_shared.pr import TRUSTED_CONTRIBUTORS +from lambda_shared.token import get_cached_access_token SUSPICIOUS_CHANGED_FILES_NUMBER = 200 @@ -67,108 +68,6 @@ NEED_RERUN_WORKFLOWS = { "ReleaseBranchCI", } -# Individual trusted contirbutors who are not in any trusted organization. -# Can be changed in runtime: we will append users that we learned to be in -# a trusted org, to save GitHub API calls. -TRUSTED_CONTRIBUTORS = { - e.lower() - for e in [ - "achimbab", - "adevyatova ", # DOCSUP - "Algunenano", # Raúl Marín, Tinybird - "amosbird", - "AnaUvarova", # DOCSUP - "anauvarova", # technical writer, Yandex - "annvsh", # technical writer, Yandex - "atereh", # DOCSUP - "azat", - "bharatnc", # Newbie, but already with many contributions. - "bobrik", # Seasoned contributor, CloudFlare - "BohuTANG", - "codyrobert", # Flickerbox engineer - "cwurm", # Employee - "damozhaeva", # DOCSUP - "den-crane", - "flickerbox-tom", # Flickerbox - "gyuton", # DOCSUP - "hagen1778", # Roman Khavronenko, seasoned contributor - "hczhcz", - "hexiaoting", # Seasoned contributor - "ildus", # adjust, ex-pgpro - "javisantana", # a Spanish ClickHouse enthusiast, ex-Carto - "ka1bi4", # DOCSUP - "kirillikoff", # DOCSUP - "kreuzerkrieg", - "lehasm", # DOCSUP - "michon470", # DOCSUP - "nikvas0", - "nvartolomei", - "olgarev", # DOCSUP - "otrazhenia", # Yandex docs contractor - "pdv-ru", # DOCSUP - "podshumok", # cmake expert from QRator Labs - "s-mx", # Maxim Sabyanin, former employee, present contributor - "sevirov", # technical writer, Yandex - "spongedu", # Seasoned contributor - "taiyang-li", - "ucasFL", # Amos Bird's friend - "vdimir", # Employee - "vzakaznikov", - "YiuRULE", - "zlobober", # Developer of YT - "ilejn", # Arenadata, responsible for Kerberized Kafka - "thomoco", # ClickHouse - "BoloniniD", # Seasoned contributor, HSE - "tonickkozlov", # Cloudflare - "tylerhannan", # ClickHouse Employee - "myrrc", # Mike Kot, DoubleCloud - "thevar1able", # ClickHouse Employee - "aalexfvk", - "MikhailBurdukov", - "tsolodov", # ClickHouse Employee - "kitaisreal", - ] -} - - -def get_installation_id(jwt_token): - headers = { - "Authorization": f"Bearer {jwt_token}", - "Accept": "application/vnd.github.v3+json", - } - response = requests.get("https://api.github.com/app/installations", headers=headers) - response.raise_for_status() - data = response.json() - for installation in data: - if installation["account"]["login"] == "ClickHouse": - installation_id = installation["id"] - return installation_id - - -def get_access_token(jwt_token, installation_id): - headers = { - "Authorization": f"Bearer {jwt_token}", - "Accept": "application/vnd.github.v3+json", - } - response = requests.post( - f"https://api.github.com/app/installations/{installation_id}/access_tokens", - headers=headers, - ) - response.raise_for_status() - data = response.json() - return data["token"] - - -def get_key_and_app_from_aws(): - secret_name = "clickhouse_github_secret_key" - session = boto3.session.Session() - client = session.client( - service_name="secretsmanager", - ) - get_secret_value_response = client.get_secret_value(SecretId=secret_name) - data = json.loads(get_secret_value_response["SecretString"]) - return data["clickhouse-app-key"], int(data["clickhouse-app-id"]) - def is_trusted_contributor(pr_user_login, pr_user_orgs): if pr_user_login.lower() in TRUSTED_CONTRIBUTORS: @@ -331,19 +230,6 @@ def label_manual_approve(pull_request, token): _exec_post_with_retry(url, token, data) -def get_token_from_aws(): - private_key, app_id = get_key_and_app_from_aws() - payload = { - "iat": int(time.time()) - 60, - "exp": int(time.time()) + (10 * 60), - "iss": app_id, - } - - encoded_jwt = jwt.encode(payload, private_key, algorithm="RS256") - installation_id = get_installation_id(encoded_jwt) - return get_access_token(encoded_jwt, installation_id) - - def get_workflow_jobs(workflow_description, token): jobs_url = ( workflow_description.api_url + f"/attempts/{workflow_description.attempt}/jobs" @@ -443,7 +329,7 @@ def check_workflow_completed( def main(event): - token = get_token_from_aws() + token = get_cached_access_token() event_data = json.loads(event["body"]) print("The body received:", event["body"]) workflow_description = get_workflow_description_from_event(event_data) diff --git a/tests/ci/workflow_approve_rerun_lambda/lambda_shared b/tests/ci/workflow_approve_rerun_lambda/lambda_shared new file mode 120000 index 00000000000..ba86e090f6c --- /dev/null +++ b/tests/ci/workflow_approve_rerun_lambda/lambda_shared @@ -0,0 +1 @@ +../lambda_shared_package/lambda_shared \ No newline at end of file diff --git a/tests/ci/workflow_approve_rerun_lambda/requirements.txt b/tests/ci/workflow_approve_rerun_lambda/requirements.txt index 98be09ab232..4cb3fba0f7b 100644 --- a/tests/ci/workflow_approve_rerun_lambda/requirements.txt +++ b/tests/ci/workflow_approve_rerun_lambda/requirements.txt @@ -1,3 +1 @@ -requests<2.30 -PyJWT -cryptography<38 +../lambda_shared_package[token] diff --git a/tests/ci/workflow_jobs_lambda/lambda_shared b/tests/ci/workflow_jobs_lambda/lambda_shared new file mode 120000 index 00000000000..ba86e090f6c --- /dev/null +++ b/tests/ci/workflow_jobs_lambda/lambda_shared @@ -0,0 +1 @@ +../lambda_shared_package/lambda_shared \ No newline at end of file From 2dca0eac1b5024e97f8e36889d8d39f43e8e4c2b Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Tue, 23 May 2023 21:51:35 +0200 Subject: [PATCH 0251/1072] Delete __init__.py in lambda directories to break subpackage --- tests/ci/cancel_and_rerun_workflow_lambda/__init__.py | 0 tests/ci/runner_token_rotation_lambda/__init__.py | 0 tests/ci/team_keys_lambda/__init__.py | 0 tests/ci/terminate_runner_lambda/__init__.py | 0 tests/ci/workflow_approve_rerun_lambda/__init__.py | 1 - 5 files changed, 1 deletion(-) delete mode 100644 tests/ci/cancel_and_rerun_workflow_lambda/__init__.py delete mode 100644 tests/ci/runner_token_rotation_lambda/__init__.py delete mode 100644 tests/ci/team_keys_lambda/__init__.py delete mode 100644 tests/ci/terminate_runner_lambda/__init__.py delete mode 100644 tests/ci/workflow_approve_rerun_lambda/__init__.py diff --git a/tests/ci/cancel_and_rerun_workflow_lambda/__init__.py b/tests/ci/cancel_and_rerun_workflow_lambda/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/tests/ci/runner_token_rotation_lambda/__init__.py b/tests/ci/runner_token_rotation_lambda/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/tests/ci/team_keys_lambda/__init__.py b/tests/ci/team_keys_lambda/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/tests/ci/terminate_runner_lambda/__init__.py b/tests/ci/terminate_runner_lambda/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/tests/ci/workflow_approve_rerun_lambda/__init__.py b/tests/ci/workflow_approve_rerun_lambda/__init__.py deleted file mode 100644 index 4265cc3e6c1..00000000000 --- a/tests/ci/workflow_approve_rerun_lambda/__init__.py +++ /dev/null @@ -1 +0,0 @@ -#!/usr/bin/env python From e8b03d74986a4e0f51f9cd064493cd5419c78add Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Fri, 26 May 2023 17:17:49 +0200 Subject: [PATCH 0252/1072] Move insert part for ClickHouseHelper to shared --- .../lambda_shared/__init__.py | 91 +++++++++- tests/ci/workflow_jobs_lambda/app.py | 165 +++--------------- .../ci/workflow_jobs_lambda/requirements.txt | 2 +- 3 files changed, 110 insertions(+), 148 deletions(-) diff --git a/tests/ci/lambda_shared_package/lambda_shared/__init__.py b/tests/ci/lambda_shared_package/lambda_shared/__init__.py index fe52f98d5f6..534d7773ddd 100644 --- a/tests/ci/lambda_shared_package/lambda_shared/__init__.py +++ b/tests/ci/lambda_shared_package/lambda_shared/__init__.py @@ -5,7 +5,7 @@ import json import logging import time from collections import namedtuple -from typing import Any, List, Optional +from typing import Any, Dict, Iterable, List, Optional import boto3 # type: ignore import requests # type: ignore @@ -36,10 +36,14 @@ class CHException(Exception): pass +class InsertException(CHException): + pass + + class ClickHouseHelper: def __init__( self, - url: Optional[str] = None, + url: str, user: Optional[str] = None, password: Optional[str] = None, ): @@ -50,6 +54,89 @@ class ClickHouseHelper: if password: self.auth["X-ClickHouse-Key"] = password + @staticmethod + def _insert_json_str_info_impl( + url: str, auth: Dict[str, str], db: str, table: str, json_str: str + ) -> None: + params = { + "database": db, + "query": f"INSERT INTO {table} FORMAT JSONEachRow", + "date_time_input_format": "best_effort", + "send_logs_level": "warning", + } + + for i in range(5): + try: + response = requests.post( + url, params=params, data=json_str, headers=auth + ) + except Exception as e: + error = f"Received exception while sending data to {url} on {i} attempt: {e}" + logging.warning(error) + continue + + logging.info("Response content '%s'", response.content) + + if response.ok: + break + + error = ( + "Cannot insert data into clickhouse at try " + + str(i) + + ": HTTP code " + + str(response.status_code) + + ": '" + + str(response.text) + + "'" + ) + + if response.status_code >= 500: + # A retriable error + time.sleep(1) + continue + + logging.info( + "Request headers '%s', body '%s'", + response.request.headers, + response.request.body, + ) + + raise InsertException(error) + else: + raise InsertException(error) + + def _insert_json_str_info(self, db: str, table: str, json_str: str) -> None: + self._insert_json_str_info_impl(self.url, self.auth, db, table, json_str) + + def insert_event_into( + self, db: str, table: str, event: object, safe: bool = True + ) -> None: + event_str = json.dumps(event) + try: + self._insert_json_str_info(db, table, event_str) + except InsertException as e: + logging.error( + "Exception happened during inserting data into clickhouse: %s", e + ) + if not safe: + raise + + def insert_events_into( + self, db: str, table: str, events: Iterable[object], safe: bool = True + ) -> None: + jsons = [] + for event in events: + jsons.append(json.dumps(event)) + + try: + self._insert_json_str_info(db, table, ",".join(jsons)) + except InsertException as e: + logging.error( + "Exception happened during inserting data into clickhouse: %s", e + ) + if not safe: + raise + def _select_and_get_json_each_row(self, db: str, query: str) -> str: params = { "database": db, diff --git a/tests/ci/workflow_jobs_lambda/app.py b/tests/ci/workflow_jobs_lambda/app.py index c4ce68c3f8e..c624a492604 100644 --- a/tests/ci/workflow_jobs_lambda/app.py +++ b/tests/ci/workflow_jobs_lambda/app.py @@ -10,13 +10,11 @@ fields for private repositories from base64 import b64decode from dataclasses import dataclass -from typing import Any, List +from typing import Any, List, Optional import json import logging -import time -import boto3 # type: ignore -import requests # type: ignore +from lambda_shared import ClickHouseHelper, InsertException, get_parameter_from_ssm logging.getLogger().setLevel(logging.INFO) @@ -66,137 +64,7 @@ class WorkflowJob: return self.__dict__ -### VENDORING -def get_parameter_from_ssm(name, decrypt=True, client=None): - if not client: - client = boto3.client("ssm", region_name="us-east-1") - return client.get_parameter(Name=name, WithDecryption=decrypt)["Parameter"]["Value"] - - -class InsertException(Exception): - pass - - -class ClickHouseHelper: - def __init__(self, url=None): - if url is None: - url = get_parameter_from_ssm("clickhouse-test-stat-url") - - self.url = url - self.auth = { - "X-ClickHouse-User": get_parameter_from_ssm("clickhouse-test-stat-login"), - "X-ClickHouse-Key": get_parameter_from_ssm("clickhouse-test-stat-password"), - } - - @staticmethod - def _insert_json_str_info_impl(url, auth, db, table, json_str): - params = { - "database": db, - "query": f"INSERT INTO {table} FORMAT JSONEachRow", - "date_time_input_format": "best_effort", - "send_logs_level": "warning", - } - - for i in range(5): - try: - response = requests.post( - url, params=params, data=json_str, headers=auth - ) - except Exception as e: - error = f"Received exception while sending data to {url} on {i} attempt: {e}" - logging.warning(error) - continue - - logging.info("Response content '%s'", response.content) - - if response.ok: - break - - error = ( - "Cannot insert data into clickhouse at try " - + str(i) - + ": HTTP code " - + str(response.status_code) - + ": '" - + str(response.text) - + "'" - ) - - if response.status_code >= 500: - # A retriable error - time.sleep(1) - continue - - logging.info( - "Request headers '%s', body '%s'", - response.request.headers, - response.request.body, - ) - - raise InsertException(error) - else: - raise InsertException(error) - - def _insert_json_str_info(self, db, table, json_str): - self._insert_json_str_info_impl(self.url, self.auth, db, table, json_str) - - def insert_event_into(self, db, table, event, safe=True): - event_str = json.dumps(event) - try: - self._insert_json_str_info(db, table, event_str) - except InsertException as e: - logging.error( - "Exception happened during inserting data into clickhouse: %s", e - ) - if not safe: - raise - - def insert_events_into(self, db, table, events, safe=True): - jsons = [] - for event in events: - jsons.append(json.dumps(event)) - - try: - self._insert_json_str_info(db, table, ",".join(jsons)) - except InsertException as e: - logging.error( - "Exception happened during inserting data into clickhouse: %s", e - ) - if not safe: - raise - - def _select_and_get_json_each_row(self, db, query): - params = { - "database": db, - "query": query, - "default_format": "JSONEachRow", - } - for i in range(5): - response = None - try: - response = requests.get(self.url, params=params, headers=self.auth) - response.raise_for_status() - return response.text - except Exception as ex: - logging.warning("Cannot insert with exception %s", str(ex)) - if response: - logging.warning("Reponse text %s", response.text) - time.sleep(0.1 * i) - - raise Exception("Cannot fetch data from clickhouse") - - def select_json_each_row(self, db, query): - text = self._select_and_get_json_each_row(db, query) - result = [] - for line in text.split("\n"): - if line: - result.append(json.loads(line)) - return result - - -### VENDORING END - -clickhouse_client = ClickHouseHelper() +CH_CLIENT = None # type: Optional[ClickHouseHelper] def send_event_workflow_job(workflow_job: WorkflowJob) -> None: @@ -232,23 +100,30 @@ def send_event_workflow_job(workflow_job: WorkflowJob) -> None: # PARTITION BY toStartOfMonth(started_at) # ORDER BY (id, updated_at) # SETTINGS index_granularity = 8192 - global clickhouse_client - kwargs = { - "db": "default", - "table": "workflow_jobs", - "event": workflow_job.as_dict(), - "safe": False, - } + global CH_CLIENT + CH_CLIENT = CH_CLIENT or ClickHouseHelper( + get_parameter_from_ssm("clickhouse-test-stat-url"), + get_parameter_from_ssm("clickhouse-test-stat-login"), + get_parameter_from_ssm("clickhouse-test-stat-password"), + ) try: - clickhouse_client.insert_event_into(**kwargs) + CH_CLIENT.insert_event_into( + "default", "workflow_jobs", workflow_job.as_dict(), False + ) except InsertException as ex: logging.exception( "Got an exception on insert, tryuing to update the client " "credentials and repeat", exc_info=ex, ) - clickhouse_client = ClickHouseHelper() - clickhouse_client.insert_event_into(**kwargs) + CH_CLIENT = ClickHouseHelper( + get_parameter_from_ssm("clickhouse-test-stat-url"), + get_parameter_from_ssm("clickhouse-test-stat-login"), + get_parameter_from_ssm("clickhouse-test-stat-password"), + ) + CH_CLIENT.insert_event_into( + "default", "workflow_jobs", workflow_job.as_dict(), False + ) def handler(event: dict, context: Any) -> dict: diff --git a/tests/ci/workflow_jobs_lambda/requirements.txt b/tests/ci/workflow_jobs_lambda/requirements.txt index 3bcbe2dfd07..098e04a9798 100644 --- a/tests/ci/workflow_jobs_lambda/requirements.txt +++ b/tests/ci/workflow_jobs_lambda/requirements.txt @@ -1 +1 @@ -requests<2.30 +../lambda_shared_package From f62faaedc3c837b009479cb971e868526a6464fa Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Thu, 1 Jun 2023 21:20:39 +0200 Subject: [PATCH 0253/1072] paranoid fix for removing parts from zk --- .../MergeTree/ReplicatedMergeTreeSink.cpp | 19 +- .../MergeTree/ReplicatedMergeTreeSink.h | 2 +- src/Storages/StorageReplicatedMergeTree.cpp | 171 +++++++----------- src/Storages/StorageReplicatedMergeTree.h | 1 + ...tem_parts_race_condition_zookeeper_long.sh | 2 + .../0_stateless/01154_move_partition_long.sh | 2 + 6 files changed, 86 insertions(+), 111 deletions(-) diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp index a38e9eba844..28dad454afe 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp @@ -532,12 +532,12 @@ void ReplicatedMergeTreeSinkImpl::finishDelayedChunk(const ZooKeeperWithF try { - commitPart(zookeeper, part, partition.block_id, delayed_chunk->replicas_num, false); + bool deduplicated = commitPart(zookeeper, part, partition.block_id, delayed_chunk->replicas_num, false).second; - last_block_is_duplicate = last_block_is_duplicate || part->is_duplicate; + last_block_is_duplicate = last_block_is_duplicate || deduplicated; /// Set a special error code if the block is duplicate - int error = (deduplicate && part->is_duplicate) ? ErrorCodes::INSERT_WAS_DEDUPLICATED : 0; + int error = (deduplicate && deduplicated) ? ErrorCodes::INSERT_WAS_DEDUPLICATED : 0; auto counters_snapshot = std::make_shared(partition.part_counters.getPartiallyAtomicSnapshot()); PartLog::addNewPart(storage.getContext(), PartLog::PartLogEntry(part, partition.elapsed_ns, counters_snapshot), ExecutionStatus(error)); storage.incrementInsertedPartsProfileEvent(part->getType()); @@ -575,7 +575,7 @@ void ReplicatedMergeTreeSinkImpl::finishDelayedChunk(const ZooKeeperWithFa while (true) { partition.temp_part.finalize(); - auto conflict_block_ids = commitPart(zookeeper, partition.temp_part.part, partition.block_id, delayed_chunk->replicas_num, false); + auto conflict_block_ids = commitPart(zookeeper, partition.temp_part.part, partition.block_id, delayed_chunk->replicas_num, false).first; if (conflict_block_ids.empty()) break; ++retry_times; @@ -620,7 +620,7 @@ void ReplicatedMergeTreeSinkImpl::writeExistingPart(MergeTreeData: } template -std::vector ReplicatedMergeTreeSinkImpl::commitPart( +std::pair, bool> ReplicatedMergeTreeSinkImpl::commitPart( const ZooKeeperWithFaultInjectionPtr & zookeeper, MergeTreeData::MutableDataPartPtr & part, const BlockIDsType & block_id, @@ -644,6 +644,7 @@ std::vector ReplicatedMergeTreeSinkImpl::commitPart( /// for retries due to keeper error bool part_committed_locally_but_zookeeper = false; + bool part_was_deduplicated = false; Coordination::Error write_part_info_keeper_error = Coordination::Error::ZOK; std::vector conflict_block_ids; @@ -844,7 +845,7 @@ std::vector ReplicatedMergeTreeSinkImpl::commitPart( /// If it exists on our replica, ignore it. if (storage.getActiveContainingPart(existing_part_name)) { - part->is_duplicate = true; + part_was_deduplicated = true; ProfileEvents::increment(ProfileEvents::DuplicatedInsertedBlocks); if (isQuorumEnabled()) { @@ -1040,7 +1041,7 @@ std::vector ReplicatedMergeTreeSinkImpl::commitPart( ++loop_counter; if (loop_counter == max_iterations) { - part->is_duplicate = true; /// Part is duplicate, just remove it from local FS + part_was_deduplicated = true; /// Part is duplicate, just remove it from local FS throw Exception(ErrorCodes::DUPLICATE_DATA_PART, "Too many transaction retries - it may indicate an error"); } retries_ctl.requestUnconditionalRetry(); /// we want one more iteration w/o counting it as a try and timeout @@ -1093,7 +1094,7 @@ std::vector ReplicatedMergeTreeSinkImpl::commitPart( [&zookeeper]() { zookeeper->cleanupEphemeralNodes(); }); if (!conflict_block_ids.empty()) - return conflict_block_ids; + return {conflict_block_ids, part_was_deduplicated}; if (isQuorumEnabled()) { @@ -1129,7 +1130,7 @@ std::vector ReplicatedMergeTreeSinkImpl::commitPart( return; }); } - return {}; + return {conflict_block_ids, part_was_deduplicated}; } template diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeSink.h b/src/Storages/MergeTree/ReplicatedMergeTreeSink.h index 3777a9f7285..3efd364fc9c 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeSink.h +++ b/src/Storages/MergeTree/ReplicatedMergeTreeSink.h @@ -87,7 +87,7 @@ private: size_t checkQuorumPrecondition(const ZooKeeperWithFaultInjectionPtr & zookeeper); /// Rename temporary part and commit to ZooKeeper. - std::vector commitPart( + std::pair, bool> commitPart( const ZooKeeperWithFaultInjectionPtr & zookeeper, MergeTreeData::MutableDataPartPtr & part, const BlockIDsType & block_id, diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 0f79e9f8f19..44403fc708b 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -1992,6 +1993,16 @@ void StorageReplicatedMergeTree::executeDropRange(const LogEntry & entry) /// Forcibly remove parts from ZooKeeper removePartsFromZooKeeperWithRetries(parts_to_remove); +#ifdef ABORT_ON_LOGICAL_ERROR + Strings parts_remain = getZooKeeper()->getChildren(replica_path + "/parts"); + for (const auto & part_name : parts_remain) + { + auto part_info = MergeTreePartInfo::fromPartName(part_name, format_version); + if (drop_range_info.contains(part_info)) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Part {} remains in ZooKeeper after DROP_RANGE {}", part_name, entry.new_part_name); + } +#endif + if (entry.detach) LOG_DEBUG(log, "Detached {} parts inside {}.", parts_to_remove.size(), entry.new_part_name); else @@ -6634,8 +6645,7 @@ bool StorageReplicatedMergeTree::hasLightweightDeletedMask() const void StorageReplicatedMergeTree::clearOldPartsAndRemoveFromZK() { - auto table_lock = lockForShare( - RWLockImpl::NO_QUERY, getSettings()->lock_acquire_timeout_for_background_operations); + auto table_lock = lockForShare(RWLockImpl::NO_QUERY, getSettings()->lock_acquire_timeout_for_background_operations); auto zookeeper = getZooKeeper(); /// Now these parts are in Deleting state. If we fail to remove some of them we must roll them back to Outdated state. @@ -6644,6 +6654,12 @@ void StorageReplicatedMergeTree::clearOldPartsAndRemoveFromZK() if (parts.empty()) return; + NOEXCEPT_SCOPE({ clearOldPartsAndRemoveFromZKImpl(zookeeper, std::move(parts)); }); +} + +void StorageReplicatedMergeTree::clearOldPartsAndRemoveFromZKImpl(zkutil::ZooKeeperPtr zookeeper, DataPartsVector && parts) +{ + DataPartsVector parts_to_delete_only_from_filesystem; // Only duplicates DataPartsVector parts_to_delete_completely; // All parts except duplicates DataPartsVector parts_to_retry_deletion; // Parts that should be retried due to network problems @@ -6654,7 +6670,11 @@ void StorageReplicatedMergeTree::clearOldPartsAndRemoveFromZK() /// Broken part can be removed from zk by removePartAndEnqueueFetch(...) only. /// Removal without enqueueing a fetch leads to intersecting parts. if (part->is_duplicate || part->outdated_because_broken) + { + LOG_WARNING(log, "Will not remove part {} from ZooKeeper (is_duplicate: {}, outdated_because_broken: {})", + part->name, part->is_duplicate, part->outdated_because_broken); parts_to_delete_only_from_filesystem.emplace_back(part); + } else parts_to_delete_completely.emplace_back(part); } @@ -6680,7 +6700,7 @@ void StorageReplicatedMergeTree::clearOldPartsAndRemoveFromZK() if (!rollback_parts.empty()) rollbackDeletingParts(rollback_parts); } - else /// all parts was successfully removed + else /// all parts were successfully removed { finally_remove_parts = parts_to_delete; } @@ -6764,114 +6784,57 @@ void StorageReplicatedMergeTree::removePartsFromZooKeeperWithRetries(PartsToRemo void StorageReplicatedMergeTree::removePartsFromZooKeeperWithRetries(const Strings & part_names, size_t max_retries) { + auto zookeeper = getZooKeeper(); + NameSet parts_to_retry_set; + removePartsFromZooKeeper(zookeeper, part_names, &parts_to_retry_set); + size_t num_tries = 0; - bool success = false; - - while (!success && (max_retries == 0 || num_tries < max_retries)) + while (!parts_to_retry_set.empty() && (max_retries == 0 || num_tries < max_retries)) { - try - { - ++num_tries; - success = true; - - auto zookeeper = getZooKeeper(); - - Strings exists_paths; - exists_paths.reserve(part_names.size()); - for (const String & part_name : part_names) - { - exists_paths.emplace_back(fs::path(replica_path) / "parts" / part_name); - } - - auto exists_results = zookeeper->exists(exists_paths); - - std::vector> remove_futures; - remove_futures.reserve(part_names.size()); - for (size_t i = 0; i < part_names.size(); ++i) - { - Coordination::ExistsResponse exists_resp = exists_results[i]; - if (exists_resp.error == Coordination::Error::ZOK) - { - Coordination::Requests ops; - getRemovePartFromZooKeeperOps(part_names[i], ops, exists_resp.stat.numChildren > 0); - remove_futures.emplace_back(zookeeper->asyncTryMultiNoThrow(ops)); - } - } - - for (auto & future : remove_futures) - { - auto response = future.get(); - - if (response.error == Coordination::Error::ZOK || response.error == Coordination::Error::ZNONODE) - continue; - - if (Coordination::isHardwareError(response.error)) - { - success = false; - continue; - } - - throw Coordination::Exception(response.error); - } - } - catch (Coordination::Exception & e) - { - success = false; - - if (Coordination::isHardwareError(e.code)) - tryLogCurrentException(log, __PRETTY_FUNCTION__); - else - throw; - } - - if (!success && num_tries < max_retries) - std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + zookeeper = getZooKeeper(); + Strings parts_to_retry; + std::move(parts_to_retry_set.begin(), parts_to_retry_set.end(), std::back_inserter(parts_to_retry)); + parts_to_retry_set.clear(); + removePartsFromZooKeeper(zookeeper, parts_to_retry, &parts_to_retry_set); + ++num_tries; } - if (!success) - throw Exception(ErrorCodes::UNFINISHED, "Failed to remove parts from ZooKeeper after {} retries", num_tries); + if (!parts_to_retry_set.empty()) + throw Exception(ErrorCodes::UNFINISHED, "Failed to remove {} parts from ZooKeeper after {} retries", parts_to_retry_set.size(), num_tries); } void StorageReplicatedMergeTree::removePartsFromZooKeeper( zkutil::ZooKeeperPtr & zookeeper, const Strings & part_names, NameSet * parts_should_be_retried) +try { Strings exists_paths; std::vector> remove_futures; exists_paths.reserve(part_names.size()); remove_futures.reserve(part_names.size()); - try + /// Exception can be thrown from loop + /// if zk session will be dropped + for (const String & part_name : part_names) { - /// Exception can be thrown from loop - /// if zk session will be dropped - for (const String & part_name : part_names) - { - exists_paths.emplace_back(fs::path(replica_path) / "parts" / part_name); - } - - auto exists_results = zookeeper->exists(exists_paths); - - for (size_t i = 0; i < part_names.size(); ++i) - { - auto exists_resp = exists_results[i]; - if (exists_resp.error == Coordination::Error::ZOK) - { - Coordination::Requests ops; - getRemovePartFromZooKeeperOps(part_names[i], ops, exists_resp.stat.numChildren > 0); - remove_futures.emplace_back(zookeeper->asyncTryMultiNoThrow(ops)); - } - else - { - LOG_DEBUG(log, "There is no part {} in ZooKeeper, it was only in filesystem", part_names[i]); - // emplace invalid future so that the total number of futures is the same as part_names.size(); - remove_futures.emplace_back(); - } - } + exists_paths.emplace_back(fs::path(replica_path) / "parts" / part_name); } - catch (const Coordination::Exception & e) + + auto exists_results = zookeeper->exists(exists_paths); + + for (size_t i = 0; i < part_names.size(); ++i) { - if (parts_should_be_retried && Coordination::isHardwareError(e.code)) - parts_should_be_retried->insert(part_names.begin(), part_names.end()); - throw; + auto exists_resp = exists_results[i]; + if (exists_resp.error == Coordination::Error::ZOK) + { + Coordination::Requests ops; + getRemovePartFromZooKeeperOps(part_names[i], ops, exists_resp.stat.numChildren > 0); + remove_futures.emplace_back(zookeeper->asyncTryMultiNoThrow(ops)); + } + else + { + LOG_DEBUG(log, "There is no part {} in ZooKeeper, it was only in filesystem", part_names[i]); + // emplace invalid future so that the total number of futures is the same as part_names.size(); + remove_futures.emplace_back(); + } } for (size_t i = 0; i < remove_futures.size(); ++i) @@ -6884,21 +6847,27 @@ void StorageReplicatedMergeTree::removePartsFromZooKeeper( auto response = future.get(); if (response.error == Coordination::Error::ZOK) continue; - else if (response.error == Coordination::Error::ZNONODE) + + if (response.error == Coordination::Error::ZNONODE) { LOG_DEBUG(log, "There is no part {} in ZooKeeper, it was only in filesystem", part_names[i]); - continue; } - else if (Coordination::isHardwareError(response.error)) + else { if (parts_should_be_retried) parts_should_be_retried->insert(part_names[i]); - continue; + + if (!Coordination::isHardwareError(response.error)) + LOG_WARNING(log, "Cannot remove part {} from ZooKeeper: {}", part_names[i], Coordination::errorMessage(response.error)); } - else - LOG_WARNING(log, "Cannot remove part {} from ZooKeeper: {}", part_names[i], Coordination::errorMessage(response.error)); } } +catch (...) +{ + if (parts_should_be_retried) + parts_should_be_retried->insert(part_names.begin(), part_names.end()); + throw; +} void StorageReplicatedMergeTree::clearLockedBlockNumbersInPartition( zkutil::ZooKeeper & zookeeper, const String & partition_id, Int64 min_block_num, Int64 max_block_num) diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h index cb93dd0b5e3..dd7ea84f76b 100644 --- a/src/Storages/StorageReplicatedMergeTree.h +++ b/src/Storages/StorageReplicatedMergeTree.h @@ -344,6 +344,7 @@ private: /// Delete old parts from disk and from ZooKeeper. void clearOldPartsAndRemoveFromZK(); + void clearOldPartsAndRemoveFromZKImpl(zkutil::ZooKeeperPtr zookeeper, DataPartsVector && parts); template friend class ReplicatedMergeTreeSinkImpl; diff --git a/tests/queries/0_stateless/00992_system_parts_race_condition_zookeeper_long.sh b/tests/queries/0_stateless/00992_system_parts_race_condition_zookeeper_long.sh index 5b1c50262bf..862cc90fb1c 100755 --- a/tests/queries/0_stateless/00992_system_parts_race_condition_zookeeper_long.sh +++ b/tests/queries/0_stateless/00992_system_parts_race_condition_zookeeper_long.sh @@ -79,6 +79,8 @@ timeout $TIMEOUT bash -c thread5 2> /dev/null & wait check_replication_consistency "alter_table" "count(), sum(a), sum(b), round(sum(c))" +$CLICKHOUSE_CLIENT -q "SELECT table, lost_part_count FROM system.replicas WHERE database=currentDatabase() AND lost_part_count!=0"; + $CLICKHOUSE_CLIENT -n -q "DROP TABLE alter_table0;" 2> >(grep -F -v 'is already started to be removing by another replica right now') & $CLICKHOUSE_CLIENT -n -q "DROP TABLE alter_table1;" 2> >(grep -F -v 'is already started to be removing by another replica right now') & wait diff --git a/tests/queries/0_stateless/01154_move_partition_long.sh b/tests/queries/0_stateless/01154_move_partition_long.sh index c68b0944407..3e068fa0e2d 100755 --- a/tests/queries/0_stateless/01154_move_partition_long.sh +++ b/tests/queries/0_stateless/01154_move_partition_long.sh @@ -125,6 +125,8 @@ wait check_replication_consistency "dst_" "count(), sum(p), sum(k), sum(v)" try_sync_replicas "src_" 300 +$CLICKHOUSE_CLIENT -q "SELECT table, lost_part_count FROM system.replicas WHERE database=currentDatabase() AND lost_part_count!=0"; + for ((i=0; i<16; i++)) do $CLICKHOUSE_CLIENT -q "DROP TABLE dst_$i" 2>&1| grep -Fv "is already started to be removing" & $CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS src_$i" 2>&1| grep -Fv "is already started to be removing" & From 38abcd1c44bc580217081d9fb3d72a1dcd951fa3 Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Thu, 1 Jun 2023 19:25:53 +0000 Subject: [PATCH 0254/1072] Add nullable support to DateTimeTransformImpl --- src/Functions/DateTimeTransforms.h | 44 ++++++++++++------ src/Functions/FunctionsConversion.h | 46 ++++++++++++++++++- .../01556_accurate_cast_or_null.reference | 8 ++++ .../01556_accurate_cast_or_null.sql | 10 ++++ 4 files changed, 93 insertions(+), 15 deletions(-) diff --git a/src/Functions/DateTimeTransforms.h b/src/Functions/DateTimeTransforms.h index 81b1ec2e356..0008b36071b 100644 --- a/src/Functions/DateTimeTransforms.h +++ b/src/Functions/DateTimeTransforms.h @@ -5,6 +5,8 @@ #include #include #include +#include +#include #include #include #include @@ -1433,7 +1435,8 @@ template - static void vector(const FromTypeVector & vec_from, ToTypeVector & vec_to, const DateLUTImpl & time_zone, const Transform & transform) + static void vector(const FromTypeVector & vec_from, ToTypeVector & vec_to, const DateLUTImpl & time_zone, const Transform & transform, + ColumnUInt8::Container * vec_null_map_to [[maybe_unused]]) { using ValueType = typename ToTypeVector::value_type; size_t size = vec_from.size(); @@ -1441,28 +1444,30 @@ struct Transformer for (size_t i = 0; i < size; ++i) { - constexpr bool transformHasExtraCheck = requires(const Transform& t) + constexpr bool transformHasIsConvertible = requires(const Transform& t) { - t.ExtraCheck(vec_from[i], time_zone); + t.IsConvertible(vec_from[i], time_zone); }; - if constexpr (transformHasExtraCheck) + if constexpr (transformHasIsConvertible) { - // if constexpr (std::is_same_v - // || std::is_same_v) + if constexpr (std::is_same_v + || std::is_same_v) { - bool checked = transform.ExtraCheck(vec_from[i], time_zone); + bool checked = transform.IsConvertible(vec_from[i], time_zone); if (!checked) { - if (std::is_same_v) + if (std::is_same_v) { - // vec_to[i] = 0; - // (*vec_null_map_to)[i] = true; + vec_to[i] = 0; + if (vec_null_map_to) + (*vec_null_map_to)[i] = true; + continue; } else { throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, "Value in column {} cannot be safely converted into type {}", - TypeName, TypeName); + TypeName, TypeName); } } } @@ -1488,6 +1493,14 @@ struct DateTimeTransformImpl const ColumnPtr source_col = arguments[0].column; if (const auto * sources = checkAndGetColumn(source_col.get())) { + ColumnUInt8::MutablePtr col_null_map_to; + ColumnUInt8::Container * vec_null_map_to [[maybe_unused]] = nullptr; + if constexpr (std::is_same_v) + { + col_null_map_to = ColumnUInt8::create(sources->getData().size(), false); + vec_null_map_to = &col_null_map_to->getData(); + } + auto mutable_result_col = result_type->createColumn(); auto * col_to = assert_cast(mutable_result_col.get()); @@ -1495,7 +1508,7 @@ struct DateTimeTransformImpl if (result_data_type.isDateTime() || result_data_type.isDateTime64()) { const auto & time_zone = dynamic_cast(*result_type).getTimeZone(); - Op::vector(sources->getData(), col_to->getData(), time_zone, transform); + Op::vector(sources->getData(), col_to->getData(), time_zone, transform, vec_null_map_to); } else { @@ -1504,7 +1517,12 @@ struct DateTimeTransformImpl time_zone_argument_position = 2; const DateLUTImpl & time_zone = extractTimeZoneFromFunctionArguments(arguments, time_zone_argument_position, 0); - Op::vector(sources->getData(), col_to->getData(), time_zone, transform); + Op::vector(sources->getData(), col_to->getData(), time_zone, transform, vec_null_map_to); + } + + if (vec_null_map_to) + { + return ColumnNullable::create(std::move(mutable_result_col), std::move(col_null_map_to)); } return mutable_result_col; diff --git a/src/Functions/FunctionsConversion.h b/src/Functions/FunctionsConversion.h index d3ccbb82721..4b25a59ecc6 100644 --- a/src/Functions/FunctionsConversion.h +++ b/src/Functions/FunctionsConversion.h @@ -370,6 +370,11 @@ struct ToDateTransform32Or64 { static constexpr auto name = "toDate"; + static NO_SANITIZE_UNDEFINED bool IsConvertible(const FromType & from, const DateLUTImpl &) + { + return from >= 0; + } + static NO_SANITIZE_UNDEFINED ToType execute(const FromType & from, const DateLUTImpl & time_zone) { // since converting to Date, no need in values outside of default LUT range. @@ -384,6 +389,11 @@ struct ToDateTransform32Or64Signed { static constexpr auto name = "toDate"; + static NO_SANITIZE_UNDEFINED bool IsConvertible(const FromType & from, const DateLUTImpl &) + { + return from >= 0; + } + static NO_SANITIZE_UNDEFINED ToType execute(const FromType & from, const DateLUTImpl & time_zone) { // TODO: decide narrow or extended range based on FromType @@ -400,7 +410,8 @@ template struct ToDateTransform8Or16Signed { static constexpr auto name = "toDate"; - static NO_SANITIZE_UNDEFINED bool ExtraCheck(const FromType & from, const DateLUTImpl &) + + static NO_SANITIZE_UNDEFINED bool IsConvertible(const FromType & from, const DateLUTImpl &) { return from >= 0; } @@ -423,6 +434,11 @@ struct ToDate32Transform32Or64 { static constexpr auto name = "toDate32"; + static NO_SANITIZE_UNDEFINED bool IsConvertible(const FromType & from, const DateLUTImpl &) + { + return from >= 0; + } + static NO_SANITIZE_UNDEFINED ToType execute(const FromType & from, const DateLUTImpl & time_zone) { return (from < DATE_LUT_MAX_EXTEND_DAY_NUM) @@ -436,6 +452,11 @@ struct ToDate32Transform32Or64Signed { static constexpr auto name = "toDate32"; + static NO_SANITIZE_UNDEFINED bool IsConvertible(const FromType & from, const DateLUTImpl &) + { + return from >= 0; + } + static NO_SANITIZE_UNDEFINED ToType execute(const FromType & from, const DateLUTImpl & time_zone) { static const Int32 daynum_min_offset = -static_cast(DateLUT::instance().getDayNumOffsetEpoch()); @@ -452,6 +473,11 @@ struct ToDate32Transform8Or16Signed { static constexpr auto name = "toDate32"; + static NO_SANITIZE_UNDEFINED bool IsConvertible(const FromType & from, const DateLUTImpl &) + { + return from >= 0; + } + static NO_SANITIZE_UNDEFINED ToType execute(const FromType & from, const DateLUTImpl &) { return from; @@ -507,6 +533,11 @@ struct ToDateTimeTransform64 { static constexpr auto name = "toDateTime"; + static NO_SANITIZE_UNDEFINED bool IsConvertible(const FromType & from, const DateLUTImpl &) + { + return from >= 0; + } + static NO_SANITIZE_UNDEFINED ToType execute(const FromType & from, const DateLUTImpl &) { return static_cast(std::min(time_t(from), time_t(0xFFFFFFFF))); @@ -518,6 +549,11 @@ struct ToDateTimeTransformSigned { static constexpr auto name = "toDateTime"; + static NO_SANITIZE_UNDEFINED bool IsConvertible(const FromType & from, const DateLUTImpl &) + { + return from >= 0; + } + static NO_SANITIZE_UNDEFINED ToType execute(const FromType & from, const DateLUTImpl &) { if (from < 0) @@ -531,6 +567,11 @@ struct ToDateTimeTransform64Signed { static constexpr auto name = "toDateTime"; + static NO_SANITIZE_UNDEFINED bool IsConvertible(const FromType & from, const DateLUTImpl &) + { + return from >= 0; + } + static NO_SANITIZE_UNDEFINED ToType execute(const FromType & from, const DateLUTImpl &) { if (from < 0) @@ -2886,7 +2927,8 @@ private: return true; } - if constexpr (IsDataTypeNumber && IsDataTypeDateOrDateTime) + if constexpr (IsDataTypeNumber + && (std::is_same_v || std::is_same_v)) { if (wrapper_cast_type == CastType::accurate) { diff --git a/tests/queries/0_stateless/01556_accurate_cast_or_null.reference b/tests/queries/0_stateless/01556_accurate_cast_or_null.reference index b329aede01a..8429d5d0e64 100644 --- a/tests/queries/0_stateless/01556_accurate_cast_or_null.reference +++ b/tests/queries/0_stateless/01556_accurate_cast_or_null.reference @@ -30,3 +30,11 @@ \N 127 \N +\N +\N +2023-05-30 14:38:20 +1970-01-01 00:00:19 +\N +\N +2023-05-30 +1970-01-20 \ No newline at end of file diff --git a/tests/queries/0_stateless/01556_accurate_cast_or_null.sql b/tests/queries/0_stateless/01556_accurate_cast_or_null.sql index b45bbe35662..a9038a1d230 100644 --- a/tests/queries/0_stateless/01556_accurate_cast_or_null.sql +++ b/tests/queries/0_stateless/01556_accurate_cast_or_null.sql @@ -35,3 +35,13 @@ SELECT accurateCastOrNull(nan, 'UInt64'); SELECT accurateCastOrNull(nan, 'UInt256'); SELECT accurateCastOrNull(number + 127, 'Int8') AS x FROM numbers (2) ORDER BY x; + +SELECT accurateCastOrNull(-1, 'DateTime'); +SELECT accurateCastOrNull('1xxx', 'DateTime'); +SELECT accurateCastOrNull('2023-05-30 14:38:20', 'DateTime'); +SELECT accurateCastOrNull(19, 'DateTime'); + +SELECT accurateCastOrNull(-1, 'Date'); +SELECT accurateCastOrNull('1xxx', 'Date'); +SELECT accurateCastOrNull('2023-05-30', 'Date'); +SELECT accurateCastOrNull(19, 'Date'); From 11ead24bf9d0426e15a94d87746430f16035da20 Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Thu, 1 Jun 2023 19:38:46 +0000 Subject: [PATCH 0255/1072] Fix tests after nullable fixing --- .../queries/0_stateless/01556_accurate_cast_or_null.reference | 2 +- tests/queries/0_stateless/01601_accurate_cast.reference | 2 +- tests/queries/0_stateless/01601_accurate_cast.sql | 2 +- .../0_stateless/01746_convert_type_with_default.reference | 3 ++- tests/queries/0_stateless/01746_convert_type_with_default.sql | 3 ++- 5 files changed, 7 insertions(+), 5 deletions(-) diff --git a/tests/queries/0_stateless/01556_accurate_cast_or_null.reference b/tests/queries/0_stateless/01556_accurate_cast_or_null.reference index 8429d5d0e64..cbdf72e9910 100644 --- a/tests/queries/0_stateless/01556_accurate_cast_or_null.reference +++ b/tests/queries/0_stateless/01556_accurate_cast_or_null.reference @@ -37,4 +37,4 @@ \N \N 2023-05-30 -1970-01-20 \ No newline at end of file +1970-01-20 diff --git a/tests/queries/0_stateless/01601_accurate_cast.reference b/tests/queries/0_stateless/01601_accurate_cast.reference index b662319d263..3c6dceb1f16 100644 --- a/tests/queries/0_stateless/01601_accurate_cast.reference +++ b/tests/queries/0_stateless/01601_accurate_cast.reference @@ -9,4 +9,4 @@ 2023-05-30 14:38:20 1970-01-01 00:00:19 2023-05-30 -1970-01-20 \ No newline at end of file +1970-01-20 diff --git a/tests/queries/0_stateless/01601_accurate_cast.sql b/tests/queries/0_stateless/01601_accurate_cast.sql index 1ab98e26d1a..7611b1d96b9 100644 --- a/tests/queries/0_stateless/01601_accurate_cast.sql +++ b/tests/queries/0_stateless/01601_accurate_cast.sql @@ -29,6 +29,6 @@ SELECT accurateCast('2023-05-30 14:38:20', 'DateTime'); SELECT accurateCast(19, 'DateTime'); SELECT accurateCast(-1, 'Date'); -- { serverError 70 } -SELECT accurateCast('1xxx', 'Date'); -- { serverError 70 } +SELECT accurateCast('1xxx', 'Date'); -- { serverError 38 } SELECT accurateCast('2023-05-30', 'Date'); SELECT accurateCast(19, 'Date'); diff --git a/tests/queries/0_stateless/01746_convert_type_with_default.reference b/tests/queries/0_stateless/01746_convert_type_with_default.reference index 892a12434b9..85bf2064fdc 100644 --- a/tests/queries/0_stateless/01746_convert_type_with_default.reference +++ b/tests/queries/0_stateless/01746_convert_type_with_default.reference @@ -40,7 +40,8 @@ 1970-01-20 1970-01-20 2023-05-30 -1970-01-01 +2023-05-30 +2023-05-30 14:38:20 2023-05-30 14:38:20 2023-05-30 14:38:20 2023-05-30 14:38:20 diff --git a/tests/queries/0_stateless/01746_convert_type_with_default.sql b/tests/queries/0_stateless/01746_convert_type_with_default.sql index 75e1510f330..1065eefa94d 100644 --- a/tests/queries/0_stateless/01746_convert_type_with_default.sql +++ b/tests/queries/0_stateless/01746_convert_type_with_default.sql @@ -54,12 +54,13 @@ select toDateOrDefault(cast(19 as Int256)); select toDateOrDefault(cast(19 as UInt256)); select toDateOrDefault(19507, '2000-01-01'::Date); -select toDateOrDefault(-1, '2000-01-01'::Date); +select toDateOrDefault(-1, '2023-05-30'::Date); select toDateTimeOrDefault('2023-05-30 14:38:20'); select toDateTimeOrDefault('2023-05-30 14:38:20', 'UTC'); select toDateTimeOrDefault('1xxx', 'UTC', '2023-05-30 14:38:20'::DateTime('UTC')); select toDateTimeOrDefault(1685457500, 'UTC'); +select toDateTimeOrDefault(-1, 'UTC', '2023-05-30 14:38:20'::DateTime('UTC')); select toDateTimeOrDefault(cast(19 as Int8), 'UTC'); select toDateTimeOrDefault(cast(19 as UInt8), 'UTC'); From a22e80eed56ca0e6ab35ba0ad8c53c0a629a4839 Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Thu, 1 Jun 2023 19:52:48 +0000 Subject: [PATCH 0256/1072] Remove whitespaces --- src/Functions/DateTimeTransforms.h | 6 +++--- src/Functions/FunctionsConversion.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Functions/DateTimeTransforms.h b/src/Functions/DateTimeTransforms.h index 0008b36071b..9f8f4df2465 100644 --- a/src/Functions/DateTimeTransforms.h +++ b/src/Functions/DateTimeTransforms.h @@ -1448,7 +1448,7 @@ struct Transformer { t.IsConvertible(vec_from[i], time_zone); }; - + if constexpr (transformHasIsConvertible) { if constexpr (std::is_same_v @@ -1472,7 +1472,7 @@ struct Transformer } } } - + if constexpr (is_extended_result) vec_to[i] = static_cast(transform.executeExtendedResult(vec_from[i], time_zone)); else @@ -1500,7 +1500,7 @@ struct DateTimeTransformImpl col_null_map_to = ColumnUInt8::create(sources->getData().size(), false); vec_null_map_to = &col_null_map_to->getData(); } - + auto mutable_result_col = result_type->createColumn(); auto * col_to = assert_cast(mutable_result_col.get()); diff --git a/src/Functions/FunctionsConversion.h b/src/Functions/FunctionsConversion.h index 4b25a59ecc6..d77090afe71 100644 --- a/src/Functions/FunctionsConversion.h +++ b/src/Functions/FunctionsConversion.h @@ -415,7 +415,7 @@ struct ToDateTransform8Or16Signed { return from >= 0; } - + static NO_SANITIZE_UNDEFINED ToType execute(const FromType & from, const DateLUTImpl &) { if (from < 0) From 1544067fb37b53b5ba0e1101db9ab068e9903217 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Thu, 1 Jun 2023 23:28:19 +0300 Subject: [PATCH 0257/1072] Update run.sh --- docker/test/unit/run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/test/unit/run.sh b/docker/test/unit/run.sh index abc35fa40d2..a4784466e27 100644 --- a/docker/test/unit/run.sh +++ b/docker/test/unit/run.sh @@ -3,5 +3,5 @@ set -x service zookeeper start && sleep 7 && /usr/share/zookeeper/bin/zkCli.sh -server localhost:2181 -create create /clickhouse_test ''; -gdb -q -ex 'set print inferior-events off' -ex 'set confirm off' -ex 'set print thread-events off' -ex run -ex bt -ex quit --args ./unit_tests_dbms | tee test_output/test_result.txt +timeout 40m gdb -q -ex 'set print inferior-events off' -ex 'set confirm off' -ex 'set print thread-events off' -ex run -ex bt -ex quit --args ./unit_tests_dbms | tee test_output/test_result.txt ./process_unit_tests_result.py || echo -e "failure\tCannot parse results" > /test_output/check_status.tsv From 6b2c906dfd57a895b93605a4bfb07fda7a72945d Mon Sep 17 00:00:00 2001 From: DanRoscigno Date: Thu, 1 Jun 2023 19:03:35 -0400 Subject: [PATCH 0258/1072] add docs for boundingRatio --- .../aggregate-functions/reference/boundrat.md | 44 ++++++ .../aggregate-functions/reference/index.md | 131 +++++++++--------- 2 files changed, 110 insertions(+), 65 deletions(-) create mode 100644 docs/en/sql-reference/aggregate-functions/reference/boundrat.md diff --git a/docs/en/sql-reference/aggregate-functions/reference/boundrat.md b/docs/en/sql-reference/aggregate-functions/reference/boundrat.md new file mode 100644 index 00000000000..f3907af8030 --- /dev/null +++ b/docs/en/sql-reference/aggregate-functions/reference/boundrat.md @@ -0,0 +1,44 @@ +--- +slug: /en/sql-reference/aggregate-functions/reference/boundingRatio +sidebar_position: 2 +title: boundingRatio +--- + +Aggregate function that calculates the slope between the leftmost and rightmost points across a group of values. + +Example: + +Sample data: +```sql +SELECT + number, + number * 1.5 +FROM numbers(10) +``` +```response +┌─number─┬─multiply(number, 1.5)─┐ +│ 0 │ 0 │ +│ 1 │ 1.5 │ +│ 2 │ 3 │ +│ 3 │ 4.5 │ +│ 4 │ 6 │ +│ 5 │ 7.5 │ +│ 6 │ 9 │ +│ 7 │ 10.5 │ +│ 8 │ 12 │ +│ 9 │ 13.5 │ +└────────┴───────────────────────┘ +``` + +The boundingRatio() function returns the slope of the line between the leftmost and rightmost points, in the above data these points are `(0,0)` and `(9,13.5)`. + +```sql +SELECT boundingRatio(number, number * 1.5) +FROM numbers(10) +``` +```response +┌─boundingRatio(number, multiply(number, 1.5))─┐ +│ 1.5 │ +└──────────────────────────────────────────────┘ +``` + diff --git a/docs/en/sql-reference/aggregate-functions/reference/index.md b/docs/en/sql-reference/aggregate-functions/reference/index.md index 50208352f38..17ef494e9ad 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/index.md +++ b/docs/en/sql-reference/aggregate-functions/reference/index.md @@ -9,74 +9,75 @@ toc_hidden: true Standard aggregate functions: -- [count](../../../sql-reference/aggregate-functions/reference/count.md) -- [min](../../../sql-reference/aggregate-functions/reference/min.md) -- [max](../../../sql-reference/aggregate-functions/reference/max.md) -- [sum](../../../sql-reference/aggregate-functions/reference/sum.md) -- [avg](../../../sql-reference/aggregate-functions/reference/avg.md) -- [any](../../../sql-reference/aggregate-functions/reference/any.md) -- [stddevPop](../../../sql-reference/aggregate-functions/reference/stddevpop.md) -- [stddevSamp](../../../sql-reference/aggregate-functions/reference/stddevsamp.md) -- [varPop](../../../sql-reference/aggregate-functions/reference/varpop.md) -- [varSamp](../../../sql-reference/aggregate-functions/reference/varsamp.md) -- [covarPop](../../../sql-reference/aggregate-functions/reference/covarpop.md) -- [covarSamp](../../../sql-reference/aggregate-functions/reference/covarsamp.md) +- [count](/docs/en/sql-reference/aggregate-functions/reference/count.md) +- [min](/docs/en/sql-reference/aggregate-functions/reference/min.md) +- [max](/docs/en/sql-reference/aggregate-functions/reference/max.md) +- [sum](/docs/en/sql-reference/aggregate-functions/reference/sum.md) +- [avg](/docs/en/sql-reference/aggregate-functions/reference/avg.md) +- [any](/docs/en/sql-reference/aggregate-functions/reference/any.md) +- [stddevPop](/docs/en/sql-reference/aggregate-functions/reference/stddevpop.md) +- [stddevSamp](/docs/en/sql-reference/aggregate-functions/reference/stddevsamp.md) +- [varPop](/docs/en/sql-reference/aggregate-functions/reference/varpop.md) +- [varSamp](/docs/en/sql-reference/aggregate-functions/reference/varsamp.md) +- [covarPop](/docs/en/sql-reference/aggregate-functions/reference/covarpop.md) +- [covarSamp](/docs/en/sql-reference/aggregate-functions/reference/covarsamp.md) ClickHouse-specific aggregate functions: -- [anyHeavy](../../../sql-reference/aggregate-functions/reference/anyheavy.md) -- [anyLast](../../../sql-reference/aggregate-functions/reference/anylast.md) -- [first_value](../../../sql-reference/aggregate-functions/reference/first_value.md) -- [last_value](../../../sql-reference/aggregate-functions/reference/last_value.md) -- [argMin](../../../sql-reference/aggregate-functions/reference/argmin.md) -- [argMax](../../../sql-reference/aggregate-functions/reference/argmax.md) -- [avgWeighted](../../../sql-reference/aggregate-functions/reference/avgweighted.md) -- [topK](../../../sql-reference/aggregate-functions/reference/topk.md) -- [topKWeighted](../../../sql-reference/aggregate-functions/reference/topkweighted.md) -- [groupArray](../../../sql-reference/aggregate-functions/reference/grouparray.md) -- [groupArrayLast](../../../sql-reference/aggregate-functions/reference/grouparraylast.md) -- [groupUniqArray](../../../sql-reference/aggregate-functions/reference/groupuniqarray.md) -- [groupArrayInsertAt](../../../sql-reference/aggregate-functions/reference/grouparrayinsertat.md) -- [groupArrayMovingAvg](../../../sql-reference/aggregate-functions/reference/grouparraymovingavg.md) -- [groupArrayMovingSum](../../../sql-reference/aggregate-functions/reference/grouparraymovingsum.md) -- [groupBitAnd](../../../sql-reference/aggregate-functions/reference/groupbitand.md) -- [groupBitOr](../../../sql-reference/aggregate-functions/reference/groupbitor.md) -- [groupBitXor](../../../sql-reference/aggregate-functions/reference/groupbitxor.md) -- [groupBitmap](../../../sql-reference/aggregate-functions/reference/groupbitmap.md) -- [groupBitmapAnd](../../../sql-reference/aggregate-functions/reference/groupbitmapand.md) -- [groupBitmapOr](../../../sql-reference/aggregate-functions/reference/groupbitmapor.md) -- [groupBitmapXor](../../../sql-reference/aggregate-functions/reference/groupbitmapxor.md) -- [sumWithOverflow](../../../sql-reference/aggregate-functions/reference/sumwithoverflow.md) -- [sumMap](../../../sql-reference/aggregate-functions/reference/summap.md) -- [minMap](../../../sql-reference/aggregate-functions/reference/minmap.md) -- [maxMap](../../../sql-reference/aggregate-functions/reference/maxmap.md) -- [skewSamp](../../../sql-reference/aggregate-functions/reference/skewsamp.md) -- [skewPop](../../../sql-reference/aggregate-functions/reference/skewpop.md) -- [kurtSamp](../../../sql-reference/aggregate-functions/reference/kurtsamp.md) -- [kurtPop](../../../sql-reference/aggregate-functions/reference/kurtpop.md) -- [uniq](../../../sql-reference/aggregate-functions/reference/uniq.md) -- [uniqExact](../../../sql-reference/aggregate-functions/reference/uniqexact.md) -- [uniqCombined](../../../sql-reference/aggregate-functions/reference/uniqcombined.md) -- [uniqCombined64](../../../sql-reference/aggregate-functions/reference/uniqcombined64.md) -- [uniqHLL12](../../../sql-reference/aggregate-functions/reference/uniqhll12.md) -- [uniqTheta](../../../sql-reference/aggregate-functions/reference/uniqthetasketch.md) -- [quantile](../../../sql-reference/aggregate-functions/reference/quantile.md) -- [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md) -- [quantileExact](../../../sql-reference/aggregate-functions/reference/quantileexact.md) -- [quantileExactLow](../../../sql-reference/aggregate-functions/reference/quantileexact.md#quantileexactlow) -- [quantileExactHigh](../../../sql-reference/aggregate-functions/reference/quantileexact.md#quantileexacthigh) -- [quantileExactWeighted](../../../sql-reference/aggregate-functions/reference/quantileexactweighted.md) -- [quantileTiming](../../../sql-reference/aggregate-functions/reference/quantiletiming.md) -- [quantileTimingWeighted](../../../sql-reference/aggregate-functions/reference/quantiletimingweighted.md) -- [quantileDeterministic](../../../sql-reference/aggregate-functions/reference/quantiledeterministic.md) -- [quantileTDigest](../../../sql-reference/aggregate-functions/reference/quantiletdigest.md) -- [quantileTDigestWeighted](../../../sql-reference/aggregate-functions/reference/quantiletdigestweighted.md) -- [quantileBFloat16](../../../sql-reference/aggregate-functions/reference/quantilebfloat16.md#quantilebfloat16) -- [quantileBFloat16Weighted](../../../sql-reference/aggregate-functions/reference/quantilebfloat16.md#quantilebfloat16weighted) -- [simpleLinearRegression](../../../sql-reference/aggregate-functions/reference/simplelinearregression.md) -- [stochasticLinearRegression](../../../sql-reference/aggregate-functions/reference/stochasticlinearregression.md) -- [stochasticLogisticRegression](../../../sql-reference/aggregate-functions/reference/stochasticlogisticregression.md) -- [categoricalInformationValue](../../../sql-reference/aggregate-functions/reference/categoricalinformationvalue.md) +- [anyHeavy](/docs/en/sql-reference/aggregate-functions/reference/anyheavy.md) +- [anyLast](/docs/en/sql-reference/aggregate-functions/reference/anylast.md) +- [boundingRatio](/docs/en/sql-reference/aggregate-functions/reference/boundrat.md) +- [first_value](/docs/en/sql-reference/aggregate-functions/reference/first_value.md) +- [last_value](/docs/en/sql-reference/aggregate-functions/reference/last_value.md) +- [argMin](/docs/en/sql-reference/aggregate-functions/reference/argmin.md) +- [argMax](/docs/en/sql-reference/aggregate-functions/reference/argmax.md) +- [avgWeighted](/docs/en/sql-reference/aggregate-functions/reference/avgweighted.md) +- [topK](/docs/en/sql-reference/aggregate-functions/reference/topk.md) +- [topKWeighted](/docs/en/sql-reference/aggregate-functions/reference/topkweighted.md) +- [groupArray](/docs/en/sql-reference/aggregate-functions/reference/grouparray.md) +- [groupArrayLast](/docs/en/sql-reference/aggregate-functions/reference/grouparraylast.md) +- [groupUniqArray](/docs/en/sql-reference/aggregate-functions/reference/groupuniqarray.md) +- [groupArrayInsertAt](/docs/en/sql-reference/aggregate-functions/reference/grouparrayinsertat.md) +- [groupArrayMovingAvg](/docs/en/sql-reference/aggregate-functions/reference/grouparraymovingavg.md) +- [groupArrayMovingSum](/docs/en/sql-reference/aggregate-functions/reference/grouparraymovingsum.md) +- [groupBitAnd](/docs/en/sql-reference/aggregate-functions/reference/groupbitand.md) +- [groupBitOr](/docs/en/sql-reference/aggregate-functions/reference/groupbitor.md) +- [groupBitXor](/docs/en/sql-reference/aggregate-functions/reference/groupbitxor.md) +- [groupBitmap](/docs/en/sql-reference/aggregate-functions/reference/groupbitmap.md) +- [groupBitmapAnd](/docs/en/sql-reference/aggregate-functions/reference/groupbitmapand.md) +- [groupBitmapOr](/docs/en/sql-reference/aggregate-functions/reference/groupbitmapor.md) +- [groupBitmapXor](/docs/en/sql-reference/aggregate-functions/reference/groupbitmapxor.md) +- [sumWithOverflow](/docs/en/sql-reference/aggregate-functions/reference/sumwithoverflow.md) +- [sumMap](/docs/en/sql-reference/aggregate-functions/reference/summap.md) +- [minMap](/docs/en/sql-reference/aggregate-functions/reference/minmap.md) +- [maxMap](/docs/en/sql-reference/aggregate-functions/reference/maxmap.md) +- [skewSamp](/docs/en/sql-reference/aggregate-functions/reference/skewsamp.md) +- [skewPop](/docs/en/sql-reference/aggregate-functions/reference/skewpop.md) +- [kurtSamp](/docs/en/sql-reference/aggregate-functions/reference/kurtsamp.md) +- [kurtPop](/docs/en/sql-reference/aggregate-functions/reference/kurtpop.md) +- [uniq](/docs/en/sql-reference/aggregate-functions/reference/uniq.md) +- [uniqExact](/docs/en/sql-reference/aggregate-functions/reference/uniqexact.md) +- [uniqCombined](/docs/en/sql-reference/aggregate-functions/reference/uniqcombined.md) +- [uniqCombined64](/docs/en/sql-reference/aggregate-functions/reference/uniqcombined64.md) +- [uniqHLL12](/docs/en/sql-reference/aggregate-functions/reference/uniqhll12.md) +- [uniqTheta](/docs/en/sql-reference/aggregate-functions/reference/uniqthetasketch.md) +- [quantile](/docs/en/sql-reference/aggregate-functions/reference/quantile.md) +- [quantiles](/docs/en/sql-reference/aggregate-functions/reference/quantiles.md) +- [quantileExact](/docs/en/sql-reference/aggregate-functions/reference/quantileexact.md) +- [quantileExactLow](/docs/en/sql-reference/aggregate-functions/reference/quantileexact.md#quantileexactlow) +- [quantileExactHigh](/docs/en/sql-reference/aggregate-functions/reference/quantileexact.md#quantileexacthigh) +- [quantileExactWeighted](/docs/en/sql-reference/aggregate-functions/reference/quantileexactweighted.md) +- [quantileTiming](/docs/en/sql-reference/aggregate-functions/reference/quantiletiming.md) +- [quantileTimingWeighted](/docs/en/sql-reference/aggregate-functions/reference/quantiletimingweighted.md) +- [quantileDeterministic](/docs/en/sql-reference/aggregate-functions/reference/quantiledeterministic.md) +- [quantileTDigest](/docs/en/sql-reference/aggregate-functions/reference/quantiletdigest.md) +- [quantileTDigestWeighted](/docs/en/sql-reference/aggregate-functions/reference/quantiletdigestweighted.md) +- [quantileBFloat16](/docs/en/sql-reference/aggregate-functions/reference/quantilebfloat16.md#quantilebfloat16) +- [quantileBFloat16Weighted](/docs/en/sql-reference/aggregate-functions/reference/quantilebfloat16.md#quantilebfloat16weighted) +- [simpleLinearRegression](/docs/en/sql-reference/aggregate-functions/reference/simplelinearregression.md) +- [stochasticLinearRegression](/docs/en/sql-reference/aggregate-functions/reference/stochasticlinearregression.md) +- [stochasticLogisticRegression](/docs/en/sql-reference/aggregate-functions/reference/stochasticlogisticregression.md) +- [categoricalInformationValue](/docs/en/sql-reference/aggregate-functions/reference/categoricalinformationvalue.md) - [contingency](./contingency.md) - [cramersV](./cramersv.md) - [cramersVBiasCorrected](./cramersvbiascorrected.md) From e7868e576c71de07d7dd4d921382d4cd549d6493 Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Fri, 2 Jun 2023 00:50:14 +0000 Subject: [PATCH 0259/1072] Don't crash if config doesn't have logger section --- src/Loggers/Loggers.cpp | 2 +- src/Loggers/Loggers.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Loggers/Loggers.cpp b/src/Loggers/Loggers.cpp index 645ae5dcc7a..0c3a7bd615d 100644 --- a/src/Loggers/Loggers.cpp +++ b/src/Loggers/Loggers.cpp @@ -51,7 +51,7 @@ void Loggers::buildLoggers(Poco::Util::AbstractConfiguration & config, Poco::Log #endif auto current_logger = config.getString("logger", ""); - if (config_logger == current_logger) + if (config_logger.has_value() && *config_logger == current_logger) return; config_logger = current_logger; diff --git a/src/Loggers/Loggers.h b/src/Loggers/Loggers.h index 31a215aa9ce..ebc10954b94 100644 --- a/src/Loggers/Loggers.h +++ b/src/Loggers/Loggers.h @@ -39,7 +39,7 @@ private: Poco::AutoPtr syslog_channel; /// Previous value of logger element in config. It is used to reinitialize loggers whenever the value changed. - std::string config_logger; + std::optional config_logger; #ifndef WITHOUT_TEXT_LOG std::weak_ptr text_log; From c5165c2236bfceed4ec9246d6964fd2f77f68a07 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 2 Jun 2023 03:25:12 +0200 Subject: [PATCH 0260/1072] More thorough check for the availability of TaskStats --- src/Common/TaskStatsInfoGetter.cpp | 5 ++++- src/Common/ThreadStatus.h | 1 - 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/Common/TaskStatsInfoGetter.cpp b/src/Common/TaskStatsInfoGetter.cpp index 25030ee9670..d21229609dd 100644 --- a/src/Common/TaskStatsInfoGetter.cpp +++ b/src/Common/TaskStatsInfoGetter.cpp @@ -9,6 +9,7 @@ #include "hasLinuxCapability.h" #include +#include #include #include @@ -202,10 +203,12 @@ bool checkPermissionsImpl() /// Check that we can successfully initialize TaskStatsInfoGetter. /// It will ask about family id through Netlink. /// On some LXC containers we have capability but we still cannot use Netlink. + /// There is an evidence that Linux fedora-riscv 6.1.22 gives something strange instead of the expected result. try { - TaskStatsInfoGetter(); + ::taskstats stats{}; + TaskStatsInfoGetter().getStat(stats, static_cast(getThreadId())); } catch (const Exception & e) { diff --git a/src/Common/ThreadStatus.h b/src/Common/ThreadStatus.h index 600dfc56d2b..061959d9f1f 100644 --- a/src/Common/ThreadStatus.h +++ b/src/Common/ThreadStatus.h @@ -37,7 +37,6 @@ class QueryThreadLog; class TasksStatsCounters; struct RUsageCounters; struct PerfEventsCounters; -class TaskStatsInfoGetter; class InternalTextLogsQueue; struct ViewRuntimeData; class QueryViewsLog; From b51064a5081ed9339e30650ccd6466262b258522 Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Fri, 5 May 2023 03:11:51 +0000 Subject: [PATCH 0261/1072] Get rid of SeekableReadBufferFactory, add SeekableReadBuffer::readBigAt() instead --- src/Core/Settings.h | 1 - src/Formats/FormatFactory.cpp | 102 ++----- src/Formats/FormatFactory.h | 56 +--- src/Formats/FormatSettings.h | 2 - src/IO/MMapReadBufferFromFileDescriptor.cpp | 11 + src/IO/MMapReadBufferFromFileDescriptor.h | 3 + src/IO/ParallelReadBuffer.cpp | 258 ++++++++---------- src/IO/ParallelReadBuffer.h | 45 +-- src/IO/ReadBufferFromFileDescriptor.cpp | 48 +++- src/IO/ReadBufferFromFileDescriptor.h | 11 + src/IO/ReadBufferFromS3.cpp | 187 +++++++------ src/IO/ReadBufferFromS3.h | 49 +--- src/IO/ReadWriteBufferFromHTTP.cpp | 209 +++++++------- src/IO/ReadWriteBufferFromHTTP.h | 61 +---- src/IO/SeekableReadBuffer.cpp | 46 ++++ src/IO/SeekableReadBuffer.h | 52 ++-- src/IO/WithFileName.cpp | 2 +- src/IO/WithFileSize.cpp | 8 - .../Formats/Impl/ArrowBufferedStreams.cpp | 98 ++----- .../Formats/Impl/ArrowBufferedStreams.h | 19 +- .../Formats/Impl/ParquetBlockInputFormat.cpp | 24 +- .../Formats/Impl/ParquetBlockInputFormat.h | 6 +- src/Storages/StorageS3.cpp | 43 +-- src/Storages/StorageS3.h | 8 +- src/Storages/StorageURL.cpp | 78 +++--- src/Storages/StorageURL.h | 3 +- .../test_redirect_url_storage/test.py | 4 +- 27 files changed, 658 insertions(+), 776 deletions(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 566c9f32484..2747094451d 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -832,7 +832,6 @@ class IColumn; M(Bool, input_format_orc_case_insensitive_column_matching, false, "Ignore case when matching ORC columns with CH columns.", 0) \ M(Bool, input_format_parquet_import_nested, false, "Allow to insert array of structs into Nested table in Parquet input format.", 0) \ M(Bool, input_format_parquet_case_insensitive_column_matching, false, "Ignore case when matching Parquet columns with CH columns.", 0) \ - /* TODO: Consider unifying this with https://github.com/ClickHouse/ClickHouse/issues/38755 */ \ M(Bool, input_format_parquet_preserve_order, false, "Avoid reordering rows when reading from Parquet files. Usually makes it much slower.", 0) \ M(Bool, input_format_allow_seeks, true, "Allow seeks while reading in ORC/Parquet/Arrow input formats", 0) \ M(Bool, input_format_orc_allow_missing_columns, false, "Allow missing columns while reading ORC input formats", 0) \ diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index dd6252b96f1..586e1bb7251 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -221,68 +221,16 @@ template FormatSettings getFormatSettings(ContextPtr context, const Se InputFormatPtr FormatFactory::getInput( const String & name, - ReadBuffer & buf, + ReadBuffer & _buf, const Block & sample, ContextPtr context, UInt64 max_block_size, - const std::optional & format_settings, - std::optional max_parsing_threads) const -{ - return getInputImpl( - name, - nullptr, - &buf, - sample, - context, - max_block_size, - /* is_remote_fs */ false, - CompressionMethod::None, - format_settings, - /* max_download_threads */ 1, - max_parsing_threads); -} - -InputFormatPtr FormatFactory::getInputRandomAccess( - const String & name, - SeekableReadBufferFactoryPtr buf_factory, - const Block & sample, - ContextPtr context, - UInt64 max_block_size, - bool is_remote_fs, - CompressionMethod compression, - const std::optional & format_settings, - std::optional max_download_threads, - std::optional max_parsing_threads) const -{ - return getInputImpl( - name, - std::move(buf_factory), - nullptr, - sample, - context, - max_block_size, - is_remote_fs, - compression, - format_settings, - max_download_threads, - max_parsing_threads); -} - -InputFormatPtr FormatFactory::getInputImpl( - const String & name, - // exactly one of the following two is nullptr - SeekableReadBufferFactoryPtr buf_factory, - ReadBuffer * _buf, - const Block & sample, - ContextPtr context, - UInt64 max_block_size, - bool is_remote_fs, - CompressionMethod compression, const std::optional & _format_settings, + std::optional _max_parsing_threads, std::optional _max_download_threads, - std::optional _max_parsing_threads) const + bool is_remote_fs, + CompressionMethod compression) const { - chassert((!_buf) != (!buf_factory)); const auto& creators = getCreators(name); if (!creators.input_creator && !creators.random_access_input_creator) throw Exception(ErrorCodes::FORMAT_IS_NOT_SUITABLE_FOR_INPUT, "Format {} is not suitable for input", name); @@ -302,14 +250,12 @@ InputFormatPtr FormatFactory::getInputImpl( if (context->hasQueryContext() && settings.log_queries) context->getQueryContext()->addQueryFactoriesInfo(Context::QueryLogFactories::Format, name); - // Prepare a read buffer. + // Add ParallelReadBuffer and decompression if needed. - std::unique_ptr owned_buf; - if (buf_factory) - owned_buf = prepareReadBuffer(buf_factory, compression, creators, format_settings, settings, max_download_threads); - auto * buf = owned_buf ? owned_buf.get() : _buf; + auto owned_buf = wrapReadBufferIfNeeded(_buf, compression, creators, format_settings, settings, is_remote_fs, max_download_threads); + auto & buf = owned_buf ? *owned_buf : _buf; - // Decide whether to use parallel ParallelParsingInputFormat. + // Decide whether to use ParallelParsingInputFormat. bool parallel_parsing = max_parsing_threads > 1 && settings.input_format_parallel_parsing && creators.file_segmentation_engine && !creators.random_access_input_creator; @@ -322,7 +268,7 @@ InputFormatPtr FormatFactory::getInputImpl( { const auto & non_trivial_prefix_and_suffix_checker = creators.non_trivial_prefix_and_suffix_checker; /// Disable parallel parsing for input formats with non-trivial readPrefix() and readSuffix(). - if (non_trivial_prefix_and_suffix_checker && non_trivial_prefix_and_suffix_checker(*buf)) + if (non_trivial_prefix_and_suffix_checker && non_trivial_prefix_and_suffix_checker(buf)) parallel_parsing = false; } @@ -340,7 +286,7 @@ InputFormatPtr FormatFactory::getInputImpl( { return input_getter(input, sample, row_input_format_params, format_settings); }; ParallelParsingInputFormat::Params params{ - *buf, sample, parser_creator, creators.file_segmentation_engine, name, max_parsing_threads, + buf, sample, parser_creator, creators.file_segmentation_engine, name, max_parsing_threads, settings.min_chunk_bytes_for_parallel_parsing, max_block_size, context->getApplicationType() == Context::ApplicationType::SERVER}; format = std::make_shared(params); @@ -349,7 +295,6 @@ InputFormatPtr FormatFactory::getInputImpl( { format = creators.random_access_input_creator( buf, - std::move(buf_factory), sample, format_settings, context->getReadSettings(), @@ -359,7 +304,7 @@ InputFormatPtr FormatFactory::getInputImpl( } else { - format = creators.input_creator(*buf, sample, row_input_format_params, format_settings); + format = creators.input_creator(buf, sample, row_input_format_params, format_settings); } if (owned_buf) @@ -375,26 +320,28 @@ InputFormatPtr FormatFactory::getInputImpl( return format; } -std::unique_ptr FormatFactory::prepareReadBuffer( - SeekableReadBufferFactoryPtr & buf_factory, +std::unique_ptr FormatFactory::wrapReadBufferIfNeeded( + ReadBuffer & buf, CompressionMethod compression, const Creators & creators, const FormatSettings & format_settings, const Settings & settings, + bool is_remote_fs, size_t max_download_threads) const { std::unique_ptr res; - bool parallel_read = max_download_threads > 1 && buf_factory && format_settings.seekable_read; + bool parallel_read = is_remote_fs && max_download_threads > 1 && format_settings.seekable_read && isBufferWithFileSize(buf); if (creators.random_access_input_creator) parallel_read &= compression != CompressionMethod::None; + size_t file_size = 0; if (parallel_read) { try { - parallel_read = buf_factory->checkIfActuallySeekable() - && buf_factory->getFileSize() >= 2 * settings.max_download_buffer_size; + file_size = getFileSizeFromReadBuffer(buf); + parallel_read = file_size >= 2 * settings.max_download_buffer_size; } catch (const Poco::Exception & e) { @@ -415,23 +362,18 @@ std::unique_ptr FormatFactory::prepareReadBuffer( max_download_threads, settings.max_download_buffer_size); - res = std::make_unique( - std::move(buf_factory), - threadPoolCallbackRunner(IOThreadPool::get(), "ParallelRead"), - max_download_threads, - settings.max_download_buffer_size); + res = wrapInParallelReadBufferIfSupported( + buf, threadPoolCallbackRunner(IOThreadPool::get(), "ParallelRead"), + max_download_threads, settings.max_download_buffer_size, file_size); } if (compression != CompressionMethod::None) { if (!res) - res = buf_factory->getReader(); // NOLINT + res = wrapReadBufferReference(buf); res = wrapReadBufferWithCompressionMethod(std::move(res), compression, static_cast(settings.zstd_window_log_max)); } - if (!creators.random_access_input_creator && !res) - res = buf_factory->getReader(); - return res; } diff --git a/src/Formats/FormatFactory.h b/src/Formats/FormatFactory.h index 677e34845d8..1d258beca8d 100644 --- a/src/Formats/FormatFactory.h +++ b/src/Formats/FormatFactory.h @@ -90,15 +90,11 @@ private: const FormatSettings & settings)>; // Incompatible with FileSegmentationEngine. - // When created using SeekableReadBufferFactoryPtr, the IInputFormat doesn't support - // resetParser() and setReadBuffer(). // // In future we may also want to pass some information about WHERE conditions (SelectQueryInfo?) // and get some information about projections (min/max/count per column per row group). using RandomAccessInputCreator = std::function & format_settings = std::nullopt, - std::optional max_parsing_threads = std::nullopt) const; - - // Format parser from a random-access source (factory of seekable read buffers). - // Parallelizes both parsing and reading when possible. - // Prefer this over getInput() when reading from random-access source like file or HTTP. - InputFormatPtr getInputRandomAccess( - const String & name, - SeekableReadBufferFactoryPtr buf_factory, - const Block & sample, - ContextPtr context, - UInt64 max_block_size, - bool is_remote_fs, - CompressionMethod compression, - // if nullopt, getFormatSettings(context) is used - const std::optional & format_settings = std::nullopt, + std::optional max_parsing_threads = std::nullopt, std::optional max_download_threads = std::nullopt, - std::optional max_parsing_threads = std::nullopt) const; + // affects things like buffer sizes and parallel reading + bool is_remote_fs = false, + // allows to do: buf -> parallel read -> decompression, + // because parallel read after decompression is not possible + CompressionMethod compression = CompressionMethod::None) const; /// Checks all preconditions. Returns ordinary format if parallel formatting cannot be done. OutputFormatPtr getOutputFormatParallelIfPossible( @@ -272,28 +260,14 @@ private: const Creators & getCreators(const String & name) const; - InputFormatPtr getInputImpl( - const String & name, - // exactly one of the following two is nullptr - SeekableReadBufferFactoryPtr buf_factory, - ReadBuffer * buf, - const Block & sample, - ContextPtr context, - UInt64 max_block_size, - bool is_remote_fs, - CompressionMethod compression, - const std::optional & format_settings, - std::optional max_download_threads, - std::optional max_parsing_threads) const; - - // Creates a ReadBuffer to give to an input format. - // Returns nullptr if we should give it the whole factory. - std::unique_ptr prepareReadBuffer( - SeekableReadBufferFactoryPtr & buf_factory, + // Creates a ReadBuffer to give to an input format. Returns nullptr if we should use `buf` directly. + std::unique_ptr wrapReadBufferIfNeeded( + ReadBuffer & buf, CompressionMethod compression, const Creators & creators, const FormatSettings & format_settings, const Settings & settings, + bool is_remote_fs, size_t max_download_threads) const; }; diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index c88af650671..e332bd749a1 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -214,8 +214,6 @@ struct FormatSettings std::unordered_set skip_row_groups = {}; bool output_string_as_string = false; bool output_fixed_string_as_fixed_byte_array = true; - // TODO: This should probably be shared among all formats and with - // https://github.com/ClickHouse/ClickHouse/issues/38755 bool preserve_order = false; UInt64 max_block_size = 8192; ParquetVersion output_version; diff --git a/src/IO/MMapReadBufferFromFileDescriptor.cpp b/src/IO/MMapReadBufferFromFileDescriptor.cpp index c0eb73f8638..9b1c132cc01 100644 --- a/src/IO/MMapReadBufferFromFileDescriptor.cpp +++ b/src/IO/MMapReadBufferFromFileDescriptor.cpp @@ -91,4 +91,15 @@ size_t MMapReadBufferFromFileDescriptor::getFileSize() { return getSizeFromFileDescriptor(getFD(), getFileName()); } + +size_t MMapReadBufferFromFileDescriptor::readBigAt(char * to, size_t n, size_t offset, const std::function &) +{ + if (offset >= mapped.getLength()) + return 0; + + n = std::min(n, mapped.getLength() - offset); + memcpy(to, mapped.getData() + offset, n); + return n; +} + } diff --git a/src/IO/MMapReadBufferFromFileDescriptor.h b/src/IO/MMapReadBufferFromFileDescriptor.h index 1a4bcd4f3ed..2a039e04971 100644 --- a/src/IO/MMapReadBufferFromFileDescriptor.h +++ b/src/IO/MMapReadBufferFromFileDescriptor.h @@ -39,6 +39,9 @@ public: int getFD() const; size_t getFileSize() override; + + size_t readBigAt(char * to, size_t n, size_t offset, const std::function &) override; + bool supportsReadAt() override { return true; } }; } diff --git a/src/IO/ParallelReadBuffer.cpp b/src/IO/ParallelReadBuffer.cpp index fff02db1bd6..07240ab3a4f 100644 --- a/src/IO/ParallelReadBuffer.cpp +++ b/src/IO/ParallelReadBuffer.cpp @@ -1,4 +1,5 @@ #include +#include #include #include @@ -13,51 +14,44 @@ namespace ErrorCodes } -// A subrange of the input, read by one SeekableReadBuffer. +// A subrange of the input, read by one thread. struct ParallelReadBuffer::ReadWorker { - ReadWorker(std::unique_ptr reader_, size_t offset_, size_t size) - : reader(std::move(reader_)), offset(offset_), bytes_left(size), range_end(offset + bytes_left) + ReadWorker(SeekableReadBuffer & input_, size_t offset, size_t size) + : input(input_), start_offset(offset), segment(size) { - assert(bytes_left); + chassert(size); + chassert(segment.size() == size); } - auto hasSegment() const { return current_segment_index < segments.size(); } + bool hasBytesToConsume() const { return bytes_produced > bytes_consumed; } + bool hasBytesToProduce() const { return bytes_produced < segment.size(); } - auto nextSegment() - { - assert(hasSegment()); - auto next_segment = std::move(segments[current_segment_index]); - ++current_segment_index; - offset += next_segment.size(); - return next_segment; - } + SeekableReadBuffer & input; + const size_t start_offset; // start of the segment - std::unique_ptr reader; - // Reader thread produces segments, nextImpl() consumes them. - std::vector> segments; // segments that were produced - size_t current_segment_index = 0; // first segment that's not consumed - bool finished{false}; // no more segments will be produced - size_t offset; // start of segments[current_segment_idx] - size_t bytes_left; // bytes left to produce above segments end - size_t range_end; // segments end + bytes_left, i.e. how far this worker will read - - // segments[current_segment_idx..end] range_end - // |-------------|--------------------------------------|------------| - // offset bytes_left + Memory<> segment; + /// Reader thread produces data, nextImpl() consumes it. + /// segment[bytes_consumed..bytes_produced-1] is data waiting to be picked up by nextImpl() + /// segment[bytes_produced..] needs to be read from the input ReadBuffer + size_t bytes_produced = 0; + size_t bytes_consumed = 0; std::atomic_bool cancel{false}; std::mutex worker_mutex; }; ParallelReadBuffer::ParallelReadBuffer( - std::unique_ptr reader_factory_, ThreadPoolCallbackRunner schedule_, size_t max_working_readers_, size_t range_step_) + SeekableReadBuffer & input_, ThreadPoolCallbackRunner schedule_, size_t max_working_readers_, size_t range_step_, size_t file_size_) : SeekableReadBuffer(nullptr, 0) , max_working_readers(max_working_readers_) , schedule(std::move(schedule_)) - , reader_factory(std::move(reader_factory_)) + , input(input_) + , file_size(file_size_) , range_step(std::max(1ul, range_step_)) { + LOG_TRACE(&Poco::Logger::get("ParallelReadBuffer"), "Parallel reading is used"); + try { addReaders(); @@ -71,22 +65,15 @@ ParallelReadBuffer::ParallelReadBuffer( bool ParallelReadBuffer::addReaderToPool() { - size_t file_size = reader_factory->getFileSize(); if (next_range_start >= file_size) return false; size_t range_start = next_range_start; size_t size = std::min(range_step, file_size - range_start); next_range_start += size; - auto reader = reader_factory->getReader(); - if (!reader) - { - return false; - } + auto worker = read_workers.emplace_back(std::make_shared(input, range_start, size)); - auto worker = read_workers.emplace_back(std::make_shared(std::move(reader), range_start, size)); - - ++active_working_reader; + ++active_working_readers; schedule([this, my_worker = std::move(worker)]() mutable { readerThreadFunction(std::move(my_worker)); }, Priority{}); return true; @@ -116,9 +103,9 @@ off_t ParallelReadBuffer::seek(off_t offset, int whence) } const auto offset_is_in_range - = [&](const auto & worker) { return static_cast(offset) >= worker->offset && static_cast(offset) < worker->range_end; }; + = [&](const auto & worker) { return static_cast(offset) >= worker->start_offset && static_cast(offset) < worker->start_offset + worker->segment.size(); }; - while (!read_workers.empty() && (offset < current_position || !offset_is_in_range(read_workers.front()))) + while (!read_workers.empty() && !offset_is_in_range(read_workers.front())) { read_workers.front()->cancel = true; read_workers.pop_front(); @@ -126,32 +113,31 @@ off_t ParallelReadBuffer::seek(off_t offset, int whence) if (!read_workers.empty()) { - auto & front_worker = read_workers.front(); - current_position = front_worker->offset; + auto & w = read_workers.front(); + size_t diff = static_cast(offset) - w->start_offset; while (true) { - std::unique_lock lock{front_worker->worker_mutex}; - next_condvar.wait(lock, [&] { return emergency_stop || front_worker->hasSegment(); }); + std::unique_lock lock{w->worker_mutex}; if (emergency_stop) handleEmergencyStop(); - auto next_segment = front_worker->nextSegment(); - current_position += next_segment.size(); - if (offset < current_position) + if (w->bytes_produced > diff) { - current_segment = std::move(next_segment); - working_buffer = internal_buffer = Buffer(current_segment.data(), current_segment.data() + current_segment.size()); - pos = working_buffer.end() - (current_position - offset); + working_buffer = internal_buffer = Buffer( + w->segment.data() + diff, w->segment.data() + w->bytes_produced); + w->bytes_consumed = w->bytes_produced; + current_position += w->start_offset + w->bytes_consumed; addReaders(); return offset; } + + next_condvar.wait_for(lock, std::chrono::seconds(10)); } } finishAndWait(); - all_completed = false; read_workers.clear(); next_range_start = offset; @@ -166,7 +152,7 @@ off_t ParallelReadBuffer::seek(off_t offset, int whence) size_t ParallelReadBuffer::getFileSize() { - return reader_factory->getFileSize(); + return file_size; } off_t ParallelReadBuffer::getPosition() @@ -174,17 +160,6 @@ off_t ParallelReadBuffer::getPosition() return current_position - available(); } -bool ParallelReadBuffer::currentWorkerReady() const -{ - assert(!read_workers.empty()); - return read_workers.front()->finished || read_workers.front()->hasSegment(); -} - -bool ParallelReadBuffer::currentWorkerCompleted() const -{ - return read_workers.front()->finished && !read_workers.front()->hasSegment(); -} - void ParallelReadBuffer::handleEmergencyStop() { // this can only be called from the main thread when there is an exception @@ -194,106 +169,99 @@ void ParallelReadBuffer::handleEmergencyStop() bool ParallelReadBuffer::nextImpl() { - if (all_completed) - return false; - while (true) { - std::unique_lock lock{read_workers.front()->worker_mutex}; - next_condvar.wait( - lock, - [this]() - { - /// Check if no more readers left or current reader can be processed - return emergency_stop || currentWorkerReady(); - }); - - bool worker_removed = false; - /// Remove completed units - while (currentWorkerCompleted() && !emergency_stop) - { - lock.unlock(); - read_workers.pop_front(); - worker_removed = true; - - if (read_workers.empty()) - break; - - lock = std::unique_lock{read_workers.front()->worker_mutex}; - } - - if (emergency_stop) - handleEmergencyStop(); - - if (worker_removed) - addReaders(); - /// All readers processed, stop if (read_workers.empty()) { - all_completed = true; + chassert(next_range_start >= file_size); return false; } - auto & front_worker = read_workers.front(); - /// Read data from first segment of the first reader - if (front_worker->hasSegment()) + auto * w = read_workers.front().get(); + + std::unique_lock lock{w->worker_mutex}; + + if (emergency_stop) + handleEmergencyStop(); // throws + + /// Read data from front reader + if (w->bytes_produced > w->bytes_consumed) { - current_segment = front_worker->nextSegment(); - if (currentWorkerCompleted()) - { - lock.unlock(); - read_workers.pop_front(); - all_completed = !addReaderToPool() && read_workers.empty(); - } - break; + chassert(w->start_offset + w->bytes_consumed == static_cast(current_position)); + + working_buffer = internal_buffer = Buffer( + w->segment.data() + w->bytes_consumed, w->segment.data() + w->bytes_produced); + current_position += working_buffer.size(); + w->bytes_consumed = w->bytes_produced; + + return true; } + + /// Front reader is done, remove it and add another + if (!w->hasBytesToProduce()) + { + lock.unlock(); + read_workers.pop_front(); + addReaders(); + + continue; + } + + /// Nothing to do right now, wait for something to change. + /// + /// The timeout is a workaround for a race condition. + /// emergency_stop is assigned while holding a *different* mutex from the one we're holding + /// (exception_mutex vs worker_mutex). So it's possible that our emergency_stop check (above) + /// happens before a onBackgroundException() call, but our wait(lock) happens after it. + /// Then the wait may get stuck forever. + /// + /// Note that using wait(lock, [&]{ return emergency_stop || ...; }) wouldn't help because + /// it does effectively the same "check, then wait" sequence. + /// + /// One possible proper fix would be to make onBackgroundException() lock all read_workers + /// mutexes too (not necessarily simultaneously - just locking+unlocking them one by one + /// between the emergency_stop change and the notify_all() would be enough), but then we + /// need another mutex to protect read_workers itself... + next_condvar.wait_for(lock, std::chrono::seconds(10)); } - working_buffer = internal_buffer = Buffer(current_segment.data(), current_segment.data() + current_segment.size()); - current_position += working_buffer.size(); - return true; + chassert(false); + return false; } void ParallelReadBuffer::readerThreadFunction(ReadWorkerPtr read_worker) { SCOPE_EXIT({ - if (active_working_reader.fetch_sub(1) == 1) - active_working_reader.notify_all(); + if (active_working_readers.fetch_sub(1) == 1) + active_working_readers.notify_all(); }); try { - read_worker->reader->setReadUntilPosition(read_worker->range_end); - read_worker->reader->seek(read_worker->offset, SEEK_SET); - - while (!emergency_stop && !read_worker->cancel) + auto on_progress = [&](size_t bytes_read) -> bool { - if (!read_worker->reader->next()) - throw Exception( - ErrorCodes::LOGICAL_ERROR, "Failed to read all the data from the reader, missing {} bytes", read_worker->bytes_left); - if (emergency_stop || read_worker->cancel) - break; + return true; - Buffer buffer = read_worker->reader->buffer(); - size_t bytes_to_copy = std::min(buffer.size(), read_worker->bytes_left); - Memory<> new_segment(bytes_to_copy); - memcpy(new_segment.data(), buffer.begin(), bytes_to_copy); - read_worker->reader->ignore(bytes_to_copy); - read_worker->bytes_left -= bytes_to_copy; - { - /// New data ready to be read - std::lock_guard lock(read_worker->worker_mutex); - read_worker->segments.emplace_back(std::move(new_segment)); - read_worker->finished = read_worker->bytes_left == 0; + std::lock_guard lock(read_worker->worker_mutex); + if (bytes_read <= read_worker->bytes_produced) + return false; + + bool need_notify = read_worker->bytes_produced == read_worker->bytes_consumed; + read_worker->bytes_produced = bytes_read; + if (need_notify) next_condvar.notify_all(); - } - if (read_worker->finished) - { - break; - } - } + return false; + }; + + size_t r = input.readBigAt(read_worker->segment.data(), read_worker->segment.size(), read_worker->start_offset); + + if (!on_progress(r) && r < read_worker->segment.size()) + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Failed to read all the data from the reader at offset {}, got {}/{} bytes", + read_worker->start_offset, r, read_worker->segment.size()); } catch (...) { @@ -315,12 +283,24 @@ void ParallelReadBuffer::finishAndWait() { emergency_stop = true; - size_t active_readers = active_working_reader.load(); + size_t active_readers = active_working_readers.load(); while (active_readers != 0) { - active_working_reader.wait(active_readers); - active_readers = active_working_reader.load(); + active_working_readers.wait(active_readers); + active_readers = active_working_readers.load(); } } +std::unique_ptr wrapInParallelReadBufferIfSupported( + ReadBuffer & buf, ThreadPoolCallbackRunner schedule, size_t max_working_readers, + size_t range_step, size_t file_size) +{ + auto * seekable = dynamic_cast(&buf); + if (!seekable || !seekable->supportsReadAt()) + return nullptr; + + return std::make_unique( + *seekable, schedule, max_working_readers, range_step, file_size); +} + } diff --git a/src/IO/ParallelReadBuffer.h b/src/IO/ParallelReadBuffer.h index 70f925f9735..e76b40f77b7 100644 --- a/src/IO/ParallelReadBuffer.h +++ b/src/IO/ParallelReadBuffer.h @@ -10,18 +10,17 @@ namespace DB { /** - * Reads from multiple ReadBuffers in parallel. - * Preserves order of readers obtained from SeekableReadBufferFactory. + * Reads from multiple positions in a ReadBuffer in parallel. + * Then reassembles the data into one stream in the original order. * - * It consumes multiple readers and yields data from them in order as it passed. - * Each working reader save segments of data to internal queue. + * Each working reader reads its segment of data into a buffer. * - * ParallelReadBuffer in nextImpl method take first available segment from first reader in deque and fed it to user. - * When first reader finish reading, they will be removed from worker deque and data from next reader consumed. + * ParallelReadBuffer in nextImpl method take first available segment from first reader in deque and reports it it to user. + * When first reader finishes reading, they will be removed from worker deque and data from next reader consumed. * * Number of working readers limited by max_working_readers. */ -class ParallelReadBuffer : public SeekableReadBuffer +class ParallelReadBuffer : public SeekableReadBuffer, public WithFileSize { private: /// Blocks until data occurred in the first reader or this reader indicate finishing @@ -29,19 +28,19 @@ private: bool nextImpl() override; public: - ParallelReadBuffer(SeekableReadBufferFactoryPtr reader_factory_, ThreadPoolCallbackRunner schedule_, size_t max_working_readers, size_t range_step_); + ParallelReadBuffer(SeekableReadBuffer & input, ThreadPoolCallbackRunner schedule_, size_t max_working_readers, size_t range_step_, size_t file_size); ~ParallelReadBuffer() override { finishAndWait(); } off_t seek(off_t off, int whence) override; - size_t getFileSize(); + size_t getFileSize() override; off_t getPosition() override; - const SeekableReadBufferFactory & getReadBufferFactory() const { return *reader_factory; } - SeekableReadBufferFactory & getReadBufferFactory() { return *reader_factory; } + const SeekableReadBuffer & getReadBuffer() const { return input; } + SeekableReadBuffer & getReadBuffer() { return input; } private: - /// Reader in progress with a list of read segments + /// Reader in progress with a buffer for the segment struct ReadWorker; using ReadWorkerPtr = std::shared_ptr; @@ -55,28 +54,28 @@ private: void addReaders(); bool addReaderToPool(); - /// Process read_worker, read data and save into internal segments queue + /// Process read_worker, read data and save into the buffer void readerThreadFunction(ReadWorkerPtr read_worker); void onBackgroundException(); void finishAndWait(); - Memory<> current_segment; - size_t max_working_readers; - std::atomic_size_t active_working_reader{0}; + std::atomic_size_t active_working_readers{0}; ThreadPoolCallbackRunner schedule; - std::unique_ptr reader_factory; + SeekableReadBuffer & input; + size_t file_size; size_t range_step; size_t next_range_start{0}; /** * FIFO queue of readers. - * Each worker contains reader itself and downloaded segments. - * When reader read all available data it will be removed from - * deque and data from next reader will be consumed to user. + * Each worker contains a buffer for the downloaded segment. + * After all data for the segment is read and delivered to the user, the reader will be removed + * from deque and data from next reader will be delivered. + * After removing from deque, call addReaders(). */ std::deque read_workers; @@ -92,4 +91,10 @@ private: bool all_completed{false}; }; +/// If `buf` is a SeekableReadBuffer with supportsReadAt() == true, creates a ParallelReadBuffer +/// from it. Otherwise returns nullptr; +std::unique_ptr wrapInParallelReadBufferIfSupported( + ReadBuffer & buf, ThreadPoolCallbackRunner schedule, size_t max_working_readers, + size_t range_step, size_t file_size); + } diff --git a/src/IO/ReadBufferFromFileDescriptor.cpp b/src/IO/ReadBufferFromFileDescriptor.cpp index bf44d9d10da..67bc01279c3 100644 --- a/src/IO/ReadBufferFromFileDescriptor.cpp +++ b/src/IO/ReadBufferFromFileDescriptor.cpp @@ -50,30 +50,30 @@ std::string ReadBufferFromFileDescriptor::getFileName() const } -bool ReadBufferFromFileDescriptor::nextImpl() +size_t ReadBufferFromFileDescriptor::readImpl(char * to, size_t min_bytes, size_t max_bytes, size_t offset) { - /// If internal_buffer size is empty, then read() cannot be distinguished from EOF - assert(!internal_buffer.empty()); + chassert(min_bytes <= max_bytes); - /// This is a workaround of a read pass EOF bug in linux kernel with pread() - if (file_size.has_value() && file_offset_of_buffer_end >= *file_size) - return false; + /// This is a workaround of a read past EOF bug in linux kernel with pread() + if (file_size.has_value() && offset >= *file_size) + return 0; size_t bytes_read = 0; - while (!bytes_read) + while (bytes_read < min_bytes) { ProfileEvents::increment(ProfileEvents::ReadBufferFromFileDescriptorRead); Stopwatch watch(profile_callback ? clock_type : CLOCK_MONOTONIC); ssize_t res = 0; + size_t to_read = max_bytes - bytes_read; { CurrentMetrics::Increment metric_increment{CurrentMetrics::Read}; if (use_pread) - res = ::pread(fd, internal_buffer.begin(), internal_buffer.size(), file_offset_of_buffer_end); + res = ::pread(fd, to + bytes_read, to_read, offset + bytes_read); else - res = ::read(fd, internal_buffer.begin(), internal_buffer.size()); + res = ::read(fd, to + bytes_read, to_read); } if (!res) break; @@ -102,18 +102,31 @@ bool ReadBufferFromFileDescriptor::nextImpl() if (profile_callback) { ProfileInfo info; - info.bytes_requested = internal_buffer.size(); + info.bytes_requested = to_read; info.bytes_read = res; info.nanoseconds = watch.elapsed(); profile_callback(info); } } + if (bytes_read) + ProfileEvents::increment(ProfileEvents::ReadBufferFromFileDescriptorReadBytes, bytes_read); + + return bytes_read; +} + + +bool ReadBufferFromFileDescriptor::nextImpl() +{ + /// If internal_buffer size is empty, then read() cannot be distinguished from EOF + assert(!internal_buffer.empty()); + + size_t bytes_read = readImpl(internal_buffer.begin(), 1, internal_buffer.size(), file_offset_of_buffer_end); + file_offset_of_buffer_end += bytes_read; if (bytes_read) { - ProfileEvents::increment(ProfileEvents::ReadBufferFromFileDescriptorReadBytes, bytes_read); working_buffer = internal_buffer; working_buffer.resize(bytes_read); } @@ -259,4 +272,17 @@ size_t ReadBufferFromFileDescriptor::getFileSize() return getSizeFromFileDescriptor(fd, getFileName()); } +bool ReadBufferFromFileDescriptor::checkIfActuallySeekable() +{ + struct stat stat; + auto res = ::fstat(fd, &stat); + return res == 0 && S_ISREG(stat.st_mode); +} + +size_t ReadBufferFromFileDescriptor::readBigAt(char * to, size_t n, size_t offset, const std::function &) +{ + chassert(use_pread); + return readImpl(to, n, n, offset); +} + } diff --git a/src/IO/ReadBufferFromFileDescriptor.h b/src/IO/ReadBufferFromFileDescriptor.h index 10f140275bb..64340770cf2 100644 --- a/src/IO/ReadBufferFromFileDescriptor.h +++ b/src/IO/ReadBufferFromFileDescriptor.h @@ -30,6 +30,12 @@ protected: /// Name or some description of file. std::string getFileName() const override; + /// Does the read()/pread(), with all the metric increments, error handling, throttling, etc. + /// Doesn't seek (`offset` must match fd's position if !use_pread). + /// Stops after min_bytes or eof. Returns 0 if eof. + /// Thread safe. + size_t readImpl(char * to, size_t min_bytes, size_t max_bytes, size_t offset); + public: explicit ReadBufferFromFileDescriptor( int fd_, @@ -65,6 +71,11 @@ public: size_t getFileSize() override; + bool checkIfActuallySeekable() override; + + size_t readBigAt(char * to, size_t n, size_t offset, const std::function &) override; + bool supportsReadAt() override { return use_pread; } + private: /// Assuming file descriptor supports 'select', check that we have data to read or wait until timeout. bool poll(size_t timeout_microseconds) const; diff --git a/src/IO/ReadBufferFromS3.cpp b/src/IO/ReadBufferFromS3.cpp index 93e2c46b080..d1cb1ec9ab0 100644 --- a/src/IO/ReadBufferFromS3.cpp +++ b/src/IO/ReadBufferFromS3.cpp @@ -109,9 +109,12 @@ bool ReadBufferFromS3::nextImpl() } size_t sleep_time_with_backoff_milliseconds = 100; - for (size_t attempt = 0; attempt < request_settings.max_single_read_retries && !next_result; ++attempt) + for (size_t attempt = 0; !next_result; ++attempt) { - Stopwatch watch; + bool last_attempt = attempt + 1 >= request_settings.max_single_read_retries; + + ProfileEventTimeIncrement watch(ProfileEvents::ReadBufferFromS3Microseconds); + try { if (!impl) @@ -133,44 +136,11 @@ bool ReadBufferFromS3::nextImpl() /// Try to read a next portion of data. next_result = impl->next(); - watch.stop(); - ProfileEvents::increment(ProfileEvents::ReadBufferFromS3Microseconds, watch.elapsedMicroseconds()); break; } catch (Exception & e) { - watch.stop(); - ProfileEvents::increment(ProfileEvents::ReadBufferFromS3Microseconds, watch.elapsedMicroseconds()); - ProfileEvents::increment(ProfileEvents::ReadBufferFromS3RequestsErrors, 1); - - if (auto * s3_exception = dynamic_cast(&e)) - { - /// It doesn't make sense to retry Access Denied or No Such Key - if (!s3_exception->isRetryableError()) - { - s3_exception->addMessage("while reading key: {}, from bucket: {}", key, bucket); - throw; - } - } - - /// It doesn't make sense to retry allocator errors - if (e.code() == ErrorCodes::CANNOT_ALLOCATE_MEMORY) - { - tryLogCurrentException(log); - throw; - } - - LOG_DEBUG( - log, - "Caught exception while reading S3 object. Bucket: {}, Key: {}, Version: {}, Offset: {}, Attempt: {}, Message: {}", - bucket, - key, - version_id.empty() ? "Latest" : version_id, - getPosition(), - attempt, - e.message()); - - if (attempt + 1 == request_settings.max_single_read_retries) + if (!processException(e, getPosition(), attempt) || last_attempt) throw; /// Pause before next attempt. @@ -197,6 +167,74 @@ bool ReadBufferFromS3::nextImpl() } +size_t ReadBufferFromS3::readBigAt(char * to, size_t n, size_t range_begin, const std::function & progress_callback) +{ + if (n == 0) + return 0; + + size_t sleep_time_with_backoff_milliseconds = 100; + for (size_t attempt = 0;; ++attempt) + { + bool last_attempt = attempt + 1 >= request_settings.max_single_read_retries; + + ProfileEventTimeIncrement watch(ProfileEvents::ReadBufferFromS3Microseconds); + + try + { + auto result = sendRequest(range_begin, range_begin + n - 1); + std::istream & istr = result.GetBody(); + + size_t bytes = copyFromIStreamWithProgressCallback(istr, to, n, progress_callback); + + ProfileEvents::increment(ProfileEvents::ReadBufferFromS3Bytes, bytes); + + if (read_settings.remote_throttler) + read_settings.remote_throttler->add(bytes, ProfileEvents::RemoteReadThrottlerBytes, ProfileEvents::RemoteReadThrottlerSleepMicroseconds); + + return bytes; + } + catch (Poco::Exception & e) + { + if (!processException(e, range_begin, attempt) || last_attempt) + throw; + + sleepForMilliseconds(sleep_time_with_backoff_milliseconds); + sleep_time_with_backoff_milliseconds *= 2; + } + } +} + +bool ReadBufferFromS3::processException(Poco::Exception & e, size_t read_offset, size_t attempt) const +{ + ProfileEvents::increment(ProfileEvents::ReadBufferFromS3RequestsErrors, 1); + + LOG_DEBUG( + log, + "Caught exception while reading S3 object. Bucket: {}, Key: {}, Version: {}, Offset: {}, " + "Attempt: {}, Message: {}", + bucket, key, version_id.empty() ? "Latest" : version_id, read_offset, attempt, e.message()); + + if (auto * s3_exception = dynamic_cast(&e)) + { + /// It doesn't make sense to retry Access Denied or No Such Key + if (!s3_exception->isRetryableError()) + { + s3_exception->addMessage("while reading key: {}, from bucket: {}", key, bucket); + return false; + } + } + + /// It doesn't make sense to retry allocator errors + if (e.code() == ErrorCodes::CANNOT_ALLOCATE_MEMORY) + { + tryLogCurrentException(log); + return false; + } + + return true; +} + + off_t ReadBufferFromS3::seek(off_t offset_, int whence) { if (offset_ == getPosition() && whence == SEEK_SET) @@ -315,44 +353,40 @@ bool ReadBufferFromS3::atEndOfRequestedRangeGuess() std::unique_ptr ReadBufferFromS3::initialize() { - S3::GetObjectRequest req; - req.SetBucket(bucket); - req.SetKey(key); - if (!version_id.empty()) - { - req.SetVersionId(version_id); - } - /** * If remote_filesystem_read_method = 'threadpool', then for MergeTree family tables * exact byte ranges to read are always passed here. */ - if (read_until_position) - { - if (offset >= read_until_position) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Attempt to read beyond right offset ({} > {})", offset, read_until_position - 1); + if (read_until_position && offset >= read_until_position) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Attempt to read beyond right offset ({} > {})", offset, read_until_position - 1); - req.SetRange(fmt::format("bytes={}-{}", offset, read_until_position - 1)); - LOG_TEST( - log, - "Read S3 object. Bucket: {}, Key: {}, Version: {}, Range: {}-{}", - bucket, - key, - version_id.empty() ? "Latest" : version_id, - offset, - read_until_position - 1); - } - else + read_result = sendRequest(offset, read_until_position ? std::make_optional(read_until_position - 1) : std::nullopt); + + size_t buffer_size = use_external_buffer ? 0 : read_settings.remote_fs_buffer_size; + return std::make_unique(read_result.GetBody(), buffer_size); +} + +Aws::S3::Model::GetObjectResult ReadBufferFromS3::sendRequest(size_t range_begin, std::optional range_end_incl) const +{ + S3::GetObjectRequest req; + req.SetBucket(bucket); + req.SetKey(key); + if (!version_id.empty()) + req.SetVersionId(version_id); + + if (range_end_incl) { - if (offset) - req.SetRange(fmt::format("bytes={}-", offset)); + req.SetRange(fmt::format("bytes={}-{}", range_begin, *range_end_incl)); LOG_TEST( - log, - "Read S3 object. Bucket: {}, Key: {}, Version: {}, Offset: {}", - bucket, - key, - version_id.empty() ? "Latest" : version_id, - offset); + log, "Read S3 object. Bucket: {}, Key: {}, Version: {}, Range: {}-{}", + bucket, key, version_id.empty() ? "Latest" : version_id, range_begin, *range_end_incl); + } + else if (range_begin) + { + req.SetRange(fmt::format("bytes={}-", range_begin)); + LOG_TEST( + log, "Read S3 object. Bucket: {}, Key: {}, Version: {}, Offset: {}", + bucket, key, version_id.empty() ? "Latest" : version_id, range_begin); } ProfileEvents::increment(ProfileEvents::S3GetObject); @@ -371,9 +405,7 @@ std::unique_ptr ReadBufferFromS3::initialize() { ResourceCost bytes_read = outcome.GetResult().GetContentLength(); read_settings.resource_link.adjust(estimated_cost, bytes_read); - size_t buffer_size = use_external_buffer ? 0 : read_settings.remote_fs_buffer_size; - read_result = outcome.GetResultWithOwnership(); - return std::make_unique(read_result.GetBody(), buffer_size); + return outcome.GetResultWithOwnership(); } else { @@ -383,21 +415,6 @@ std::unique_ptr ReadBufferFromS3::initialize() } } -std::unique_ptr ReadBufferS3Factory::getReader() -{ - return std::make_unique( - client_ptr, - bucket, - key, - version_id, - request_settings, - read_settings.adjustBufferSize(object_size)); -} - -size_t ReadBufferS3Factory::getFileSize() -{ - return object_size; -} } #endif diff --git a/src/IO/ReadBufferFromS3.h b/src/IO/ReadBufferFromS3.h index 52dd74bdd14..0f665861a1e 100644 --- a/src/IO/ReadBufferFromS3.h +++ b/src/IO/ReadBufferFromS3.h @@ -77,12 +77,22 @@ public: String getFileName() const override { return bucket + "/" + key; } + size_t readBigAt(char * to, size_t n, size_t range_begin, const std::function & progress_callback) override; + + bool supportsReadAt() override { return true; } + private: std::unique_ptr initialize(); - // If true, if we destroy impl now, no work was wasted. Just for metrics. + /// If true, if we destroy impl now, no work was wasted. Just for metrics. bool atEndOfRequestedRangeGuess(); + /// Call inside catch() block if GetObject fails. Bumps metrics, logs the error. + /// Returns true if the error looks retriable. + bool processException(Poco::Exception & e, size_t read_offset, size_t attempt) const; + + Aws::S3::Model::GetObjectResult sendRequest(size_t range_begin, std::optional range_end_incl) const; + ReadSettings read_settings; bool use_external_buffer; @@ -92,43 +102,6 @@ private: bool restricted_seek; }; -/// Creates separate ReadBufferFromS3 for sequence of ranges of particular object -class ReadBufferS3Factory : public SeekableReadBufferFactory, public WithFileName -{ -public: - explicit ReadBufferS3Factory( - std::shared_ptr client_ptr_, - const String & bucket_, - const String & key_, - const String & version_id_, - size_t object_size_, - const S3Settings::RequestSettings & request_settings_, - const ReadSettings & read_settings_) - : client_ptr(client_ptr_) - , bucket(bucket_) - , key(key_) - , version_id(version_id_) - , read_settings(read_settings_) - , object_size(object_size_) - , request_settings(request_settings_) - {} - - std::unique_ptr getReader() override; - - size_t getFileSize() override; - - String getFileName() const override { return bucket + "/" + key; } - -private: - std::shared_ptr client_ptr; - const String bucket; - const String key; - const String version_id; - ReadSettings read_settings; - size_t object_size; - const S3Settings::RequestSettings request_settings; -}; - } #endif diff --git a/src/IO/ReadWriteBufferFromHTTP.cpp b/src/IO/ReadWriteBufferFromHTTP.cpp index 8c3ab704d2b..cf1159bfb4b 100644 --- a/src/IO/ReadWriteBufferFromHTTP.cpp +++ b/src/IO/ReadWriteBufferFromHTTP.cpp @@ -40,6 +40,12 @@ void UpdatableSession::updateSession(const Poco::URI & uri) throw Exception(ErrorCodes::TOO_MANY_REDIRECTS, "Too many redirects while trying to access {}", initial_uri.toString()); } +template +typename UpdatableSession::SessionPtr UpdatableSession::createDetachedSession(const Poco::URI & uri) +{ + return session_factory->buildNewSession(uri); +} + template std::shared_ptr> UpdatableSession::clone(const Poco::URI & uri) { @@ -89,21 +95,11 @@ bool ReadWriteBufferFromHTTPBase::withPartialContent(const } template -size_t ReadWriteBufferFromHTTPBase::getRangeBegin() const { return read_range.begin.value_or(0); } +size_t ReadWriteBufferFromHTTPBase::getOffset() const { return read_range.begin.value_or(0) + offset_from_begin_pos; } template -size_t ReadWriteBufferFromHTTPBase::getOffset() const { return getRangeBegin() + offset_from_begin_pos; } - -template -std::istream * ReadWriteBufferFromHTTPBase::callImpl( - UpdatableSessionPtr & current_session, Poco::URI uri_, Poco::Net::HTTPResponse & response, - const std::string & method_, bool for_object_info) +void ReadWriteBufferFromHTTPBase::prepareRequest(Poco::Net::HTTPRequest & request, Poco::URI uri_, std::optional range) const { - // With empty path poco will send "POST HTTP/1.1" its bug. - if (uri_.getPath().empty()) - uri_.setPath("/"); - - Poco::Net::HTTPRequest request(method_, uri_.getPathAndQuery(), Poco::Net::HTTPRequest::HTTP_1_1); request.setHost(uri_.getHost()); // use original, not resolved host name in header if (out_stream_callback) @@ -111,16 +107,9 @@ std::istream * ReadWriteBufferFromHTTPBase::callImpl( else if (method == Poco::Net::HTTPRequest::HTTP_POST) request.setContentLength(0); /// No callback - no body - for (auto & [header, value] : http_header_entries) + for (const auto & [header, value] : http_header_entries) request.set(header, value); - std::optional range; - if (!for_object_info) - { - if (withPartialContent(read_range)) - range = HTTPRange{getOffset(), read_range.end}; - } - if (range) { String range_header_value; @@ -134,6 +123,25 @@ std::istream * ReadWriteBufferFromHTTPBase::callImpl( if (!credentials.getUsername().empty()) credentials.authenticate(request); +} + +template +std::istream * ReadWriteBufferFromHTTPBase::callImpl( + UpdatableSessionPtr & current_session, Poco::URI uri_, Poco::Net::HTTPResponse & response, const std::string & method_, bool for_object_info) +{ + // With empty path poco will send "POST HTTP/1.1" its bug. + if (uri_.getPath().empty()) + uri_.setPath("/"); + + std::optional range; + if (!for_object_info) + { + if (withPartialContent(read_range)) + range = HTTPRange{getOffset(), read_range.end}; + } + + Poco::Net::HTTPRequest request(method_, uri_.getPathAndQuery(), Poco::Net::HTTPRequest::HTTP_1_1); + prepareRequest(request, uri_, range); LOG_TRACE(log, "Sending request to {}", uri_.toString()); @@ -176,6 +184,14 @@ size_t ReadWriteBufferFromHTTPBase::getFileSize() throw Exception(ErrorCodes::UNKNOWN_FILE_SIZE, "Cannot find out file size for: {}", uri.toString()); } +template +bool ReadWriteBufferFromHTTPBase::supportsReadAt() +{ + if (!file_info) + file_info = getFileInfo(); + return method == Poco::Net::HTTPRequest::HTTP_GET && file_info->seekable; +} + template bool ReadWriteBufferFromHTTPBase::checkIfActuallySeekable() { @@ -405,7 +421,7 @@ void ReadWriteBufferFromHTTPBase::initialize() { /// We could have range.begin == 0 and range.end != 0 in case of DiskWeb and failing to read with partial content /// will affect only performance, so a warning is enough. - LOG_WARNING(log, "Unable to read with range header: [{}, {}]", getRangeBegin(), *read_range.end); + LOG_WARNING(log, "Unable to read with range header: [{}, {}]", read_range.begin.value_or(0), *read_range.end); } } @@ -538,8 +554,8 @@ bool ReadWriteBufferFromHTTPBase::nextImpl() throw; /** Retry request unconditionally if nothing has been read yet. - * Otherwise if it is GET method retry with range header. - */ + * Otherwise if it is GET method retry with range header. + */ bool can_retry_request = !offset_from_begin_pos || method == Poco::Net::HTTPRequest::HTTP_GET; if (!can_retry_request) throw; @@ -574,6 +590,83 @@ bool ReadWriteBufferFromHTTPBase::nextImpl() return true; } +template +size_t ReadWriteBufferFromHTTPBase::readBigAt(char * to, size_t n, size_t offset, const std::function & progress_callback) +{ + /// Caller must have checked supportsReadAt(). + /// This ensures we've sent at least one HTTP request and populated saved_uri_redirect. + chassert(file_info && file_info->seekable); + + if (n == 0) + return 0; + + Poco::URI uri_ = saved_uri_redirect.value_or(uri); + if (uri_.getPath().empty()) + uri_.setPath("/"); + + size_t milliseconds_to_wait = settings.http_retry_initial_backoff_ms; + + for (size_t attempt = 0;; ++attempt) + { + bool last_attempt = attempt + 1 >= settings.http_max_tries; + + Poco::Net::HTTPRequest request(method, uri_.getPathAndQuery(), Poco::Net::HTTPRequest::HTTP_1_1); + prepareRequest(request, uri_, HTTPRange { .begin = offset, .end = offset + n - 1}); + + LOG_TRACE(log, "Sending request to {} for range [{}, {})", uri_.toString(), offset, offset + n); + + auto sess = session->createDetachedSession(uri_); + + Poco::Net::HTTPResponse response; + std::istream * result_istr; + + try + { + sess->sendRequest(request); + result_istr = receiveResponse(*sess, request, response, /*allow_redirects*/ false); + + if (response.getStatus() != Poco::Net::HTTPResponse::HTTPStatus::HTTP_PARTIAL_CONTENT && + (offset != 0 || offset + n < *file_info->file_size)) + throw Exception( + ErrorCodes::HTTP_RANGE_NOT_SATISFIABLE, + "Expected 206 Partial Content, got {} when reading {} range [{}, {})", + toString(response.getStatus()), uri_.toString(), offset, offset + n); + + bool cancelled; + size_t r = copyFromIStreamWithProgressCallback(*result_istr, to, n, progress_callback, &cancelled); + + return r; + } + catch (const Poco::Exception & e) + { + sess->attachSessionData(e.message()); + + LOG_ERROR( + log, + "HTTP request (positioned) to `{}` with range [{}, {}) failed at try {}/{}: {}", + uri_.toString(), offset, offset + n, attempt + 1, settings.http_max_tries, + e.what()); + + /// Decide whether to retry. + + if (last_attempt) + throw; + + /// Too many open files - non-retryable. + if (e.code() == POCO_EMFILE) + throw; + + if (const auto * h = dynamic_cast(&e); + h && !isRetriableError(static_cast(h->getHTTPStatus()))) + throw; + + sleepForMilliseconds(milliseconds_to_wait); + milliseconds_to_wait = std::min(milliseconds_to_wait * 2, settings.http_retry_max_backoff_ms); + continue; + } + } +} + template off_t ReadWriteBufferFromHTTPBase::getPosition() { return getOffset() - available(); } @@ -793,75 +886,6 @@ ReadWriteBufferFromHTTP::ReadWriteBufferFromHTTP( skip_not_found_url_, file_info_) {} -RangedReadWriteBufferFromHTTPFactory::RangedReadWriteBufferFromHTTPFactory( - Poco::URI uri_, - std::string method_, - OutStreamCallback out_stream_callback_, - ConnectionTimeouts timeouts_, - const Poco::Net::HTTPBasicCredentials & credentials_, - UInt64 max_redirects_, - size_t buffer_size_, - ReadSettings settings_, - HTTPHeaderEntries http_header_entries_, - const RemoteHostFilter * remote_host_filter_, - bool delay_initialization_, - bool use_external_buffer_, - bool skip_not_found_url_) - : uri(uri_) - , method(std::move(method_)) - , out_stream_callback(out_stream_callback_) - , timeouts(std::move(timeouts_)) - , credentials(credentials_) - , max_redirects(max_redirects_) - , buffer_size(buffer_size_) - , settings(std::move(settings_)) - , http_header_entries(std::move(http_header_entries_)) - , remote_host_filter(remote_host_filter_) - , delay_initialization(delay_initialization_) - , use_external_buffer(use_external_buffer_) - , skip_not_found_url(skip_not_found_url_) {} - -std::unique_ptr RangedReadWriteBufferFromHTTPFactory::getReader() -{ - return std::make_unique( - uri, - method, - out_stream_callback, - timeouts, - credentials, - max_redirects, - buffer_size, - settings, - http_header_entries, - remote_host_filter, - delay_initialization, - use_external_buffer, - skip_not_found_url, - file_info); -} - -size_t RangedReadWriteBufferFromHTTPFactory::getFileSize() -{ - auto s = getFileInfo().file_size; - if (!s) - throw Exception(ErrorCodes::UNKNOWN_FILE_SIZE, "Cannot find out file size for: {}", uri.toString()); - return *s; -} - -bool RangedReadWriteBufferFromHTTPFactory::checkIfActuallySeekable() -{ - return getFileInfo().seekable; -} - -HTTPFileInfo RangedReadWriteBufferFromHTTPFactory::getFileInfo() -{ - if (!file_info) - file_info = static_cast(getReader().get())->getFileInfo(); - return *file_info; -} - -String RangedReadWriteBufferFromHTTPFactory::getFileName() const { return uri.toString(); } - PooledSessionFactory::PooledSessionFactory( const ConnectionTimeouts & timeouts_, size_t per_endpoint_pool_size_) @@ -891,6 +915,7 @@ PooledReadWriteBufferFromHTTP::PooledReadWriteBufferFromHTTP( out_stream_callback_, buffer_size_) {} + template class UpdatableSession; template class UpdatableSession; template class detail::ReadWriteBufferFromHTTPBase>>; diff --git a/src/IO/ReadWriteBufferFromHTTP.h b/src/IO/ReadWriteBufferFromHTTP.h index de1946ced48..2d2ae5fe724 100644 --- a/src/IO/ReadWriteBufferFromHTTP.h +++ b/src/IO/ReadWriteBufferFromHTTP.h @@ -42,6 +42,9 @@ public: void updateSession(const Poco::URI & uri); + /// Thread safe. + SessionPtr createDetachedSession(const Poco::URI & uri); + std::shared_ptr> clone(const Poco::URI & uri); private: @@ -110,14 +113,16 @@ namespace detail bool withPartialContent(const HTTPRange & range) const; - size_t getRangeBegin() const; - size_t getOffset() const; + void prepareRequest(Poco::Net::HTTPRequest & request, Poco::URI uri_, std::optional range) const; + std::istream * callImpl(UpdatableSessionPtr & current_session, Poco::URI uri_, Poco::Net::HTTPResponse & response, const std::string & method_, bool for_object_info = false); size_t getFileSize() override; + bool supportsReadAt() override; + bool checkIfActuallySeekable() override; String getFileName() const override; @@ -171,6 +176,8 @@ namespace detail bool nextImpl() override; + size_t readBigAt(char * to, size_t n, size_t offset, const std::function & progress_callback) override; + off_t getPosition() override; off_t seek(off_t offset_, int whence) override; @@ -237,53 +244,6 @@ public: std::optional file_info_ = std::nullopt); }; -class RangedReadWriteBufferFromHTTPFactory : public SeekableReadBufferFactory, public WithFileName -{ - using OutStreamCallback = ReadWriteBufferFromHTTP::OutStreamCallback; - -public: - RangedReadWriteBufferFromHTTPFactory( - Poco::URI uri_, - std::string method_, - OutStreamCallback out_stream_callback_, - ConnectionTimeouts timeouts_, - const Poco::Net::HTTPBasicCredentials & credentials_, - UInt64 max_redirects_ = 0, - size_t buffer_size_ = DBMS_DEFAULT_BUFFER_SIZE, - ReadSettings settings_ = {}, - HTTPHeaderEntries http_header_entries_ = {}, - const RemoteHostFilter * remote_host_filter_ = nullptr, - bool delay_initialization_ = true, - bool use_external_buffer_ = false, - bool skip_not_found_url_ = false); - - std::unique_ptr getReader() override; - - size_t getFileSize() override; - - bool checkIfActuallySeekable() override; - - HTTPFileInfo getFileInfo(); - - String getFileName() const override; - -private: - Poco::URI uri; - std::string method; - OutStreamCallback out_stream_callback; - ConnectionTimeouts timeouts; - const Poco::Net::HTTPBasicCredentials & credentials; - UInt64 max_redirects; - size_t buffer_size; - ReadSettings settings; - HTTPHeaderEntries http_header_entries; - const RemoteHostFilter * remote_host_filter; - std::optional file_info; - bool delay_initialization; - bool use_external_buffer; - bool skip_not_found_url; -}; - class PooledSessionFactory { public: @@ -292,7 +252,9 @@ public: using SessionType = PooledHTTPSessionPtr; + /// Thread safe. SessionType buildNewSession(const Poco::URI & uri); + private: ConnectionTimeouts timeouts; size_t per_endpoint_pool_size; @@ -315,6 +277,7 @@ public: size_t max_connections_per_endpoint = DEFAULT_COUNT_OF_HTTP_CONNECTIONS_PER_ENDPOINT); }; + extern template class UpdatableSession; extern template class UpdatableSession; extern template class detail::ReadWriteBufferFromHTTPBase>>; diff --git a/src/IO/SeekableReadBuffer.cpp b/src/IO/SeekableReadBuffer.cpp index 99c43d6671b..b83e382db01 100644 --- a/src/IO/SeekableReadBuffer.cpp +++ b/src/IO/SeekableReadBuffer.cpp @@ -3,6 +3,10 @@ namespace DB { +namespace ErrorCodes +{ + extern const int CANNOT_READ_FROM_ISTREAM; +} namespace { @@ -60,4 +64,46 @@ std::unique_ptr wrapSeekableReadBufferPointer(SeekableReadBu return std::make_unique>(*ptr, SeekableReadBufferPtr{ptr}); } +size_t copyFromIStreamWithProgressCallback(std::istream & istr, char * to, size_t n, const std::function & progress_callback, bool * out_cancelled) +{ + const size_t chunk = DBMS_DEFAULT_BUFFER_SIZE; + if (out_cancelled) + *out_cancelled = false; + + size_t copied = 0; + while (copied < n) + { + size_t to_copy = std::min(chunk, n - copied); + istr.read(to + copied, to_copy); + size_t gcount = istr.gcount(); + + copied += gcount; + + bool cancelled = false; + if (gcount && progress_callback) + cancelled = progress_callback(copied); + + if (gcount != to_copy) + { + if (!istr.eof()) + throw Exception( + ErrorCodes::CANNOT_READ_FROM_ISTREAM, + "{} at offset {}", + istr.fail() ? "Cannot read from istream" : "Unexpected state of istream", + copied); + + break; + } + + if (cancelled) + { + if (out_cancelled != nullptr) + *out_cancelled = true; + break; + } + } + + return copied; +} + } diff --git a/src/IO/SeekableReadBuffer.h b/src/IO/SeekableReadBuffer.h index 736ab5bbc71..8ced9d752de 100644 --- a/src/IO/SeekableReadBuffer.h +++ b/src/IO/SeekableReadBuffer.h @@ -59,39 +59,41 @@ public: /// * Sometimes when we create such buffer we don't know in advance whether we'll need it to be /// seekable or not. So we don't want to pay the price for this check in advance. virtual bool checkIfActuallySeekable() { return true; } + + /// Unbuffered positional read. + /// Doesn't affect the buffer state (position, working_buffer, etc). + /// + /// `progress_callback` may be called periodically during the read, reporting that to[0..m-1] + /// has been filled. If it returns true, reading is stopped, and readBigAt() returns bytes read + /// so far. Called only from inside readBigAt(), from the same thread, with increasing m. + /// + /// Stops either after n bytes, or at end of file, or on exception. Returns number of bytes read. + /// If offset is past the end of file, may return 0 or throw exception. + /// + /// Caller needs to be careful: + /// * supportsReadAt() must be checked (called and return true) before calling readBigAt(). + /// Otherwise readBigAt() may crash. + /// * Thread safety: multiple readBigAt() calls may be performed in parallel. + /// But readBigAt() may not be called in parallel with any other methods + /// (e.g. next() or supportsReadAt()). + /// * Performance: there's no buffering. Each readBigAt() call typically translates into actual + /// IO operation (e.g. HTTP request). Don't use it for small adjacent reads. + virtual size_t readBigAt(char * /*to*/, size_t /*n*/, size_t /*offset*/, const std::function & /*progress_callback*/ = nullptr) + { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method readBigAt() not implemented"); } + + /// Checks if readBigAt() is allowed. May be slow, may throw (e.g. it may do an HTTP request or an fstat). + virtual bool supportsReadAt() { return false; } }; -/// Useful for reading in parallel. -/// The created read buffers may outlive the factory. -/// -/// There are 2 ways to use this: -/// (1) Never call seek() or getFileSize(), read the file sequentially. -/// For HTTP, this usually translates to just one HTTP request. -/// (2) Call checkIfActuallySeekable(), then: -/// a. If it returned false, go to (1). seek() and getFileSize() are not available (throw if called). -/// b. If it returned true, seek() and getFileSize() are available, knock yourself out. -/// For HTTP, checkIfActuallySeekable() sends a HEAD request and returns false if the web server -/// doesn't support ranges (or doesn't support HEAD requests). -class SeekableReadBufferFactory : public WithFileSize -{ -public: - ~SeekableReadBufferFactory() override = default; - - // We usually call setReadUntilPosition() and seek() on the returned buffer before reading. - // So it's recommended that the returned implementation be lazy, i.e. don't start reading - // before the first call to nextImpl(). - virtual std::unique_ptr getReader() = 0; - - virtual bool checkIfActuallySeekable() { return true; } -}; using SeekableReadBufferPtr = std::shared_ptr; -using SeekableReadBufferFactoryPtr = std::unique_ptr; - /// Wraps a reference to a SeekableReadBuffer into an unique pointer to SeekableReadBuffer. /// This function is like wrapReadBufferReference() but for SeekableReadBuffer. std::unique_ptr wrapSeekableReadBufferReference(SeekableReadBuffer & ref); std::unique_ptr wrapSeekableReadBufferPointer(SeekableReadBufferPtr ptr); +/// Helper for implementing readBigAt(). +size_t copyFromIStreamWithProgressCallback(std::istream & istr, char * to, size_t n, const std::function & progress_callback, bool * out_cancelled = nullptr); + } diff --git a/src/IO/WithFileName.cpp b/src/IO/WithFileName.cpp index 9d9f264c861..2383182f7e7 100644 --- a/src/IO/WithFileName.cpp +++ b/src/IO/WithFileName.cpp @@ -19,7 +19,7 @@ String getFileNameFromReadBuffer(const ReadBuffer & in) if (const auto * compressed = dynamic_cast(&in)) return getFileName(compressed->getWrappedReadBuffer()); else if (const auto * parallel = dynamic_cast(&in)) - return getFileName(parallel->getReadBufferFactory()); + return getFileName(parallel->getReadBuffer()); else if (const auto * peekable = dynamic_cast(&in)) return getFileNameFromReadBuffer(peekable->getSubBuffer()); else diff --git a/src/IO/WithFileSize.cpp b/src/IO/WithFileSize.cpp index f71690fcdee..28542db7a73 100644 --- a/src/IO/WithFileSize.cpp +++ b/src/IO/WithFileSize.cpp @@ -33,10 +33,6 @@ size_t getFileSizeFromReadBuffer(ReadBuffer & in) { return getFileSize(compressed->getWrappedReadBuffer()); } - else if (auto * parallel = dynamic_cast(&in)) - { - return getFileSize(parallel->getReadBufferFactory()); - } return getFileSize(in); } @@ -51,10 +47,6 @@ bool isBufferWithFileSize(const ReadBuffer & in) { return isBufferWithFileSize(compressed->getWrappedReadBuffer()); } - else if (const auto * parallel = dynamic_cast(&in)) - { - return dynamic_cast(¶llel->getReadBufferFactory()) != nullptr; - } return dynamic_cast(&in) != nullptr; } diff --git a/src/Processors/Formats/Impl/ArrowBufferedStreams.cpp b/src/Processors/Formats/Impl/ArrowBufferedStreams.cpp index 54f3b76ff60..37505f94e98 100644 --- a/src/Processors/Formats/Impl/ArrowBufferedStreams.cpp +++ b/src/Processors/Formats/Impl/ArrowBufferedStreams.cpp @@ -146,45 +146,19 @@ arrow::Status ArrowInputStreamFromReadBuffer::Close() return arrow::Status(); } -RandomAccessFileFromManyReadBuffers::RandomAccessFileFromManyReadBuffers(SeekableReadBufferFactory & factory) : buf_factory(factory) {} +RandomAccessFileFromRandomAccessReadBuffer::RandomAccessFileFromRandomAccessReadBuffer(SeekableReadBuffer & in_, size_t file_size_) : in(in_), file_size(file_size_) {} -arrow::Result RandomAccessFileFromManyReadBuffers::GetSize() +arrow::Result RandomAccessFileFromRandomAccessReadBuffer::GetSize() { - return buf_factory.getFileSize(); + return file_size; } -arrow::Result RandomAccessFileFromManyReadBuffers::ReadAt(int64_t position, int64_t nbytes, void* out) +arrow::Result RandomAccessFileFromRandomAccessReadBuffer::ReadAt(int64_t position, int64_t nbytes, void* out) { - std::unique_lock lock(mutex); - if (free_bufs.empty()) - free_bufs.push_back(buf_factory.getReader()); - auto buf = std::move(free_bufs.back()); - free_bufs.pop_back(); - lock.unlock(); - - // To work well with this, ReadBuffer implementations need to respect setReadUntilPosition() and - // not read above it. We often do very small reads here. - // Also nice if they: - // * Make readBig() read directly into the provided memory, instead of copying from internal - // buffer. - // * Allocate the internal buffer (if any) lazily in first nextImpl() call. If all reads are - // tiny readBig() calls (as is typical here), it won't allocate an unnecessary 1 MB buffer. - - buf->seek(position, SEEK_SET); - buf->setReadUntilPosition(position + nbytes); - size_t bytes_read = buf->readBig(reinterpret_cast(out), nbytes); - - // Seeking to a position above a previous setReadUntilPosition() confuses some of the - // ReadBuffer implementations. So we reset it before next seek. - buf->setReadUntilEnd(); - - lock.lock(); - free_bufs.push_back(std::move(buf)); - - return static_cast(bytes_read); + return in.readBigAt(reinterpret_cast(out), nbytes, position); } -arrow::Result> RandomAccessFileFromManyReadBuffers::ReadAt(int64_t position, int64_t nbytes) +arrow::Result> RandomAccessFileFromRandomAccessReadBuffer::ReadAt(int64_t position, int64_t nbytes) { ARROW_ASSIGN_OR_RAISE(auto buffer, arrow::AllocateResizableBuffer(nbytes)) ARROW_ASSIGN_OR_RAISE(int64_t bytes_read, ReadAt(position, nbytes, buffer->mutable_data())) @@ -195,22 +169,23 @@ arrow::Result> RandomAccessFileFromManyReadBuffer return buffer; } -arrow::Future> RandomAccessFileFromManyReadBuffers::ReadAsync(const arrow::io::IOContext&, int64_t position, int64_t nbytes) +arrow::Future> RandomAccessFileFromRandomAccessReadBuffer::ReadAsync(const arrow::io::IOContext&, int64_t position, int64_t nbytes) { return arrow::Future>::MakeFinished(ReadAt(position, nbytes)); } -arrow::Status RandomAccessFileFromManyReadBuffers::Close() +arrow::Status RandomAccessFileFromRandomAccessReadBuffer::Close() { chassert(is_open); is_open = false; return arrow::Status::OK(); } -arrow::Status RandomAccessFileFromManyReadBuffers::Seek(int64_t) { return arrow::Status::NotImplemented(""); } -arrow::Result RandomAccessFileFromManyReadBuffers::Tell() const { return arrow::Status::NotImplemented(""); } -arrow::Result RandomAccessFileFromManyReadBuffers::Read(int64_t, void*) { return arrow::Status::NotImplemented(""); } -arrow::Result> RandomAccessFileFromManyReadBuffers::Read(int64_t) { return arrow::Status::NotImplemented(""); } +arrow::Status RandomAccessFileFromRandomAccessReadBuffer::Seek(int64_t) { return arrow::Status::NotImplemented(""); } +arrow::Result RandomAccessFileFromRandomAccessReadBuffer::Tell() const { return arrow::Status::NotImplemented(""); } +arrow::Result RandomAccessFileFromRandomAccessReadBuffer::Read(int64_t, void*) { return arrow::Status::NotImplemented(""); } +arrow::Result> RandomAccessFileFromRandomAccessReadBuffer::Read(int64_t) { return arrow::Status::NotImplemented(""); } + std::shared_ptr asArrowFile( ReadBuffer & in, @@ -220,19 +195,16 @@ std::shared_ptr asArrowFile( const std::string & magic_bytes, bool avoid_buffering) { - if (auto * fd_in = dynamic_cast(&in)) + bool has_file_size = isBufferWithFileSize(in); + auto * seekable_in = dynamic_cast(&in); + + if (has_file_size && seekable_in && settings.seekable_read) { - struct stat stat; - auto res = ::fstat(fd_in->getFD(), &stat); - // if fd is a regular file i.e. not stdin - if (res == 0 && S_ISREG(stat.st_mode)) - return std::make_shared(*fd_in, stat.st_size, avoid_buffering); - } - else if (auto * seekable_in = dynamic_cast(&in); - seekable_in && settings.seekable_read && isBufferWithFileSize(in) && - seekable_in->checkIfActuallySeekable()) - { - return std::make_shared(in, std::nullopt, avoid_buffering); + if (avoid_buffering && seekable_in->supportsReadAt()) + return std::make_shared(*seekable_in, getFileSizeFromReadBuffer(in)); + + if (seekable_in->checkIfActuallySeekable()) + return std::make_shared(*seekable_in, std::nullopt, avoid_buffering); } // fallback to loading the entire file in memory @@ -245,26 +217,16 @@ std::shared_ptr asArrowFileLoadIntoMemory( const std::string & format_name, const std::string & magic_bytes) { - std::string file_data; - { - PeekableReadBuffer buf(in); - std::string magic_bytes_from_data; - magic_bytes_from_data.resize(magic_bytes.size()); - bool read_magic_bytes = false; - try - { - PeekableReadBufferCheckpoint checkpoint(buf, true); - buf.readStrict(magic_bytes_from_data.data(), magic_bytes_from_data.size()); - read_magic_bytes = true; - } - catch (const Exception &) {} + std::string file_data(magic_bytes.size(), '\0'); - if (!read_magic_bytes || magic_bytes_from_data != magic_bytes) - throw Exception(ErrorCodes::INCORRECT_DATA, "Not a {} file", format_name); + /// Avoid loading the whole file if it doesn't seem to even be in the correct format. + size_t bytes_read = in.read(file_data.data(), magic_bytes.size()); + if (bytes_read < magic_bytes.size() || file_data != magic_bytes) + throw Exception(ErrorCodes::INCORRECT_DATA, "Not a {} file", format_name); - WriteBufferFromString file_buffer(file_data); - copyData(buf, file_buffer, is_cancelled); - } + WriteBufferFromString file_buffer(file_data, AppendModeTag{}); + copyData(in, file_buffer, is_cancelled); + file_buffer.finalize(); return std::make_shared(arrow::Buffer::FromString(std::move(file_data))); } diff --git a/src/Processors/Formats/Impl/ArrowBufferedStreams.h b/src/Processors/Formats/Impl/ArrowBufferedStreams.h index 9307172cb11..f455bcdfb1a 100644 --- a/src/Processors/Formats/Impl/ArrowBufferedStreams.h +++ b/src/Processors/Formats/Impl/ArrowBufferedStreams.h @@ -18,7 +18,6 @@ class ReadBuffer; class WriteBuffer; class SeekableReadBuffer; -class SeekableReadBufferFactory; struct FormatSettings; class ArrowBufferedOutputStream : public arrow::io::OutputStream @@ -78,19 +77,17 @@ private: ARROW_DISALLOW_COPY_AND_ASSIGN(RandomAccessFileFromSeekableReadBuffer); }; -// Thread-safe. -// Maintains a pool of SeekableReadBuffer-s. For each ReadAt(), takes a buffer, seeks it, and reads. -class RandomAccessFileFromManyReadBuffers : public arrow::io::RandomAccessFile +class RandomAccessFileFromRandomAccessReadBuffer : public arrow::io::RandomAccessFile { public: - explicit RandomAccessFileFromManyReadBuffers(SeekableReadBufferFactory & factory); + explicit RandomAccessFileFromRandomAccessReadBuffer(SeekableReadBuffer & in_, size_t file_size_); // These are thread safe. arrow::Result GetSize() override; arrow::Result ReadAt(int64_t position, int64_t nbytes, void* out) override; arrow::Result> ReadAt(int64_t position, int64_t nbytes) override; - arrow::Future> ReadAsync(const arrow::io::IOContext&, int64_t position, - int64_t nbytes) override; + arrow::Future> ReadAsync( + const arrow::io::IOContext&, int64_t position, int64_t nbytes) override; // These are not thread safe, and arrow shouldn't call them. Return NotImplemented error. arrow::Status Seek(int64_t) override; @@ -102,13 +99,11 @@ public: bool closed() const override { return !is_open; } private: - SeekableReadBufferFactory & buf_factory; + SeekableReadBuffer & in; + size_t file_size; bool is_open = true; - std::mutex mutex; - std::vector> free_bufs; - - ARROW_DISALLOW_COPY_AND_ASSIGN(RandomAccessFileFromManyReadBuffers); + ARROW_DISALLOW_COPY_AND_ASSIGN(RandomAccessFileFromRandomAccessReadBuffer); }; class ArrowInputStreamFromReadBuffer : public arrow::io::InputStream diff --git a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp index 7f90c1197ce..2f3c68aa481 100644 --- a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp @@ -43,14 +43,12 @@ namespace ErrorCodes } while (false) ParquetBlockInputFormat::ParquetBlockInputFormat( - ReadBuffer * buf, - SeekableReadBufferFactoryPtr buf_factory_, + ReadBuffer & buf, const Block & header_, const FormatSettings & format_settings_, size_t max_decoding_threads_, size_t min_bytes_for_seek_) - : IInputFormat(header_, buf) - , buf_factory(std::move(buf_factory_)) + : IInputFormat(header_, &buf) , format_settings(format_settings_) , skip_row_groups(format_settings.parquet.skip_row_groups) , max_decoding_threads(max_decoding_threads_) @@ -71,17 +69,7 @@ void ParquetBlockInputFormat::initializeIfNeeded() // Create arrow file adapter. // TODO: Make the adapter do prefetching on IO threads, based on the full set of ranges that // we'll need to read (which we know in advance). Use max_download_threads for that. - if (buf_factory) - { - if (format_settings.seekable_read && buf_factory->checkIfActuallySeekable()) - arrow_file = std::make_shared(*buf_factory); - else - arrow_file = asArrowFileLoadIntoMemory(*buf_factory->getReader(), is_stopped, "Parquet", PARQUET_MAGIC_BYTES); - } - else - { - arrow_file = asArrowFile(*in, format_settings, is_stopped, "Parquet", PARQUET_MAGIC_BYTES, /* avoid_buffering */ true); - } + arrow_file = asArrowFile(*in, format_settings, is_stopped, "Parquet", PARQUET_MAGIC_BYTES, /* avoid_buffering */ true); if (is_stopped) return; @@ -388,7 +376,7 @@ ParquetSchemaReader::ParquetSchemaReader(ReadBuffer & in_, const FormatSettings NamesAndTypesList ParquetSchemaReader::readSchema() { std::atomic is_stopped{0}; - auto file = asArrowFile(in, format_settings, is_stopped, "Parquet", PARQUET_MAGIC_BYTES); + auto file = asArrowFile(in, format_settings, is_stopped, "Parquet", PARQUET_MAGIC_BYTES, /* avoid_buffering */ true); auto metadata = parquet::ReadMetaData(file); @@ -406,8 +394,7 @@ void registerInputFormatParquet(FormatFactory & factory) { factory.registerRandomAccessInputFormat( "Parquet", - [](ReadBuffer * buf, - SeekableReadBufferFactoryPtr buf_factory, + [](ReadBuffer & buf, const Block & sample, const FormatSettings & settings, const ReadSettings& read_settings, @@ -418,7 +405,6 @@ void registerInputFormatParquet(FormatFactory & factory) size_t min_bytes_for_seek = is_remote_fs ? read_settings.remote_read_min_bytes_for_seek : 8 * 1024; return std::make_shared( buf, - std::move(buf_factory), sample, settings, max_parsing_threads, diff --git a/src/Processors/Formats/Impl/ParquetBlockInputFormat.h b/src/Processors/Formats/Impl/ParquetBlockInputFormat.h index f17eee59414..ad7074547fc 100644 --- a/src/Processors/Formats/Impl/ParquetBlockInputFormat.h +++ b/src/Processors/Formats/Impl/ParquetBlockInputFormat.h @@ -15,7 +15,6 @@ namespace DB { class ArrowColumnToCHColumn; -class SeekableReadBufferFactory; // Parquet files contain a metadata block with the following information: // * list of columns, @@ -48,9 +47,7 @@ class ParquetBlockInputFormat : public IInputFormat { public: ParquetBlockInputFormat( - // exactly one of these two is nullptr - ReadBuffer * buf, - std::unique_ptr buf_factory, + ReadBuffer & buf, const Block & header, const FormatSettings & format_settings, size_t max_decoding_threads, @@ -234,7 +231,6 @@ private: }; }; - std::unique_ptr buf_factory; const FormatSettings format_settings; const std::unordered_set & skip_row_groups; size_t max_decoding_threads; diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index 8380fa26a39..2d8aaec0f07 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -582,31 +582,11 @@ StorageS3Source::ReaderHolder StorageS3Source::createReader() size_t object_size = info ? info->size : S3::getObjectSize(*client, bucket, current_key, version_id, request_settings); auto compression_method = chooseCompressionMethod(current_key, compression_hint); - InputFormatPtr input_format; - std::unique_ptr owned_read_buf; - - auto read_buf_or_factory = createS3ReadBuffer(current_key, object_size); - if (read_buf_or_factory.buf_factory) - { - input_format = FormatFactory::instance().getInputRandomAccess( - format, - std::move(read_buf_or_factory.buf_factory), - sample_block, - getContext(), - max_block_size, - /* is_remote_fs */ true, - compression_method, - format_settings); - } - else - { - owned_read_buf = wrapReadBufferWithCompressionMethod( - std::move(read_buf_or_factory.buf), - compression_method, - static_cast(getContext()->getSettingsRef().zstd_window_log_max)); - input_format = FormatFactory::instance().getInput( - format, *owned_read_buf, sample_block, getContext(), max_block_size, format_settings); - } + auto read_buf = createS3ReadBuffer(current_key, object_size); + auto input_format = FormatFactory::instance().getInput( + format, *read_buf, sample_block, getContext(), max_block_size, + format_settings, std::nullopt, std::nullopt, + /* is_remote_fs */ true, compression_method); QueryPipelineBuilder builder; builder.init(Pipe(input_format)); @@ -621,7 +601,7 @@ StorageS3Source::ReaderHolder StorageS3Source::createReader() auto pipeline = std::make_unique(QueryPipelineBuilder::getPipeline(std::move(builder))); auto current_reader = std::make_unique(*pipeline); - return ReaderHolder{fs::path(bucket) / current_key, std::move(owned_read_buf), std::move(pipeline), std::move(current_reader)}; + return ReaderHolder{fs::path(bucket) / current_key, std::move(read_buf), std::move(pipeline), std::move(current_reader)}; } std::future StorageS3Source::createReaderAsync() @@ -629,7 +609,7 @@ std::future StorageS3Source::createReaderAsync() return create_reader_scheduler([this] { return createReader(); }, Priority{}); } -StorageS3Source::ReadBufferOrFactory StorageS3Source::createS3ReadBuffer(const String & key, size_t object_size) +std::unique_ptr StorageS3Source::createS3ReadBuffer(const String & key, size_t object_size) { auto read_settings = getContext()->getReadSettings().adjustBufferSize(object_size); read_settings.enable_filesystem_cache = false; @@ -642,12 +622,13 @@ StorageS3Source::ReadBufferOrFactory StorageS3Source::createS3ReadBuffer(const S if (object_too_small && read_settings.remote_fs_method == RemoteFSReadMethod::threadpool) { LOG_TRACE(log, "Downloading object of size {} from S3 with initial prefetch", object_size); - return {.buf = createAsyncS3ReadBuffer(key, read_settings, object_size)}; + return createAsyncS3ReadBuffer(key, read_settings, object_size); } - auto factory = std::make_unique( - client, bucket, key, version_id, object_size, request_settings, read_settings); - return {.buf_factory = std::move(factory)}; + return std::make_unique( + client, bucket, key, version_id, request_settings, read_settings, + /*use_external_buffer*/ false, /*offset_*/ 0, /*read_until_position_*/ 0, + /*restricted_seek_*/ false, object_size); } std::unique_ptr StorageS3Source::createAsyncS3ReadBuffer( diff --git a/src/Storages/StorageS3.h b/src/Storages/StorageS3.h index 66652a45e3a..a4c120b99a6 100644 --- a/src/Storages/StorageS3.h +++ b/src/Storages/StorageS3.h @@ -204,12 +204,6 @@ private: std::unique_ptr reader; }; - struct ReadBufferOrFactory - { - std::unique_ptr buf; - SeekableReadBufferFactoryPtr buf_factory; - }; - ReaderHolder reader; std::vector requested_virtual_columns; @@ -230,7 +224,7 @@ private: ReaderHolder createReader(); std::future createReaderAsync(); - ReadBufferOrFactory createS3ReadBuffer(const String & key, size_t object_size); + std::unique_ptr createS3ReadBuffer(const String & key, size_t object_size); std::unique_ptr createAsyncS3ReadBuffer(const String & key, const ReadSettings & read_settings, size_t object_size); }; diff --git a/src/Storages/StorageURL.cpp b/src/Storages/StorageURL.cpp index 4c5ed08e26e..efc44a069dd 100644 --- a/src/Storages/StorageURL.cpp +++ b/src/Storages/StorageURL.cpp @@ -248,7 +248,7 @@ StorageURLSource::StorageURLSource( throw Exception(ErrorCodes::LOGICAL_ERROR, "Got empty url list"); auto first_option = uri_options.begin(); - auto [actual_uri, buf_factory] = getFirstAvailableURIAndReadBuffer( + auto [actual_uri, buf] = getFirstAvailableURIAndReadBuffer( first_option, uri_options.end(), context, @@ -262,10 +262,11 @@ StorageURLSource::StorageURLSource( uri_options.size() == 1); curr_uri = actual_uri; + read_buf = std::move(buf); try { - total_size += buf_factory->getFileSize(); + total_size += getFileSizeFromReadBuffer(*read_buf); } catch (...) { @@ -273,16 +274,17 @@ StorageURLSource::StorageURLSource( } // TODO: Pass max_parsing_threads and max_download_threads adjusted for num_streams. - auto input_format = FormatFactory::instance().getInputRandomAccess( + auto input_format = FormatFactory::instance().getInput( format, - std::move(buf_factory), + *read_buf, sample_block, context, max_block_size, - /* is_remote_fs */ true, - compression_method, format_settings, - download_threads); + download_threads, + /*max_download_threads*/ std::nullopt, + /* is_remote_fs */ true, + compression_method); QueryPipelineBuilder builder; builder.init(Pipe(input_format)); @@ -348,7 +350,7 @@ Chunk StorageURLSource::generate() return {}; } -std::tuple StorageURLSource::getFirstAvailableURIAndReadBuffer( +std::tuple> StorageURLSource::getFirstAvailableURIAndReadBuffer( std::vector::const_iterator & option, const std::vector::const_iterator & end, ContextPtr context, @@ -376,40 +378,38 @@ std::tuple StorageURLSource::getFirstAv setCredentials(credentials, request_uri); const auto settings = context->getSettings(); - auto res = std::make_unique( - request_uri, - http_method, - callback, - timeouts, - credentials, - settings.max_http_get_redirects, - settings.max_read_buffer_size, - read_settings, - headers, - &context->getRemoteHostFilter(), - delay_initialization, - /* use_external_buffer */ false, - /* skip_url_not_found_error */ skip_url_not_found_error); - if (options > 1) + try { - // Send a HEAD request to check availability. - try - { - res->getFileInfo(); - } - catch (...) - { - if (first_exception_message.empty()) - first_exception_message = getCurrentExceptionMessage(false); + auto res = std::make_unique( + request_uri, + http_method, + callback, + timeouts, + credentials, + settings.max_http_get_redirects, + settings.max_read_buffer_size, + read_settings, + headers, + &context->getRemoteHostFilter(), + delay_initialization, + /* use_external_buffer */ false, + /* skip_url_not_found_error */ skip_url_not_found_error); - tryLogCurrentException(__PRETTY_FUNCTION__); - - continue; - } + return std::make_tuple(request_uri, std::move(res)); } + catch (...) + { + if (options == 1) + throw; - return std::make_tuple(request_uri, std::move(res)); + if (first_exception_message.empty()) + first_exception_message = getCurrentExceptionMessage(false); + + tryLogCurrentException(__PRETTY_FUNCTION__); + + continue; + } } throw Exception(ErrorCodes::NETWORK_ERROR, "All uri ({}) options are unreachable: {}", options, first_exception_message); @@ -598,7 +598,7 @@ ColumnsDescription IStorageURLBase::getTableStructureFromData( if (it == urls_to_check.cend()) return nullptr; - auto [_, buf_factory] = StorageURLSource::getFirstAvailableURIAndReadBuffer( + auto [_, buf] = StorageURLSource::getFirstAvailableURIAndReadBuffer( it, urls_to_check.cend(), context, @@ -612,7 +612,7 @@ ColumnsDescription IStorageURLBase::getTableStructureFromData( false); ++it; return wrapReadBufferWithCompressionMethod( - buf_factory->getReader(), + std::move(buf), compression_method, static_cast(context->getSettingsRef().zstd_window_log_max)); }; diff --git a/src/Storages/StorageURL.h b/src/Storages/StorageURL.h index d53b72105e4..316b142aec0 100644 --- a/src/Storages/StorageURL.h +++ b/src/Storages/StorageURL.h @@ -183,7 +183,7 @@ public: static Block getHeader(Block sample_block, const std::vector & requested_virtual_columns); - static std::tuple getFirstAvailableURIAndReadBuffer( + static std::tuple> getFirstAvailableURIAndReadBuffer( std::vector::const_iterator & option, const std::vector::const_iterator & end, ContextPtr context, @@ -205,6 +205,7 @@ private: std::shared_ptr uri_iterator; Poco::URI curr_uri; + std::unique_ptr read_buf; std::unique_ptr pipeline; std::unique_ptr reader; diff --git a/tests/integration/test_redirect_url_storage/test.py b/tests/integration/test_redirect_url_storage/test.py index 06ff78707d7..b2178655444 100644 --- a/tests/integration/test_redirect_url_storage/test.py +++ b/tests/integration/test_redirect_url_storage/test.py @@ -151,7 +151,7 @@ def test_url_reconnect(started_cluster): result = node1.query( "select sum(cityHash64(id)) from url('http://hdfs1:50075/webhdfs/v1/storage_big?op=OPEN&namenoderpcaddress=hdfs1:9000&offset=0', 'TSV', 'id Int32') settings http_max_tries = 10, http_retry_max_backoff_ms=1000" ) - assert (int(result), 6581218782194912115) + assert int(result) == 6581218782194912115 thread = threading.Thread(target=select) thread.start() @@ -161,5 +161,5 @@ def test_url_reconnect(started_cluster): thread.join() - assert (int(result), 6581218782194912115) + assert int(result) == 6581218782194912115 assert node1.contains_in_log("Timeout: connect timed out") From 7398b22fa5fc10f015be31035a65f3e6f5bd379f Mon Sep 17 00:00:00 2001 From: JackyWoo Date: Wed, 17 May 2023 10:42:52 +0800 Subject: [PATCH 0262/1072] Add redis storage --- src/CMakeLists.txt | 1 + src/Common/ErrorCodes.cpp | 1 + src/Storages/NamedCollectionsHelpers.h | 4 + src/Storages/StorageRedis.cpp | 231 ++++++++++++++++++ src/Storages/StorageRedis.h | 91 +++++++ src/TableFunctions/TableFunctionRedis.cpp | 88 +++++++ src/TableFunctions/TableFunctionRedis.h | 29 +++ src/TableFunctions/registerTableFunctions.cpp | 1 + src/TableFunctions/registerTableFunctions.h | 1 + 9 files changed, 447 insertions(+) create mode 100644 src/Storages/StorageRedis.cpp create mode 100644 src/Storages/StorageRedis.h create mode 100644 src/TableFunctions/TableFunctionRedis.cpp create mode 100644 src/TableFunctions/TableFunctionRedis.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 622e18d4ff7..6608d86b5ed 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -412,6 +412,7 @@ dbms_target_link_libraries ( boost::system clickhouse_common_io Poco::MongoDB + Poco::Redis ) if (TARGET ch::mysqlxx) diff --git a/src/Common/ErrorCodes.cpp b/src/Common/ErrorCodes.cpp index 83a7314ac7a..505cf0aac8f 100644 --- a/src/Common/ErrorCodes.cpp +++ b/src/Common/ErrorCodes.cpp @@ -580,6 +580,7 @@ M(695, ASYNC_LOAD_FAILED) \ M(696, ASYNC_LOAD_CANCELED) \ M(697, CANNOT_RESTORE_TO_NONENCRYPTED_DISK) \ + M(698, INVALID_REDIS_STORAGE_TYPE) \ \ M(999, KEEPER_EXCEPTION) \ M(1000, POCO_EXCEPTION) \ diff --git a/src/Storages/NamedCollectionsHelpers.h b/src/Storages/NamedCollectionsHelpers.h index 1473a3fbe48..d0d6a526f9b 100644 --- a/src/Storages/NamedCollectionsHelpers.h +++ b/src/Storages/NamedCollectionsHelpers.h @@ -36,6 +36,10 @@ struct MongoDBEqualKeysSet static constexpr std::array, 4> equal_keys{ std::pair{"username", "user"}, std::pair{"database", "db"}, std::pair{"hostname", "host"}, std::pair{"table", "collection"}}; }; +struct RedisEqualKeysSet +{ + static constexpr std::array, 4> equal_keys{std::pair{"hostname", "host"}}; +}; template struct NamedCollectionValidateKey { diff --git a/src/Storages/StorageRedis.cpp b/src/Storages/StorageRedis.cpp new file mode 100644 index 00000000000..1daeed255ea --- /dev/null +++ b/src/Storages/StorageRedis.cpp @@ -0,0 +1,231 @@ +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int INVALID_REDIS_STORAGE_TYPE; + extern const int NOT_IMPLEMENTED; +} + +StorageRedis::StorageRedis( + const StorageID & table_id_, + const Configuration & configuration_, + const ColumnsDescription & columns_, + const ConstraintsDescription & constraints_, + const String & comment_) : ta +{ + +} + + +Pipe StorageRedis::read( + const Names & column_names, + const StorageSnapshotPtr & storage_snapshot, + SelectQueryInfo & /*query_info*/, + ContextPtr /*context*/, + QueryProcessingStage::Enum /*processed_stage*/, + size_t max_block_size, + size_t /*num_streams*/) +{ + connectIfNotConnected(); + + storage_snapshot->check(column_names); + + Block sample_block; + for (const String & column_name : column_names) + { + auto column_data = storage_snapshot->metadata->getColumns().getPhysical(column_name); + sample_block.insert({column_data.type, column_data.name}); + } + + return Pipe(std::make_shared( + connection, createCursor(database_name, collection_name, sample_block), sample_block, max_block_size)); +} + + +SinkToStoragePtr StorageRedis::write( + const ASTPtr & /*query*/, + const StorageMetadataPtr & /*metadata_snapshot*/, + ContextPtr /*context*/) +{ + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method write is unsupported for StorageRedis"); +} + +StorageRedis::Configuration StorageRedis::getConfiguration(ASTs engine_args, ContextPtr context) +{ + Configuration configuration; + + if (auto named_collection = tryGetNamedCollectionWithOverrides(engine_args, context)) + { + validateNamedCollection( + *named_collection, + ValidateKeysMultiset{"host", "port", "hostname", "password", "db_id", "storage_type"}, + {}); + + configuration.host = named_collection->getAny({"host", "hostname"}); + configuration.port = static_cast(named_collection->get("port")); + configuration.password = named_collection->get("password"); + configuration.db_id = named_collection->getAny({"db_id"}); + configuration.storage_type = toStorageType(named_collection->getOrDefault("storage_type", "")); + } + else + { + for (auto & engine_arg : engine_args) + engine_arg = evaluateConstantExpressionOrIdentifierAsLiteral(engine_arg, context); + + /// 6379 is the default Redis port. + auto parsed_host_port = parseAddress(checkAndGetLiteralArgument(engine_args[0], "host:port"), 6379); + + configuration.host = parsed_host_port.first; + configuration.port = parsed_host_port.second; + configuration.db_id = checkAndGetLiteralArgument(engine_args[1], "db_id"); + configuration.password = checkAndGetLiteralArgument(engine_args[2], "password"); + configuration.storage_type = toStorageType(checkAndGetLiteralArgument(engine_args[3], "storage_type")); + } + + context->getRemoteHostFilter().checkHostAndPort(configuration.host, toString(configuration.port)); + + return configuration; +} + +void StorageRedis::connectIfNotConnected() +{ + +} + + +class StorageRedisSink : public SinkToStorage +{ +public: + explicit StorageRedisSink( + const std::string & collection_name_, + const std::string & db_name_, + const StorageMetadataPtr & metadata_snapshot_, + std::shared_ptr connection_) + : SinkToStorage(metadata_snapshot_->getSampleBlock()) + , collection_name(collection_name_) + , db_name(db_name_) + , metadata_snapshot{metadata_snapshot_} + , connection(connection_) + { + } + + String getName() const override { return "StorageRedisSink"; } + + void consume(Chunk chunk) override + { + Poco::MongoDB::Database db(db_name); + Poco::MongoDB::Document::Ptr index = new Poco::MongoDB::Document(); + + auto block = getHeader().cloneWithColumns(chunk.detachColumns()); + + size_t num_rows = block.rows(); + size_t num_cols = block.columns(); + + const auto columns = block.getColumns(); + const auto data_types = block.getDataTypes(); + const auto data_names = block.getNames(); + + std::vector row(num_cols); + for (const auto i : collections::range(0, num_rows)) + { + for (const auto j : collections::range(0, num_cols)) + { + WriteBufferFromOwnString ostr; + data_types[j]->getDefaultSerialization()->serializeText(*columns[j], i, ostr, FormatSettings{}); + row[j] = ostr.str(); + index->add(data_names[j], row[j]); + } + } + Poco::SharedPtr insert_request = db.createInsertRequest(collection_name); + insert_request->documents().push_back(index); + connection->sendRequest(*insert_request); + } + +private: + String collection_name; + String db_name; + StorageMetadataPtr metadata_snapshot; + std::shared_ptr connection; +}; + + +using StorageType = StorageRedis::StorageType; + +String StorageRedis::toString(StorageType storage_type) +{ + static const std::unordered_map type_to_str_map + = {{StorageType::SIMPLE, "simple"}, + {StorageType::LIST, "list"}, + {StorageType::SET, "set"}, + {StorageType::HASH, "hash"}, + {StorageType::ZSET, "zset"}}; + + auto iter = type_to_str_map.find(storage_type); + return iter->second; +} + +StorageType StorageRedis::toStorageType(const String & storage_type) +{ + static const std::unordered_map str_to_type_map + = {{"simple", StorageType::SIMPLE}, + {"list", StorageType::LIST}, + {"set", StorageType::SET}, + {"hash", StorageType::HASH}, + {"zset", StorageType::ZSET}}; + + auto iter = str_to_type_map.find(storage_type); + if (iter == str_to_type_map.end()) + { + throw Exception(ErrorCodes::INVALID_REDIS_STORAGE_TYPE, "invalid redis storage type: {}", storage_type); + } + return iter->second; +} + +void registerStorageRedis(StorageFactory & factory) +{ + factory.registerStorage( + "MongoDB", + [](const StorageFactory::Arguments & args) + { + auto configuration = StorageRedis::getConfiguration(args.engine_args, args.getLocalContext()); + + return std::make_shared( + args.table_id, + configuration.host, + configuration.port, + configuration.database, + configuration.table, + configuration.username, + configuration.password, + configuration.options, + args.columns, + args.constraints, + args.comment); + }, + { + .source_access_type = AccessType::MONGO, + }); +} + +} diff --git a/src/Storages/StorageRedis.h b/src/Storages/StorageRedis.h new file mode 100644 index 00000000000..8ba4ec831bb --- /dev/null +++ b/src/Storages/StorageRedis.h @@ -0,0 +1,91 @@ +#pragma once + +#include +#include +#include + +namespace DB +{ +/* Implements storage in the Redis. + * Use ENGINE = Redis(host:port, db_id, password, storage_type); + * Read only. + */ +class StorageRedis : public IStorage +{ +public: + enum class StorageType + { + SIMPLE, + LIST, + SET, + HASH, + ZSET + }; + + static String toString(StorageType storage_type); + static StorageType toStorageType(const String & storage_type); + + struct Configuration + { + String host; + uint32_t port; + String db_id; + String password; + StorageType storage_type; + }; + + using RedisArray = Poco::Redis::Array; + using RedisCommand = Poco::Redis::Command; + + using ClientPtr = std::unique_ptr; + using Pool = BorrowedObjectPool; + using PoolPtr = std::shared_ptr; + + struct Connection + { + Connection(PoolPtr pool_, ClientPtr client_); + ~Connection(); + + PoolPtr pool; + ClientPtr client; + }; + + using ConnectionPtr = std::unique_ptr; + + static Configuration getConfiguration(ASTs engine_args, ContextPtr context); + + StorageRedis( + const StorageID & table_id_, + const Configuration & configuration_, + const ColumnsDescription & columns_, + const ConstraintsDescription & constraints_, + const String & comment_); + + std::string getName() const override { return "Redis"; } + + Pipe read( + const Names & column_names, + const StorageSnapshotPtr & storage_snapshot, + SelectQueryInfo & query_info, + ContextPtr context, + QueryProcessingStage::Enum processed_stage, + size_t max_block_size, + size_t num_streams) override; + + SinkToStoragePtr write( + const ASTPtr & query, + const StorageMetadataPtr & /*metadata_snapshot*/, + ContextPtr context) override; + +private: + Configuration configuration; + StorageID table_id; + ColumnsDescription columns; + ConstraintsDescription constraints; + String comment; + + std::shared_ptr connection; + void connectIfNotConnected(); +}; + +} diff --git a/src/TableFunctions/TableFunctionRedis.cpp b/src/TableFunctions/TableFunctionRedis.cpp new file mode 100644 index 00000000000..9432f766aa8 --- /dev/null +++ b/src/TableFunctions/TableFunctionRedis.cpp @@ -0,0 +1,88 @@ +#include + +#include + +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; +} + + +StoragePtr TableFunctionRedis::executeImpl( + const ASTPtr & /*ast_function*/, ContextPtr context, const String & table_name, ColumnsDescription /*cached_columns*/) const +{ + auto columns = getActualTableStructure(context); + auto storage = std::make_shared( + StorageID(configuration->db_id, table_name), configuration, columns, ConstraintsDescription(), String{});// TODO + storage->startup(); + return storage; +} + +ColumnsDescription TableFunctionRedis::getActualTableStructure(ContextPtr context) const +{ + /// generate table structure by storage type. + String structure; + switch (configuration->storage_type) + { + case StorageRedis::StorageType::SIMPLE: + structure = "key String, value String"; + break; + case StorageRedis::StorageType::HASH: + structure = "key String, field, String, value String"; + break; + case StorageRedis::StorageType::LIST: + structure = "key String, value Array(String)"; + break; + case StorageRedis::StorageType::SET: + structure = "key String, value Array(String)"; + break; + case StorageRedis::StorageType::ZSET: + structure = "key String, value Array(String)"; + break; + } + return parseColumnsListFromString(structure, context); +} + +void TableFunctionRedis::parseArguments(const ASTPtr & ast_function, ContextPtr context) +{ + const auto & func_args = ast_function->as(); + if (!func_args.arguments) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Table function 'redis' must have arguments."); + + ASTs & args = func_args.arguments->children; + + if (args.size() != 4) + { + throw Exception( + ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Table function 'Redis' requires from 4 parameters: " + "redis('host:port', db_id, 'password', 'storage_type')"); + } + configuration = StorageRedis::getConfiguration(args, context); +} + + +void registerTableFunctionRedis(TableFunctionFactory & factory) +{ + factory.registerFunction(); +} + +} diff --git a/src/TableFunctions/TableFunctionRedis.h b/src/TableFunctions/TableFunctionRedis.h new file mode 100644 index 00000000000..d333cd5a42f --- /dev/null +++ b/src/TableFunctions/TableFunctionRedis.h @@ -0,0 +1,29 @@ +#pragma once + +#include +#include +#include + +namespace DB +{ + +class TableFunctionRedis : public ITableFunction +{ +public: + static constexpr auto name = "redis"; + String getName() const override { return name; } + +private: + StoragePtr executeImpl( + const ASTPtr & ast_function, ContextPtr context, + const String & table_name, ColumnsDescription cached_columns) const override; + + const char * getStorageTypeName() const override { return "Redis"; } + + ColumnsDescription getActualTableStructure(ContextPtr context) const override; + void parseArguments(const ASTPtr & ast_function, ContextPtr context) override; + + std::optional configuration; +}; + +} diff --git a/src/TableFunctions/registerTableFunctions.cpp b/src/TableFunctions/registerTableFunctions.cpp index 4f3411df4c5..bfb83818f22 100644 --- a/src/TableFunctions/registerTableFunctions.cpp +++ b/src/TableFunctions/registerTableFunctions.cpp @@ -21,6 +21,7 @@ void registerTableFunctions() registerTableFunctionInput(factory); registerTableFunctionGenerate(factory); registerTableFunctionMongoDB(factory); + registerTableFunctionRedis(factory); registerTableFunctionMeiliSearch(factory); diff --git a/src/TableFunctions/registerTableFunctions.h b/src/TableFunctions/registerTableFunctions.h index c51522a5e99..cf0dee7f792 100644 --- a/src/TableFunctions/registerTableFunctions.h +++ b/src/TableFunctions/registerTableFunctions.h @@ -18,6 +18,7 @@ void registerTableFunctionValues(TableFunctionFactory & factory); void registerTableFunctionInput(TableFunctionFactory & factory); void registerTableFunctionGenerate(TableFunctionFactory & factory); void registerTableFunctionMongoDB(TableFunctionFactory & factory); +void registerTableFunctionRedis(TableFunctionFactory & factory); void registerTableFunctionMeiliSearch(TableFunctionFactory & factory); From e91867373cca6d91455b51af05575c48e6d1af9e Mon Sep 17 00:00:00 2001 From: JackyWoo Date: Sat, 20 May 2023 11:48:57 +0800 Subject: [PATCH 0263/1072] Add table function Redis --- src/Access/Common/AccessType.h | 1 + src/Dictionaries/RedisDictionarySource.cpp | 80 +---------- src/Dictionaries/RedisDictionarySource.h | 44 +----- src/Dictionaries/RedisSource.cpp | 7 +- src/Dictionaries/RedisSource.h | 19 +-- src/Storages/RedisCommon.cpp | 98 +++++++++++++ src/Storages/RedisCommon.h | 61 ++++++++ src/Storages/StorageRedis.cpp | 156 +++++---------------- src/Storages/StorageRedis.h | 55 ++------ src/Storages/registerStorages.cpp | 2 + src/TableFunctions/TableFunctionRedis.cpp | 28 ++-- src/TableFunctions/TableFunctionRedis.h | 2 +- 12 files changed, 231 insertions(+), 322 deletions(-) create mode 100644 src/Storages/RedisCommon.cpp create mode 100644 src/Storages/RedisCommon.h diff --git a/src/Access/Common/AccessType.h b/src/Access/Common/AccessType.h index 6394c0279a7..78c341cdcb5 100644 --- a/src/Access/Common/AccessType.h +++ b/src/Access/Common/AccessType.h @@ -201,6 +201,7 @@ enum class AccessType M(URL, "", GLOBAL, SOURCES) \ M(REMOTE, "", GLOBAL, SOURCES) \ M(MONGO, "", GLOBAL, SOURCES) \ + M(Redis, "", GLOBAL, SOURCES) \ M(MEILISEARCH, "", GLOBAL, SOURCES) \ M(MYSQL, "", GLOBAL, SOURCES) \ M(POSTGRES, "", GLOBAL, SOURCES) \ diff --git a/src/Dictionaries/RedisDictionarySource.cpp b/src/Dictionaries/RedisDictionarySource.cpp index 6e4c5d1d5d9..db27801a38e 100644 --- a/src/Dictionaries/RedisDictionarySource.cpp +++ b/src/Dictionaries/RedisDictionarySource.cpp @@ -3,10 +3,6 @@ #include "DictionaryStructure.h" #include "registerDictionaries.h" -#include -#include -#include -#include #include #include #include @@ -52,7 +48,7 @@ namespace DB auto port = config.getUInt(redis_config_prefix + ".port"); global_context->getRemoteHostFilter().checkHostAndPort(host, toString(port)); - RedisDictionarySource::Configuration configuration = + RedisConfiguration configuration = { .host = host, .port = static_cast(port), @@ -68,26 +64,13 @@ namespace DB factory.registerSource("redis", create_table_source); } - RedisDictionarySource::Connection::Connection(PoolPtr pool_, ClientPtr client_) - : pool(std::move(pool_)), client(std::move(client_)) - { - } - - RedisDictionarySource::Connection::~Connection() - { - pool->returnObject(std::move(client)); - } - - static constexpr size_t REDIS_MAX_BLOCK_SIZE = DEFAULT_BLOCK_SIZE; - static constexpr size_t REDIS_LOCK_ACQUIRE_TIMEOUT_MS = 5000; - RedisDictionarySource::RedisDictionarySource( const DictionaryStructure & dict_struct_, - const Configuration & configuration_, + const RedisConfiguration & configuration_, const Block & sample_block_) : dict_struct{dict_struct_} , configuration(configuration_) - , pool(std::make_shared(configuration.pool_size)) + , pool(std::make_shared(configuration.pool_size)) , sample_block{sample_block_} { if (dict_struct.attributes.size() != 1) @@ -139,7 +122,7 @@ namespace DB QueryPipeline RedisDictionarySource::loadAll() { - auto connection = getConnection(); + auto connection = getRedisConnection(pool, configuration); RedisCommand command_for_keys("KEYS"); command_for_keys << "*"; @@ -195,7 +178,7 @@ namespace DB QueryPipeline RedisDictionarySource::loadIds(const std::vector & ids) { - auto connection = getConnection(); + auto connection = getRedisConnection(pool, configuration); if (configuration.storage_type == RedisStorageType::HASH_MAP) throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Cannot use loadIds with 'hash_map' storage type"); @@ -215,7 +198,7 @@ namespace DB QueryPipeline RedisDictionarySource::loadKeys(const Columns & key_columns, const std::vector & requested_rows) { - auto connection = getConnection(); + auto connection = getRedisConnection(pool, configuration); if (key_columns.size() != dict_struct.key->size()) throw Exception(ErrorCodes::LOGICAL_ERROR, "The size of key_columns does not equal to the size of dictionary key"); @@ -248,55 +231,4 @@ namespace DB return "Redis: " + configuration.host + ':' + DB::toString(configuration.port); } - RedisDictionarySource::ConnectionPtr RedisDictionarySource::getConnection() const - { - ClientPtr client; - bool ok = pool->tryBorrowObject(client, - [] { return std::make_unique(); }, - REDIS_LOCK_ACQUIRE_TIMEOUT_MS); - - if (!ok) - throw Exception(ErrorCodes::TIMEOUT_EXCEEDED, - "Could not get connection from pool, timeout exceeded {} seconds", - REDIS_LOCK_ACQUIRE_TIMEOUT_MS); - - if (!client->isConnected()) - { - try - { - client->connect(configuration.host, configuration.port); - - if (!configuration.password.empty()) - { - RedisCommand command("AUTH"); - command << configuration.password; - String reply = client->execute(command); - if (reply != "OK") - throw Exception(ErrorCodes::INTERNAL_REDIS_ERROR, - "Authentication failed with reason {}", reply); - } - - if (configuration.db_index != 0) - { - RedisCommand command("SELECT"); - command << std::to_string(configuration.db_index); - String reply = client->execute(command); - if (reply != "OK") - throw Exception(ErrorCodes::INTERNAL_REDIS_ERROR, - "Selecting database with index {} failed with reason {}", - configuration.db_index, reply); - } - } - catch (...) - { - if (client->isConnected()) - client->disconnect(); - - pool->returnObject(std::move(client)); - throw; - } - } - - return std::make_unique(pool, std::move(client)); - } } diff --git a/src/Dictionaries/RedisDictionarySource.h b/src/Dictionaries/RedisDictionarySource.h index 8fb6f93193b..c7786284dc4 100644 --- a/src/Dictionaries/RedisDictionarySource.h +++ b/src/Dictionaries/RedisDictionarySource.h @@ -5,6 +5,7 @@ #include "DictionaryStructure.h" #include "IDictionarySource.h" +#include namespace Poco { @@ -23,47 +24,12 @@ namespace DB extern const int NOT_IMPLEMENTED; } - enum class RedisStorageType - { - SIMPLE, - HASH_MAP, - UNKNOWN - }; - class RedisDictionarySource final : public IDictionarySource { public: - using RedisArray = Poco::Redis::Array; - using RedisCommand = Poco::Redis::Command; - - using ClientPtr = std::unique_ptr; - using Pool = BorrowedObjectPool; - using PoolPtr = std::shared_ptr; - - struct Configuration - { - const std::string host; - const UInt16 port; - const UInt32 db_index; - const std::string password; - const RedisStorageType storage_type; - const size_t pool_size; - }; - - struct Connection - { - Connection(PoolPtr pool_, ClientPtr client_); - ~Connection(); - - PoolPtr pool; - ClientPtr client; - }; - - using ConnectionPtr = std::unique_ptr; - RedisDictionarySource( const DictionaryStructure & dict_struct_, - const Configuration & configuration_, + const RedisConfiguration & configuration_, const Block & sample_block_); RedisDictionarySource(const RedisDictionarySource & other); @@ -92,12 +58,10 @@ namespace DB std::string toString() const override; private: - ConnectionPtr getConnection() const; - const DictionaryStructure dict_struct; - const Configuration configuration; + const RedisConfiguration configuration; - PoolPtr pool; + RedisPoolPtr pool; Block sample_block; }; } diff --git a/src/Dictionaries/RedisSource.cpp b/src/Dictionaries/RedisSource.cpp index 4622f65a1a9..9abaf7f0ac5 100644 --- a/src/Dictionaries/RedisSource.cpp +++ b/src/Dictionaries/RedisSource.cpp @@ -3,11 +3,6 @@ #include #include -#include -#include -#include -#include - #include #include #include @@ -30,7 +25,7 @@ namespace DB RedisSource::RedisSource( - ConnectionPtr connection_, + RedisConnectionPtr connection_, const RedisArray & keys_, const RedisStorageType & storage_type_, const DB::Block & sample_block, diff --git a/src/Dictionaries/RedisSource.h b/src/Dictionaries/RedisSource.h index 0f8cc317003..4537e496061 100644 --- a/src/Dictionaries/RedisSource.h +++ b/src/Dictionaries/RedisSource.h @@ -6,29 +6,18 @@ #include #include #include +#include #include "RedisDictionarySource.h" -namespace Poco -{ - namespace Redis - { - class Client; - } -} - namespace DB { class RedisSource final : public ISource { public: - using RedisArray = Poco::Redis::Array; - using RedisBulkString = Poco::Redis::BulkString; - using ConnectionPtr = RedisDictionarySource::ConnectionPtr; - RedisSource( - ConnectionPtr connection_, - const Poco::Redis::Array & keys_, + RedisConnectionPtr connection_, + const RedisArray & keys_, const RedisStorageType & storage_type_, const Block & sample_block, size_t max_block_size); @@ -40,7 +29,7 @@ namespace DB private: Chunk generate() override; - ConnectionPtr connection; + RedisConnectionPtr connection; Poco::Redis::Array keys; RedisStorageType storage_type; const size_t max_block_size; diff --git a/src/Storages/RedisCommon.cpp b/src/Storages/RedisCommon.cpp new file mode 100644 index 00000000000..397189e7485 --- /dev/null +++ b/src/Storages/RedisCommon.cpp @@ -0,0 +1,98 @@ +#include "RedisCommon.h" +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int INVALID_REDIS_STORAGE_TYPE; + extern const int INTERNAL_REDIS_ERROR; + extern const int TIMEOUT_EXCEEDED; +} + +RedisConnection::RedisConnection(RedisPoolPtr pool_, RedisClientPtr client_) + : pool(std::move(pool_)), client(std::move(client_)) +{ +} + +RedisConnection::~RedisConnection() +{ + pool->returnObject(std::move(client)); +} + +String toString(RedisStorageType storage_type) +{ + static const std::unordered_map type_to_str_map + = {{RedisStorageType::SIMPLE, "simple"}, {RedisStorageType::HASH_MAP, "hash_map"}}; + + auto iter = type_to_str_map.find(storage_type); + return iter->second; +} + +RedisStorageType toRedisStorageType(const String & storage_type) +{ + static const std::unordered_map str_to_type_map + = {{"simple", RedisStorageType::SIMPLE}, {"hash", RedisStorageType::HASH_MAP}}; + + auto iter = str_to_type_map.find(storage_type); + if (iter == str_to_type_map.end()) + { + throw Exception(ErrorCodes::INVALID_REDIS_STORAGE_TYPE, "invalid redis storage type: {}", storage_type); + } + return iter->second; +} + +RedisConnectionPtr getRedisConnection(RedisPoolPtr pool, const RedisConfiguration & configuration) +{ + RedisClientPtr client; + bool ok = pool->tryBorrowObject(client, + [] { return std::make_unique(); }, + REDIS_LOCK_ACQUIRE_TIMEOUT_MS); + + if (!ok) + throw Exception(ErrorCodes::TIMEOUT_EXCEEDED, + "Could not get connection from pool, timeout exceeded {} seconds", + REDIS_LOCK_ACQUIRE_TIMEOUT_MS); + + if (!client->isConnected()) + { + try + { + client->connect(configuration.host, configuration.port); + + if (!configuration.password.empty()) + { + RedisCommand command("AUTH"); + command << configuration.password; + String reply = client->execute(command); + if (reply != "OK") + throw Exception(ErrorCodes::INTERNAL_REDIS_ERROR, + "Authentication failed with reason {}", reply); + } + + if (configuration.db_index != 0) + { + RedisCommand command("SELECT"); + command << std::to_string(configuration.db_index); + String reply = client->execute(command); + if (reply != "OK") + throw Exception(ErrorCodes::INTERNAL_REDIS_ERROR, + "Selecting database with index {} failed with reason {}", + configuration.db_index, reply); + } + } + catch (...) + { + if (client->isConnected()) + client->disconnect(); + + pool->returnObject(std::move(client)); + throw; + } + } + + return std::make_unique(pool, std::move(client)); +} + +} diff --git a/src/Storages/RedisCommon.h b/src/Storages/RedisCommon.h new file mode 100644 index 00000000000..6069d3d9a0c --- /dev/null +++ b/src/Storages/RedisCommon.h @@ -0,0 +1,61 @@ +#pragma once + +#include +#include +#include + +#include +#include + +namespace DB +{ +namespace ErrorCodes +{ + extern const int NOT_IMPLEMENTED; +} + +static constexpr size_t REDIS_MAX_BLOCK_SIZE = DEFAULT_BLOCK_SIZE; +static constexpr size_t REDIS_LOCK_ACQUIRE_TIMEOUT_MS = 5000; + +enum class RedisStorageType +{ + SIMPLE, + HASH_MAP, + UNKNOWN +}; + +String toString(RedisStorageType storage_type); +RedisStorageType toRedisStorageType(const String & storage_type); + +struct RedisConfiguration +{ + String host; + uint32_t port; + uint32_t db_index; + String password; + RedisStorageType storage_type; + uint32_t pool_size; +}; + +using RedisArray = Poco::Redis::Array; +using RedisCommand = Poco::Redis::Command; +using RedisBulkString = Poco::Redis::BulkString; + +using RedisClientPtr = std::unique_ptr; +using RedisPool = BorrowedObjectPool; +using RedisPoolPtr = std::shared_ptr; + +struct RedisConnection +{ + RedisConnection(RedisPoolPtr pool_, RedisClientPtr client_); + ~RedisConnection(); + + RedisPoolPtr pool; + RedisClientPtr client; +}; + +using RedisConnectionPtr = std::unique_ptr; + +RedisConnectionPtr getRedisConnection(RedisPoolPtr pool, const RedisConfiguration & configuration) ; + +} diff --git a/src/Storages/StorageRedis.cpp b/src/Storages/StorageRedis.cpp index 1daeed255ea..055617b6a96 100644 --- a/src/Storages/StorageRedis.cpp +++ b/src/Storages/StorageRedis.cpp @@ -1,6 +1,6 @@ -#include -#include #include +#include +#include #include #include @@ -10,7 +10,6 @@ #include #include #include -#include #include #include #include @@ -23,21 +22,25 @@ namespace DB namespace ErrorCodes { extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; - extern const int INVALID_REDIS_STORAGE_TYPE; extern const int NOT_IMPLEMENTED; } StorageRedis::StorageRedis( const StorageID & table_id_, - const Configuration & configuration_, + const RedisConfiguration & configuration_, const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, - const String & comment_) : ta + const String & comment_) + : IStorage(table_id_) + , table_id(table_id_) + , configuration(configuration_) + , columns(columns_) + , constraints(constraints_) + , comment(comment_) { - + pool = std::make_shared(configuration.pool_size); } - Pipe StorageRedis::read( const Names & column_names, const StorageSnapshotPtr & storage_snapshot, @@ -47,7 +50,7 @@ Pipe StorageRedis::read( size_t max_block_size, size_t /*num_streams*/) { - connectIfNotConnected(); + auto connection = getRedisConnection(pool, configuration); storage_snapshot->check(column_names); @@ -58,8 +61,14 @@ Pipe StorageRedis::read( sample_block.insert({column_data.type, column_data.name}); } - return Pipe(std::make_shared( - connection, createCursor(database_name, collection_name, sample_block), sample_block, max_block_size)); + RedisArray keys; + RedisCommand command_for_keys("KEYS"); + /// generate keys by table name prefix + command_for_keys << table_id.getTableName() + ":" + toString(configuration.storage_type) + ":*"; + + /// Get only keys for specified storage type. + auto all_keys = connection->client->execute(command_for_keys); + return Pipe(std::make_shared(std::move(connection), all_keys, configuration.storage_type, sample_block, max_block_size)); } @@ -71,22 +80,23 @@ SinkToStoragePtr StorageRedis::write( throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method write is unsupported for StorageRedis"); } -StorageRedis::Configuration StorageRedis::getConfiguration(ASTs engine_args, ContextPtr context) +RedisConfiguration StorageRedis::getConfiguration(ASTs engine_args, ContextPtr context) { - Configuration configuration; + RedisConfiguration configuration; if (auto named_collection = tryGetNamedCollectionWithOverrides(engine_args, context)) { validateNamedCollection( *named_collection, - ValidateKeysMultiset{"host", "port", "hostname", "password", "db_id", "storage_type"}, + ValidateKeysMultiset{"host", "port", "hostname", "password", "db_index", "storage_type"}, {}); configuration.host = named_collection->getAny({"host", "hostname"}); - configuration.port = static_cast(named_collection->get("port")); + configuration.port = static_cast(named_collection->get("port")); configuration.password = named_collection->get("password"); - configuration.db_id = named_collection->getAny({"db_id"}); - configuration.storage_type = toStorageType(named_collection->getOrDefault("storage_type", "")); + configuration.db_index = static_cast(named_collection->get({"db_index"})); + configuration.storage_type = toRedisStorageType(named_collection->getOrDefault("storage_type", "")); + configuration.pool_size = 16; /// TODO } else { @@ -98,133 +108,33 @@ StorageRedis::Configuration StorageRedis::getConfiguration(ASTs engine_args, Con configuration.host = parsed_host_port.first; configuration.port = parsed_host_port.second; - configuration.db_id = checkAndGetLiteralArgument(engine_args[1], "db_id"); + configuration.db_index = static_cast(checkAndGetLiteralArgument(engine_args[1], "db_index")); configuration.password = checkAndGetLiteralArgument(engine_args[2], "password"); - configuration.storage_type = toStorageType(checkAndGetLiteralArgument(engine_args[3], "storage_type")); + configuration.storage_type = toRedisStorageType(checkAndGetLiteralArgument(engine_args[3], "storage_type")); + configuration.pool_size = 16; /// TODO } context->getRemoteHostFilter().checkHostAndPort(configuration.host, toString(configuration.port)); - return configuration; } -void StorageRedis::connectIfNotConnected() -{ - -} - - -class StorageRedisSink : public SinkToStorage -{ -public: - explicit StorageRedisSink( - const std::string & collection_name_, - const std::string & db_name_, - const StorageMetadataPtr & metadata_snapshot_, - std::shared_ptr connection_) - : SinkToStorage(metadata_snapshot_->getSampleBlock()) - , collection_name(collection_name_) - , db_name(db_name_) - , metadata_snapshot{metadata_snapshot_} - , connection(connection_) - { - } - - String getName() const override { return "StorageRedisSink"; } - - void consume(Chunk chunk) override - { - Poco::MongoDB::Database db(db_name); - Poco::MongoDB::Document::Ptr index = new Poco::MongoDB::Document(); - - auto block = getHeader().cloneWithColumns(chunk.detachColumns()); - - size_t num_rows = block.rows(); - size_t num_cols = block.columns(); - - const auto columns = block.getColumns(); - const auto data_types = block.getDataTypes(); - const auto data_names = block.getNames(); - - std::vector row(num_cols); - for (const auto i : collections::range(0, num_rows)) - { - for (const auto j : collections::range(0, num_cols)) - { - WriteBufferFromOwnString ostr; - data_types[j]->getDefaultSerialization()->serializeText(*columns[j], i, ostr, FormatSettings{}); - row[j] = ostr.str(); - index->add(data_names[j], row[j]); - } - } - Poco::SharedPtr insert_request = db.createInsertRequest(collection_name); - insert_request->documents().push_back(index); - connection->sendRequest(*insert_request); - } - -private: - String collection_name; - String db_name; - StorageMetadataPtr metadata_snapshot; - std::shared_ptr connection; -}; - - -using StorageType = StorageRedis::StorageType; - -String StorageRedis::toString(StorageType storage_type) -{ - static const std::unordered_map type_to_str_map - = {{StorageType::SIMPLE, "simple"}, - {StorageType::LIST, "list"}, - {StorageType::SET, "set"}, - {StorageType::HASH, "hash"}, - {StorageType::ZSET, "zset"}}; - - auto iter = type_to_str_map.find(storage_type); - return iter->second; -} - -StorageType StorageRedis::toStorageType(const String & storage_type) -{ - static const std::unordered_map str_to_type_map - = {{"simple", StorageType::SIMPLE}, - {"list", StorageType::LIST}, - {"set", StorageType::SET}, - {"hash", StorageType::HASH}, - {"zset", StorageType::ZSET}}; - - auto iter = str_to_type_map.find(storage_type); - if (iter == str_to_type_map.end()) - { - throw Exception(ErrorCodes::INVALID_REDIS_STORAGE_TYPE, "invalid redis storage type: {}", storage_type); - } - return iter->second; -} - void registerStorageRedis(StorageFactory & factory) { factory.registerStorage( - "MongoDB", + "Redis", [](const StorageFactory::Arguments & args) { auto configuration = StorageRedis::getConfiguration(args.engine_args, args.getLocalContext()); return std::make_shared( args.table_id, - configuration.host, - configuration.port, - configuration.database, - configuration.table, - configuration.username, - configuration.password, - configuration.options, + configuration, args.columns, args.constraints, args.comment); }, { - .source_access_type = AccessType::MONGO, + .source_access_type = AccessType::Redis, }); } diff --git a/src/Storages/StorageRedis.h b/src/Storages/StorageRedis.h index 8ba4ec831bb..60db75dd384 100644 --- a/src/Storages/StorageRedis.h +++ b/src/Storages/StorageRedis.h @@ -7,56 +7,15 @@ namespace DB { /* Implements storage in the Redis. - * Use ENGINE = Redis(host:port, db_id, password, storage_type); + * Use ENGINE = Redis(host:port, db_index, password, storage_type); * Read only. */ class StorageRedis : public IStorage { public: - enum class StorageType - { - SIMPLE, - LIST, - SET, - HASH, - ZSET - }; - - static String toString(StorageType storage_type); - static StorageType toStorageType(const String & storage_type); - - struct Configuration - { - String host; - uint32_t port; - String db_id; - String password; - StorageType storage_type; - }; - - using RedisArray = Poco::Redis::Array; - using RedisCommand = Poco::Redis::Command; - - using ClientPtr = std::unique_ptr; - using Pool = BorrowedObjectPool; - using PoolPtr = std::shared_ptr; - - struct Connection - { - Connection(PoolPtr pool_, ClientPtr client_); - ~Connection(); - - PoolPtr pool; - ClientPtr client; - }; - - using ConnectionPtr = std::unique_ptr; - - static Configuration getConfiguration(ASTs engine_args, ContextPtr context); - StorageRedis( const StorageID & table_id_, - const Configuration & configuration_, + const RedisConfiguration & configuration_, const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, const String & comment_); @@ -77,15 +36,17 @@ public: const StorageMetadataPtr & /*metadata_snapshot*/, ContextPtr context) override; + static RedisConfiguration getConfiguration(ASTs engine_args, ContextPtr context); + private: - Configuration configuration; StorageID table_id; + RedisConfiguration configuration; + ColumnsDescription columns; ConstraintsDescription constraints; - String comment; - std::shared_ptr connection; - void connectIfNotConnected(); + String comment; + RedisPoolPtr pool; }; } diff --git a/src/Storages/registerStorages.cpp b/src/Storages/registerStorages.cpp index 8be176a5375..84994298b8e 100644 --- a/src/Storages/registerStorages.cpp +++ b/src/Storages/registerStorages.cpp @@ -59,6 +59,7 @@ void registerStorageMySQL(StorageFactory & factory); #endif void registerStorageMongoDB(StorageFactory & factory); +void registerStorageRedis(StorageFactory & factory); #if USE_RDKAFKA @@ -156,6 +157,7 @@ void registerStorages() #endif registerStorageMongoDB(factory); + registerStorageRedis(factory); #if USE_RDKAFKA registerStorageKafka(factory); diff --git a/src/TableFunctions/TableFunctionRedis.cpp b/src/TableFunctions/TableFunctionRedis.cpp index 9432f766aa8..db612806652 100644 --- a/src/TableFunctions/TableFunctionRedis.cpp +++ b/src/TableFunctions/TableFunctionRedis.cpp @@ -1,17 +1,15 @@ #include #include +#include #include -#include #include #include -#include #include #include -#include #include #include @@ -23,6 +21,7 @@ namespace ErrorCodes { extern const int BAD_ARGUMENTS; extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int INVALID_REDIS_STORAGE_TYPE; } @@ -31,7 +30,11 @@ StoragePtr TableFunctionRedis::executeImpl( { auto columns = getActualTableStructure(context); auto storage = std::make_shared( - StorageID(configuration->db_id, table_name), configuration, columns, ConstraintsDescription(), String{});// TODO + StorageID(toString(configuration->db_index), table_name), // TODO + *configuration, + columns, + ConstraintsDescription(), + String{}); storage->startup(); return storage; } @@ -42,21 +45,14 @@ ColumnsDescription TableFunctionRedis::getActualTableStructure(ContextPtr contex String structure; switch (configuration->storage_type) { - case StorageRedis::StorageType::SIMPLE: + case RedisStorageType::SIMPLE: structure = "key String, value String"; break; - case StorageRedis::StorageType::HASH: + case RedisStorageType::HASH_MAP: structure = "key String, field, String, value String"; break; - case StorageRedis::StorageType::LIST: - structure = "key String, value Array(String)"; - break; - case StorageRedis::StorageType::SET: - structure = "key String, value Array(String)"; - break; - case StorageRedis::StorageType::ZSET: - structure = "key String, value Array(String)"; - break; + case RedisStorageType::UNKNOWN: + throw Exception(ErrorCodes::INVALID_REDIS_STORAGE_TYPE, "invalid redis storage type."); } return parseColumnsListFromString(structure, context); } @@ -74,7 +70,7 @@ void TableFunctionRedis::parseArguments(const ASTPtr & ast_function, ContextPtr throw Exception( ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Table function 'Redis' requires from 4 parameters: " - "redis('host:port', db_id, 'password', 'storage_type')"); + "redis('host:port', db_index, 'password', 'storage_type')"); } configuration = StorageRedis::getConfiguration(args, context); } diff --git a/src/TableFunctions/TableFunctionRedis.h b/src/TableFunctions/TableFunctionRedis.h index d333cd5a42f..5c6f483fda7 100644 --- a/src/TableFunctions/TableFunctionRedis.h +++ b/src/TableFunctions/TableFunctionRedis.h @@ -23,7 +23,7 @@ private: ColumnsDescription getActualTableStructure(ContextPtr context) const override; void parseArguments(const ASTPtr & ast_function, ContextPtr context) override; - std::optional configuration; + std::optional configuration; }; } From 9a495cbf997d4c0c4ff00b9c75f958b5fa7292d6 Mon Sep 17 00:00:00 2001 From: JackyWoo Date: Tue, 23 May 2023 15:31:50 +0800 Subject: [PATCH 0264/1072] Push down filter into Redis --- src/Dictionaries/RedisDictionarySource.cpp | 43 +----- src/Dictionaries/RedisSource.cpp | 52 +++++++- src/Dictionaries/RedisSource.h | 10 ++ src/Storages/RedisCommon.cpp | 68 +++++++--- src/Storages/RedisCommon.h | 28 +++- src/Storages/StorageRedis.cpp | 144 ++++++++++++++++++--- src/Storages/StorageRedis.h | 11 +- src/TableFunctions/TableFunctionRedis.cpp | 10 +- 8 files changed, 271 insertions(+), 95 deletions(-) diff --git a/src/Dictionaries/RedisDictionarySource.cpp b/src/Dictionaries/RedisDictionarySource.cpp index db27801a38e..f96c9231827 100644 --- a/src/Dictionaries/RedisDictionarySource.cpp +++ b/src/Dictionaries/RedisDictionarySource.cpp @@ -105,21 +105,6 @@ namespace DB RedisDictionarySource::~RedisDictionarySource() = default; - static String storageTypeToKeyType(RedisStorageType type) - { - switch (type) - { - case RedisStorageType::SIMPLE: - return "string"; - case RedisStorageType::HASH_MAP: - return "hash"; - default: - return "none"; - } - - UNREACHABLE(); - } - QueryPipeline RedisDictionarySource::loadAll() { auto connection = getRedisConnection(pool, configuration); @@ -142,33 +127,7 @@ namespace DB if (configuration.storage_type == RedisStorageType::HASH_MAP) { - RedisArray hkeys; - for (const auto & key : keys) - { - RedisCommand command_for_secondary_keys("HKEYS"); - command_for_secondary_keys.addRedisType(key); - - auto secondary_keys = connection->client->execute(command_for_secondary_keys); - - RedisArray primary_with_secondary; - primary_with_secondary.addRedisType(key); - for (const auto & secondary_key : secondary_keys) - { - primary_with_secondary.addRedisType(secondary_key); - /// Do not store more than max_block_size values for one request. - if (primary_with_secondary.size() == REDIS_MAX_BLOCK_SIZE + 1) - { - hkeys.add(primary_with_secondary); - primary_with_secondary.clear(); - primary_with_secondary.addRedisType(key); - } - } - - if (primary_with_secondary.size() > 1) - hkeys.add(primary_with_secondary); - } - - keys = hkeys; + keys = *getRedisHashMapKeys(connection, keys); } return QueryPipeline(std::make_shared( diff --git a/src/Dictionaries/RedisSource.cpp b/src/Dictionaries/RedisSource.cpp index 9abaf7f0ac5..20e0838886c 100644 --- a/src/Dictionaries/RedisSource.cpp +++ b/src/Dictionaries/RedisSource.cpp @@ -30,11 +30,29 @@ namespace DB const RedisStorageType & storage_type_, const DB::Block & sample_block, size_t max_block_size_) + : ISource(sample_block), max_block_size(max_block_size_)// TODO + { + RedisColumnTypes columns_types_; + if (storage_type_ == RedisStorageType::HASH_MAP) + columns_types_ = REDIS_HASH_MAP_COLUMN_TYPES; + else + columns_types_ = REDIS_SIMPLE_COLUMN_TYPES; + RedisSource(std::move(connection_), keys_, storage_type_, sample_block, columns_types_, max_block_size_); + } + + RedisSource::RedisSource( + RedisConnectionPtr connection_, + const RedisArray & keys_, + const RedisStorageType & storage_type_, + const DB::Block & sample_block, + const RedisColumnTypes & columns_types_, + size_t max_block_size_) : ISource(sample_block) , connection(std::move(connection_)) , keys(keys_) , storage_type(storage_type_) , max_block_size{max_block_size_} + , columns_types(columns_types_) { description.init(sample_block); } @@ -173,15 +191,27 @@ namespace DB const auto & primary_key = keys_array.get(0); for (size_t i = 0; i < values.size(); ++i) { - const auto & secondary_key = keys_array.get(i + 1); const auto & value = values.get(i); + const auto & secondary_key = keys_array.get(i + 1); /// null string means 'no value for requested key' if (!value.isNull()) { - insert_value_by_idx(0, primary_key); - insert_value_by_idx(1, secondary_key); - insert_value_by_idx(2, value); + for (size_t idx=0; idxreturnObject(std::move(client)); } -String toString(RedisStorageType storage_type) +String storageTypeToKeyType(RedisStorageType storage_type) { - static const std::unordered_map type_to_str_map - = {{RedisStorageType::SIMPLE, "simple"}, {RedisStorageType::HASH_MAP, "hash_map"}}; - - auto iter = type_to_str_map.find(storage_type); - return iter->second; + switch (storage_type) + { + case RedisStorageType::SIMPLE: + return "string"; + case RedisStorageType::HASH_MAP: + return "hash"; + default: + return "none"; + } } -RedisStorageType toRedisStorageType(const String & storage_type) +RedisStorageType keyTypeToStorageType(const String & key_type) { - static const std::unordered_map str_to_type_map - = {{"simple", RedisStorageType::SIMPLE}, {"hash", RedisStorageType::HASH_MAP}}; - - auto iter = str_to_type_map.find(storage_type); - if (iter == str_to_type_map.end()) - { - throw Exception(ErrorCodes::INVALID_REDIS_STORAGE_TYPE, "invalid redis storage type: {}", storage_type); - } - return iter->second; + if (key_type == "string") + return RedisStorageType::SIMPLE; + else if (key_type == "hash") + return RedisStorageType::HASH_MAP; + else + return RedisStorageType::UNKNOWN; } RedisConnectionPtr getRedisConnection(RedisPoolPtr pool, const RedisConfiguration & configuration) @@ -95,4 +99,36 @@ RedisConnectionPtr getRedisConnection(RedisPoolPtr pool, const RedisConfiguratio return std::make_unique(pool, std::move(client)); } + +RedisArrayPtr getRedisHashMapKeys(const RedisConnectionPtr & connection, RedisArray & keys) +{ + RedisArrayPtr hkeys = std::make_shared(); + for (const auto & key : keys) + { + RedisCommand command_for_secondary_keys("HKEYS"); + command_for_secondary_keys.addRedisType(key); + + auto secondary_keys = connection->client->execute(command_for_secondary_keys); + + RedisArray primary_with_secondary; + primary_with_secondary.addRedisType(key); + for (const auto & secondary_key : secondary_keys) + { + primary_with_secondary.addRedisType(secondary_key); + /// Do not store more than max_block_size values for one request. + if (primary_with_secondary.size() == REDIS_MAX_BLOCK_SIZE + 1) + { + hkeys->add(primary_with_secondary); + primary_with_secondary.clear(); + primary_with_secondary.addRedisType(key); + } + } + + if (primary_with_secondary.size() > 1) + hkeys->add(primary_with_secondary); + } + + return hkeys; +} + } diff --git a/src/Storages/RedisCommon.h b/src/Storages/RedisCommon.h index 6069d3d9a0c..590ea1476c4 100644 --- a/src/Storages/RedisCommon.h +++ b/src/Storages/RedisCommon.h @@ -24,8 +24,23 @@ enum class RedisStorageType UNKNOWN }; -String toString(RedisStorageType storage_type); -RedisStorageType toRedisStorageType(const String & storage_type); +enum class RedisColumnType +{ + /// Redis key + KEY, + /// Redis map field + FIELD, + /// Redis value + VALUE +}; + +using RedisColumnTypes = std::vector; + +extern RedisColumnTypes REDIS_HASH_MAP_COLUMN_TYPES; +extern RedisColumnTypes REDIS_SIMPLE_COLUMN_TYPES; + +String storageTypeToKeyType(RedisStorageType storage_type); +RedisStorageType keyTypeToStorageType(const String & key_type); struct RedisConfiguration { @@ -34,10 +49,13 @@ struct RedisConfiguration uint32_t db_index; String password; RedisStorageType storage_type; + /// column name of redis key + String key;// TODO remove uint32_t pool_size; }; using RedisArray = Poco::Redis::Array; +using RedisArrayPtr = std::shared_ptr; using RedisCommand = Poco::Redis::Command; using RedisBulkString = Poco::Redis::BulkString; @@ -56,6 +74,10 @@ struct RedisConnection using RedisConnectionPtr = std::unique_ptr; -RedisConnectionPtr getRedisConnection(RedisPoolPtr pool, const RedisConfiguration & configuration) ; +RedisConnectionPtr getRedisConnection(RedisPoolPtr pool, const RedisConfiguration & configuration); + +///get all redis hash key array +/// eg: keys -> [key1, key2] and get [[key1, field1, field2], [key2, field1, field2]] +RedisArrayPtr getRedisHashMapKeys(const RedisConnectionPtr & connection, RedisArray & keys); } diff --git a/src/Storages/StorageRedis.cpp b/src/Storages/StorageRedis.cpp index 055617b6a96..45ebe0696d6 100644 --- a/src/Storages/StorageRedis.cpp +++ b/src/Storages/StorageRedis.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include #include @@ -11,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -22,9 +24,33 @@ namespace DB namespace ErrorCodes { extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int INVALID_REDIS_STORAGE_TYPE; extern const int NOT_IMPLEMENTED; } +namespace +{ + RedisColumnType getRedisColumnType(RedisStorageType storage_type, const Names & all_columns, const String & column) + { + String redis_col_key = all_columns.at(0); + if (column == redis_col_key) + return RedisColumnType::KEY; + + if (storage_type == RedisStorageType::HASH_MAP) + { + String redis_col_field = all_columns.at(1); + if (column == redis_col_field) + return RedisColumnType::FIELD; + else + return RedisColumnType::VALUE; + } + else + { + return RedisColumnType::VALUE; + } + } +} + StorageRedis::StorageRedis( const StorageID & table_id_, const RedisConfiguration & configuration_, @@ -34,41 +60,120 @@ StorageRedis::StorageRedis( : IStorage(table_id_) , table_id(table_id_) , configuration(configuration_) - , columns(columns_) - , constraints(constraints_) - , comment(comment_) + , log(&Poco::Logger::get("StorageRedis")) { pool = std::make_shared(configuration.pool_size); + StorageInMemoryMetadata storage_metadata; + storage_metadata.setColumns(columns_); + storage_metadata.setConstraints(constraints_); + storage_metadata.setComment(comment_); + setInMemoryMetadata(storage_metadata); } Pipe StorageRedis::read( const Names & column_names, const StorageSnapshotPtr & storage_snapshot, - SelectQueryInfo & /*query_info*/, - ContextPtr /*context*/, + SelectQueryInfo & query_info, + ContextPtr context, QueryProcessingStage::Enum /*processed_stage*/, size_t max_block_size, - size_t /*num_streams*/) + size_t num_streams) { + LOG_INFO(log, "num_streams {}", num_streams);// TODO delete auto connection = getRedisConnection(pool, configuration); storage_snapshot->check(column_names); Block sample_block; + RedisColumnTypes redis_types; + auto all_columns = storage_snapshot->metadata->getColumns().getNamesOfPhysical(); + for (const String & column_name : column_names) { auto column_data = storage_snapshot->metadata->getColumns().getPhysical(column_name); sample_block.insert({column_data.type, column_data.name}); + redis_types.push_back(getRedisColumnType(configuration.storage_type, all_columns, column_name)); + LOG_INFO(log, "Request column: {}, Redis type: {}", column_data.name, *redis_types.crbegin()); // TODO delete } - RedisArray keys; - RedisCommand command_for_keys("KEYS"); - /// generate keys by table name prefix - command_for_keys << table_id.getTableName() + ":" + toString(configuration.storage_type) + ":*"; + FieldVectorPtr fields; + bool all_scan = false; - /// Get only keys for specified storage type. - auto all_keys = connection->client->execute(command_for_keys); - return Pipe(std::make_shared(std::move(connection), all_keys, configuration.storage_type, sample_block, max_block_size)); + String primary_key = all_columns.at(0); + auto primary_key_data_type = sample_block.getByName(primary_key).type; + + std::tie(fields, all_scan) = getFilterKeys(primary_key, primary_key_data_type, query_info, context); + + /// TODO hash_map hgetall + if (all_scan) + { + RedisCommand command_for_keys("KEYS"); + /// generate keys by table name prefix + command_for_keys << table_id.getTableName() + ":" + toString(configuration.storage_type) + ":*"; + + auto all_keys = connection->client->execute(command_for_keys); + + if (all_keys.size() == 0) + return {}; + + Pipes pipes; + + size_t num_keys = all_keys.size(); + size_t num_threads = std::min(num_streams, all_keys.size()); + + assert(num_keys <= std::numeric_limits::max()); + + for (size_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) + { + size_t begin = num_keys * thread_idx / num_threads; + size_t end = num_keys * (thread_idx + 1) / num_threads; + + RedisArray keys; + for (size_t pos=begin; pos(pos)); + + if (configuration.storage_type == RedisStorageType::HASH_MAP) + { + keys = *getRedisHashMapKeys(connection, keys); + } + + /// TODO reduce keys copy + pipes.emplace_back(std::make_shared( + std::move(connection), keys, configuration.storage_type, sample_block, redis_types, max_block_size)); + } + return Pipe::unitePipes(std::move(pipes)); + } + else + { + if (fields->empty()) + return {}; + + Pipes pipes; + + size_t num_keys = fields->size(); + size_t num_threads = std::min(num_streams, fields->size()); + + assert(num_keys <= std::numeric_limits::max()); + + for (size_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) + { + size_t begin = num_keys * thread_idx / num_threads; + size_t end = num_keys * (thread_idx + 1) / num_threads; + + RedisArray keys; + for (size_t pos=begin; posat(pos).get()); + + if (configuration.storage_type == RedisStorageType::HASH_MAP) + { + keys = *getRedisHashMapKeys(connection, keys); + } + + pipes.emplace_back(std::make_shared( + std::move(connection), keys, configuration.storage_type, sample_block, redis_types, max_block_size)); + } + return Pipe::unitePipes(std::move(pipes)); + } } @@ -88,15 +193,15 @@ RedisConfiguration StorageRedis::getConfiguration(ASTs engine_args, ContextPtr c { validateNamedCollection( *named_collection, - ValidateKeysMultiset{"host", "port", "hostname", "password", "db_index", "storage_type"}, + ValidateKeysMultiset{"host", "port", "hostname", "password", "db_index", "storage_type", "pool_size"}, {}); configuration.host = named_collection->getAny({"host", "hostname"}); configuration.port = static_cast(named_collection->get("port")); configuration.password = named_collection->get("password"); configuration.db_index = static_cast(named_collection->get({"db_index"})); - configuration.storage_type = toRedisStorageType(named_collection->getOrDefault("storage_type", "")); - configuration.pool_size = 16; /// TODO + configuration.storage_type = keyTypeToStorageType(named_collection->getOrDefault("storage_type", "")); + configuration.pool_size = static_cast(named_collection->get("pool_size")); } else { @@ -110,10 +215,13 @@ RedisConfiguration StorageRedis::getConfiguration(ASTs engine_args, ContextPtr c configuration.port = parsed_host_port.second; configuration.db_index = static_cast(checkAndGetLiteralArgument(engine_args[1], "db_index")); configuration.password = checkAndGetLiteralArgument(engine_args[2], "password"); - configuration.storage_type = toRedisStorageType(checkAndGetLiteralArgument(engine_args[3], "storage_type")); - configuration.pool_size = 16; /// TODO + configuration.storage_type = keyTypeToStorageType(checkAndGetLiteralArgument(engine_args[3], "storage_type")); + configuration.pool_size = static_cast(checkAndGetLiteralArgument(engine_args[4], "pool_size")); } + if (configuration.storage_type == RedisStorageType::UNKNOWN) + throw Exception(ErrorCodes::INVALID_REDIS_STORAGE_TYPE, "Invalid Redis storage type"); + context->getRemoteHostFilter().checkHostAndPort(configuration.host, toString(configuration.port)); return configuration; } diff --git a/src/Storages/StorageRedis.h b/src/Storages/StorageRedis.h index 60db75dd384..1bffc6a64bf 100644 --- a/src/Storages/StorageRedis.h +++ b/src/Storages/StorageRedis.h @@ -7,8 +7,12 @@ namespace DB { /* Implements storage in the Redis. - * Use ENGINE = Redis(host:port, db_index, password, storage_type); + * Use ENGINE = Redis(host:port, db_index, password, storage_type, conn_pool_size); * Read only. + * + * Note If storage_type is + * simple: there should be 2 columns and the first one is key in Redis, the second one is value. + * hash_map: there should be 3 columns and the first one is key in Redis and the second is the field of Redis Map. */ class StorageRedis : public IStorage { @@ -42,10 +46,7 @@ private: StorageID table_id; RedisConfiguration configuration; - ColumnsDescription columns; - ConstraintsDescription constraints; - - String comment; + Poco::Logger * log; RedisPoolPtr pool; }; diff --git a/src/TableFunctions/TableFunctionRedis.cpp b/src/TableFunctions/TableFunctionRedis.cpp index db612806652..e410ad799a6 100644 --- a/src/TableFunctions/TableFunctionRedis.cpp +++ b/src/TableFunctions/TableFunctionRedis.cpp @@ -49,10 +49,10 @@ ColumnsDescription TableFunctionRedis::getActualTableStructure(ContextPtr contex structure = "key String, value String"; break; case RedisStorageType::HASH_MAP: - structure = "key String, field, String, value String"; + structure = "key String, field String, value String"; break; case RedisStorageType::UNKNOWN: - throw Exception(ErrorCodes::INVALID_REDIS_STORAGE_TYPE, "invalid redis storage type."); + throw Exception(ErrorCodes::INVALID_REDIS_STORAGE_TYPE, "Invalid Redis storage type."); } return parseColumnsListFromString(structure, context); } @@ -65,12 +65,12 @@ void TableFunctionRedis::parseArguments(const ASTPtr & ast_function, ContextPtr ASTs & args = func_args.arguments->children; - if (args.size() != 4) + if (args.size() != 5) { throw Exception( ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, - "Table function 'Redis' requires from 4 parameters: " - "redis('host:port', db_index, 'password', 'storage_type')"); + "Table function 'Redis' requires from 5 parameters: " + "redis('host:port', db_index, 'password', 'storage_type', 'pool_size')"); } configuration = StorageRedis::getConfiguration(args, context); } From 23d6c835d831b2fa9650c693370219ee1d1f9727 Mon Sep 17 00:00:00 2001 From: JackyWoo Date: Tue, 23 May 2023 16:42:46 +0800 Subject: [PATCH 0265/1072] fix poco redis array NPE --- src/Storages/RedisCommon.cpp | 22 ++++++++++++++++++++++ src/Storages/RedisCommon.h | 7 ++++++- src/Storages/StorageRedis.cpp | 35 ++--------------------------------- src/Storages/StorageRedis.h | 4 ++-- 4 files changed, 32 insertions(+), 36 deletions(-) diff --git a/src/Storages/RedisCommon.cpp b/src/Storages/RedisCommon.cpp index b910759fe52..0a13e40b1ec 100644 --- a/src/Storages/RedisCommon.cpp +++ b/src/Storages/RedisCommon.cpp @@ -109,6 +109,8 @@ RedisArrayPtr getRedisHashMapKeys(const RedisConnectionPtr & connection, RedisAr command_for_secondary_keys.addRedisType(key); auto secondary_keys = connection->client->execute(command_for_secondary_keys); + if (secondary_keys.isNull()) + continue; RedisArray primary_with_secondary; primary_with_secondary.addRedisType(key); @@ -131,4 +133,24 @@ RedisArrayPtr getRedisHashMapKeys(const RedisConnectionPtr & connection, RedisAr return hkeys; } +RedisColumnType getRedisColumnType(RedisStorageType storage_type, const Names & all_columns, const String & column) +{ + String redis_col_key = all_columns.at(0); + if (column == redis_col_key) + return RedisColumnType::KEY; + + if (storage_type == RedisStorageType::HASH_MAP) + { + String redis_col_field = all_columns.at(1); + if (column == redis_col_field) + return RedisColumnType::FIELD; + else + return RedisColumnType::VALUE; + } + else + { + return RedisColumnType::VALUE; + } +} + } diff --git a/src/Storages/RedisCommon.h b/src/Storages/RedisCommon.h index 590ea1476c4..384a02d76e4 100644 --- a/src/Storages/RedisCommon.h +++ b/src/Storages/RedisCommon.h @@ -6,6 +6,7 @@ #include #include +#include namespace DB { @@ -28,7 +29,7 @@ enum class RedisColumnType { /// Redis key KEY, - /// Redis map field + /// Redis hash field FIELD, /// Redis value VALUE @@ -80,4 +81,8 @@ RedisConnectionPtr getRedisConnection(RedisPoolPtr pool, const RedisConfiguratio /// eg: keys -> [key1, key2] and get [[key1, field1, field2], [key2, field1, field2]] RedisArrayPtr getRedisHashMapKeys(const RedisConnectionPtr & connection, RedisArray & keys); +/// Get RedisColumnType of a column, If storage_type is +/// SIMPLE: all_columns must have 2 iterm and the first one is Redis key the second one is value +/// HASH_MAP: all_columns must have 2 iterm and the first one is Redis key the second is field, the third is value. +RedisColumnType getRedisColumnType(RedisStorageType storage_type, const Names & all_columns, const String & column); } diff --git a/src/Storages/StorageRedis.cpp b/src/Storages/StorageRedis.cpp index 45ebe0696d6..0e1b3a24e4f 100644 --- a/src/Storages/StorageRedis.cpp +++ b/src/Storages/StorageRedis.cpp @@ -5,15 +5,11 @@ #include #include -#include -#include -#include #include #include #include #include #include -#include #include #include #include @@ -28,29 +24,6 @@ namespace ErrorCodes extern const int NOT_IMPLEMENTED; } -namespace -{ - RedisColumnType getRedisColumnType(RedisStorageType storage_type, const Names & all_columns, const String & column) - { - String redis_col_key = all_columns.at(0); - if (column == redis_col_key) - return RedisColumnType::KEY; - - if (storage_type == RedisStorageType::HASH_MAP) - { - String redis_col_field = all_columns.at(1); - if (column == redis_col_field) - return RedisColumnType::FIELD; - else - return RedisColumnType::VALUE; - } - else - { - return RedisColumnType::VALUE; - } - } -} - StorageRedis::StorageRedis( const StorageID & table_id_, const RedisConfiguration & configuration_, @@ -79,9 +52,7 @@ Pipe StorageRedis::read( size_t max_block_size, size_t num_streams) { - LOG_INFO(log, "num_streams {}", num_streams);// TODO delete auto connection = getRedisConnection(pool, configuration); - storage_snapshot->check(column_names); Block sample_block; @@ -93,7 +64,6 @@ Pipe StorageRedis::read( auto column_data = storage_snapshot->metadata->getColumns().getPhysical(column_name); sample_block.insert({column_data.type, column_data.name}); redis_types.push_back(getRedisColumnType(configuration.storage_type, all_columns, column_name)); - LOG_INFO(log, "Request column: {}, Redis type: {}", column_data.name, *redis_types.crbegin()); // TODO delete } FieldVectorPtr fields; @@ -104,16 +74,15 @@ Pipe StorageRedis::read( std::tie(fields, all_scan) = getFilterKeys(primary_key, primary_key_data_type, query_info, context); - /// TODO hash_map hgetall if (all_scan) { RedisCommand command_for_keys("KEYS"); /// generate keys by table name prefix - command_for_keys << table_id.getTableName() + ":" + toString(configuration.storage_type) + ":*"; + command_for_keys << table_id.getTableName() + ":" + storageTypeToKeyType(configuration.storage_type) + ":*"; auto all_keys = connection->client->execute(command_for_keys); - if (all_keys.size() == 0) + if (all_keys.isNull() || all_keys.size() == 0) return {}; Pipes pipes; diff --git a/src/Storages/StorageRedis.h b/src/Storages/StorageRedis.h index 1bffc6a64bf..2c6c6193982 100644 --- a/src/Storages/StorageRedis.h +++ b/src/Storages/StorageRedis.h @@ -11,8 +11,8 @@ namespace DB * Read only. * * Note If storage_type is - * simple: there should be 2 columns and the first one is key in Redis, the second one is value. - * hash_map: there should be 3 columns and the first one is key in Redis and the second is the field of Redis Map. + * SIMPLE: there should be 2 columns and the first one is key in Redis, the second one is value. + * HASH_MAP: there should be 3 columns and the first one is key in Redis and the second is the field of Redis Map. */ class StorageRedis : public IStorage { From ce203b5ce6cb2329a7a26bcb4999e040ecbdbf7d Mon Sep 17 00:00:00 2001 From: JackyWoo Date: Tue, 23 May 2023 20:54:26 +0800 Subject: [PATCH 0266/1072] Check redis table structure --- src/Common/ErrorCodes.cpp | 1 + src/Dictionaries/RedisSource.cpp | 5 +---- src/Dictionaries/RedisSource.h | 1 - src/Storages/RedisCommon.cpp | 13 +++++++++++++ src/Storages/RedisCommon.h | 7 +++++-- src/Storages/StorageRedis.cpp | 20 ++++++++++++++++---- src/TableFunctions/TableFunctionRedis.cpp | 6 ++++-- 7 files changed, 40 insertions(+), 13 deletions(-) diff --git a/src/Common/ErrorCodes.cpp b/src/Common/ErrorCodes.cpp index 505cf0aac8f..4c08d762df2 100644 --- a/src/Common/ErrorCodes.cpp +++ b/src/Common/ErrorCodes.cpp @@ -581,6 +581,7 @@ M(696, ASYNC_LOAD_CANCELED) \ M(697, CANNOT_RESTORE_TO_NONENCRYPTED_DISK) \ M(698, INVALID_REDIS_STORAGE_TYPE) \ + M(699, INVALID_REDIS_TABLE_STRUCTURE) \ \ M(999, KEEPER_EXCEPTION) \ M(1000, POCO_EXCEPTION) \ diff --git a/src/Dictionaries/RedisSource.cpp b/src/Dictionaries/RedisSource.cpp index 20e0838886c..27125077c10 100644 --- a/src/Dictionaries/RedisSource.cpp +++ b/src/Dictionaries/RedisSource.cpp @@ -1,15 +1,12 @@ #include "RedisSource.h" -#include #include - #include #include #include #include #include - -#include "DictionaryStructure.h" +#include namespace DB diff --git a/src/Dictionaries/RedisSource.h b/src/Dictionaries/RedisSource.h index fe5a973d57c..e8e78db67bc 100644 --- a/src/Dictionaries/RedisSource.h +++ b/src/Dictionaries/RedisSource.h @@ -7,7 +7,6 @@ #include #include #include -#include "RedisDictionarySource.h" namespace DB diff --git a/src/Storages/RedisCommon.cpp b/src/Storages/RedisCommon.cpp index 0a13e40b1ec..916ac3b69bc 100644 --- a/src/Storages/RedisCommon.cpp +++ b/src/Storages/RedisCommon.cpp @@ -7,6 +7,7 @@ namespace DB namespace ErrorCodes { extern const int INVALID_REDIS_STORAGE_TYPE; + extern const int INVALID_REDIS_TABLE_STRUCTURE; extern const int INTERNAL_REDIS_ERROR; extern const int TIMEOUT_EXCEEDED; } @@ -153,4 +154,16 @@ RedisColumnType getRedisColumnType(RedisStorageType storage_type, const Names & } } +void checkRedisTableStructure(const ColumnsDescription & columns, const RedisConfiguration & configuration) +{ + /// TODO check data type + if (configuration.storage_type == RedisStorageType::HASH_MAP && columns.size() != 3) + throw Exception(ErrorCodes::INVALID_REDIS_TABLE_STRUCTURE, + "Redis hash table must have 3 columns, but found {}", columns.size()); + + if (configuration.storage_type == RedisStorageType::SIMPLE && columns.size() != 2) + throw Exception(ErrorCodes::INVALID_REDIS_TABLE_STRUCTURE, + "Redis string table must have 2 columns, but found {}", columns.size()); +} + } diff --git a/src/Storages/RedisCommon.h b/src/Storages/RedisCommon.h index 384a02d76e4..e663faa5fab 100644 --- a/src/Storages/RedisCommon.h +++ b/src/Storages/RedisCommon.h @@ -7,6 +7,7 @@ #include #include #include +#include namespace DB { @@ -50,8 +51,6 @@ struct RedisConfiguration uint32_t db_index; String password; RedisStorageType storage_type; - /// column name of redis key - String key;// TODO remove uint32_t pool_size; }; @@ -85,4 +84,8 @@ RedisArrayPtr getRedisHashMapKeys(const RedisConnectionPtr & connection, RedisAr /// SIMPLE: all_columns must have 2 iterm and the first one is Redis key the second one is value /// HASH_MAP: all_columns must have 2 iterm and the first one is Redis key the second is field, the third is value. RedisColumnType getRedisColumnType(RedisStorageType storage_type, const Names & all_columns, const String & column); + +/// checking Redis table/table-function when creating +void checkRedisTableStructure(const ColumnsDescription & columns, const RedisConfiguration & configuration); + } diff --git a/src/Storages/StorageRedis.cpp b/src/Storages/StorageRedis.cpp index 0e1b3a24e4f..7721665e9dd 100644 --- a/src/Storages/StorageRedis.cpp +++ b/src/Storages/StorageRedis.cpp @@ -78,7 +78,8 @@ Pipe StorageRedis::read( { RedisCommand command_for_keys("KEYS"); /// generate keys by table name prefix - command_for_keys << table_id.getTableName() + ":" + storageTypeToKeyType(configuration.storage_type) + ":*"; +// command_for_keys << table_id.getTableName() + ":" + storageTypeToKeyType(configuration.storage_type) + ":*"; + command_for_keys << "*"; auto all_keys = connection->client->execute(command_for_keys); @@ -90,6 +91,7 @@ Pipe StorageRedis::read( size_t num_keys = all_keys.size(); size_t num_threads = std::min(num_streams, all_keys.size()); + num_threads = std::min(num_threads, configuration.pool_size); assert(num_keys <= std::numeric_limits::max()); for (size_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) @@ -106,9 +108,12 @@ Pipe StorageRedis::read( keys = *getRedisHashMapKeys(connection, keys); } + delete connection.release(); + /// TODO reduce keys copy pipes.emplace_back(std::make_shared( - std::move(connection), keys, configuration.storage_type, sample_block, redis_types, max_block_size)); + getRedisConnection(pool, configuration), keys, + configuration.storage_type, sample_block, redis_types, max_block_size)); } return Pipe::unitePipes(std::move(pipes)); } @@ -122,6 +127,7 @@ Pipe StorageRedis::read( size_t num_keys = fields->size(); size_t num_threads = std::min(num_streams, fields->size()); + num_threads = std::min(num_threads, configuration.pool_size); assert(num_keys <= std::numeric_limits::max()); for (size_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) @@ -138,8 +144,11 @@ Pipe StorageRedis::read( keys = *getRedisHashMapKeys(connection, keys); } + delete connection.release(); + pipes.emplace_back(std::make_shared( - std::move(connection), keys, configuration.storage_type, sample_block, redis_types, max_block_size)); + getRedisConnection(pool, configuration), keys, + configuration.storage_type, sample_block, redis_types, max_block_size)); } return Pipe::unitePipes(std::move(pipes)); } @@ -151,9 +160,10 @@ SinkToStoragePtr StorageRedis::write( const StorageMetadataPtr & /*metadata_snapshot*/, ContextPtr /*context*/) { - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method write is unsupported for StorageRedis"); + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Write is unsupported for StorageRedis"); } +/// TODO make "password", "db_index", "storage_type", "pool_size" optional RedisConfiguration StorageRedis::getConfiguration(ASTs engine_args, ContextPtr context) { RedisConfiguration configuration; @@ -203,6 +213,8 @@ void registerStorageRedis(StorageFactory & factory) { auto configuration = StorageRedis::getConfiguration(args.engine_args, args.getLocalContext()); + checkRedisTableStructure(args.columns, configuration); + return std::make_shared( args.table_id, configuration, diff --git a/src/TableFunctions/TableFunctionRedis.cpp b/src/TableFunctions/TableFunctionRedis.cpp index e410ad799a6..9e4a39b1b85 100644 --- a/src/TableFunctions/TableFunctionRedis.cpp +++ b/src/TableFunctions/TableFunctionRedis.cpp @@ -1,7 +1,6 @@ #include #include -#include #include @@ -29,8 +28,10 @@ StoragePtr TableFunctionRedis::executeImpl( const ASTPtr & /*ast_function*/, ContextPtr context, const String & table_name, ColumnsDescription /*cached_columns*/) const { auto columns = getActualTableStructure(context); + checkRedisTableStructure(columns, *configuration); + auto storage = std::make_shared( - StorageID(toString(configuration->db_index), table_name), // TODO + StorageID(toString(configuration->db_index), table_name), *configuration, columns, ConstraintsDescription(), @@ -39,6 +40,7 @@ StoragePtr TableFunctionRedis::executeImpl( return storage; } +/// TODO support user customized table structure ColumnsDescription TableFunctionRedis::getActualTableStructure(ContextPtr context) const { /// generate table structure by storage type. From 40cc8d210792cd7a3e7f80f9ed9bf95fe938e9d8 Mon Sep 17 00:00:00 2001 From: JackyWoo Date: Wed, 24 May 2023 10:34:37 +0800 Subject: [PATCH 0267/1072] fix code style --- src/Dictionaries/RedisDictionarySource.cpp | 2 -- src/Storages/RedisCommon.cpp | 1 - src/Storages/RedisCommon.h | 5 ----- src/Storages/StorageRedis.cpp | 1 - 4 files changed, 9 deletions(-) diff --git a/src/Dictionaries/RedisDictionarySource.cpp b/src/Dictionaries/RedisDictionarySource.cpp index f96c9231827..1056383bc84 100644 --- a/src/Dictionaries/RedisDictionarySource.cpp +++ b/src/Dictionaries/RedisDictionarySource.cpp @@ -17,9 +17,7 @@ namespace DB { extern const int UNSUPPORTED_METHOD; extern const int INVALID_CONFIG_PARAMETER; - extern const int INTERNAL_REDIS_ERROR; extern const int LOGICAL_ERROR; - extern const int TIMEOUT_EXCEEDED; } static RedisStorageType parseStorageType(const String & storage_type_str) diff --git a/src/Storages/RedisCommon.cpp b/src/Storages/RedisCommon.cpp index 916ac3b69bc..63a8d911bf0 100644 --- a/src/Storages/RedisCommon.cpp +++ b/src/Storages/RedisCommon.cpp @@ -6,7 +6,6 @@ namespace DB namespace ErrorCodes { - extern const int INVALID_REDIS_STORAGE_TYPE; extern const int INVALID_REDIS_TABLE_STRUCTURE; extern const int INTERNAL_REDIS_ERROR; extern const int TIMEOUT_EXCEEDED; diff --git a/src/Storages/RedisCommon.h b/src/Storages/RedisCommon.h index e663faa5fab..2668311125f 100644 --- a/src/Storages/RedisCommon.h +++ b/src/Storages/RedisCommon.h @@ -11,11 +11,6 @@ namespace DB { -namespace ErrorCodes -{ - extern const int NOT_IMPLEMENTED; -} - static constexpr size_t REDIS_MAX_BLOCK_SIZE = DEFAULT_BLOCK_SIZE; static constexpr size_t REDIS_LOCK_ACQUIRE_TIMEOUT_MS = 5000; diff --git a/src/Storages/StorageRedis.cpp b/src/Storages/StorageRedis.cpp index 7721665e9dd..3d7721bdc0e 100644 --- a/src/Storages/StorageRedis.cpp +++ b/src/Storages/StorageRedis.cpp @@ -19,7 +19,6 @@ namespace DB namespace ErrorCodes { - extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; extern const int INVALID_REDIS_STORAGE_TYPE; extern const int NOT_IMPLEMENTED; } From 70cfd7a222342bfd1d9a3ddb59eaa2391d014d28 Mon Sep 17 00:00:00 2001 From: JackyWoo Date: Wed, 24 May 2023 10:45:21 +0800 Subject: [PATCH 0268/1072] fix typos --- src/Storages/RedisCommon.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Storages/RedisCommon.h b/src/Storages/RedisCommon.h index 2668311125f..348c2494632 100644 --- a/src/Storages/RedisCommon.h +++ b/src/Storages/RedisCommon.h @@ -76,8 +76,8 @@ RedisConnectionPtr getRedisConnection(RedisPoolPtr pool, const RedisConfiguratio RedisArrayPtr getRedisHashMapKeys(const RedisConnectionPtr & connection, RedisArray & keys); /// Get RedisColumnType of a column, If storage_type is -/// SIMPLE: all_columns must have 2 iterm and the first one is Redis key the second one is value -/// HASH_MAP: all_columns must have 2 iterm and the first one is Redis key the second is field, the third is value. +/// SIMPLE: all_columns must have 2 items and the first one is Redis key the second one is value +/// HASH_MAP: all_columns must have 2 items and the first one is Redis key the second is field, the third is value. RedisColumnType getRedisColumnType(RedisStorageType storage_type, const Names & all_columns, const String & column); /// checking Redis table/table-function when creating From d594bb1c7a3d6f0fe2ef4e27bb01e132ed82a8e7 Mon Sep 17 00:00:00 2001 From: JackyWoo Date: Wed, 24 May 2023 10:53:42 +0800 Subject: [PATCH 0269/1072] fix fast tests --- src/Access/Common/AccessType.h | 2 +- tests/queries/0_stateless/01271_show_privileges.reference | 2 +- .../0_stateless/02117_show_create_table_system.reference | 2 +- .../02414_all_new_table_functions_must_be_documented.reference | 1 + 4 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/Access/Common/AccessType.h b/src/Access/Common/AccessType.h index 78c341cdcb5..c9cce610f2c 100644 --- a/src/Access/Common/AccessType.h +++ b/src/Access/Common/AccessType.h @@ -201,7 +201,7 @@ enum class AccessType M(URL, "", GLOBAL, SOURCES) \ M(REMOTE, "", GLOBAL, SOURCES) \ M(MONGO, "", GLOBAL, SOURCES) \ - M(Redis, "", GLOBAL, SOURCES) \ + M(REDIS, "", GLOBAL, SOURCES) \ M(MEILISEARCH, "", GLOBAL, SOURCES) \ M(MYSQL, "", GLOBAL, SOURCES) \ M(POSTGRES, "", GLOBAL, SOURCES) \ diff --git a/tests/queries/0_stateless/01271_show_privileges.reference b/tests/queries/0_stateless/01271_show_privileges.reference index ec245d8b9e0..5ada21e31f4 100644 --- a/tests/queries/0_stateless/01271_show_privileges.reference +++ b/tests/queries/0_stateless/01271_show_privileges.reference @@ -148,7 +148,7 @@ INTROSPECTION ['INTROSPECTION FUNCTIONS'] \N ALL FILE [] GLOBAL SOURCES URL [] GLOBAL SOURCES REMOTE [] GLOBAL SOURCES -MONGO [] GLOBAL SOURCES +REDIS [] GLOBAL SOURCES MEILISEARCH [] GLOBAL SOURCES MYSQL [] GLOBAL SOURCES POSTGRES [] GLOBAL SOURCES diff --git a/tests/queries/0_stateless/02117_show_create_table_system.reference b/tests/queries/0_stateless/02117_show_create_table_system.reference index 09cc62dac00..724118f7bc1 100644 --- a/tests/queries/0_stateless/02117_show_create_table_system.reference +++ b/tests/queries/0_stateless/02117_show_create_table_system.reference @@ -297,7 +297,7 @@ CREATE TABLE system.grants ( `user_name` Nullable(String), `role_name` Nullable(String), - `access_type` Enum16('SHOW DATABASES' = 0, 'SHOW TABLES' = 1, 'SHOW COLUMNS' = 2, 'SHOW DICTIONARIES' = 3, 'SHOW' = 4, 'SHOW FILESYSTEM CACHES' = 5, 'SELECT' = 6, 'INSERT' = 7, 'ALTER UPDATE' = 8, 'ALTER DELETE' = 9, 'ALTER ADD COLUMN' = 10, 'ALTER MODIFY COLUMN' = 11, 'ALTER DROP COLUMN' = 12, 'ALTER COMMENT COLUMN' = 13, 'ALTER CLEAR COLUMN' = 14, 'ALTER RENAME COLUMN' = 15, 'ALTER MATERIALIZE COLUMN' = 16, 'ALTER COLUMN' = 17, 'ALTER MODIFY COMMENT' = 18, 'ALTER ORDER BY' = 19, 'ALTER SAMPLE BY' = 20, 'ALTER ADD INDEX' = 21, 'ALTER DROP INDEX' = 22, 'ALTER MATERIALIZE INDEX' = 23, 'ALTER CLEAR INDEX' = 24, 'ALTER INDEX' = 25, 'ALTER ADD PROJECTION' = 26, 'ALTER DROP PROJECTION' = 27, 'ALTER MATERIALIZE PROJECTION' = 28, 'ALTER CLEAR PROJECTION' = 29, 'ALTER PROJECTION' = 30, 'ALTER ADD CONSTRAINT' = 31, 'ALTER DROP CONSTRAINT' = 32, 'ALTER CONSTRAINT' = 33, 'ALTER TTL' = 34, 'ALTER MATERIALIZE TTL' = 35, 'ALTER SETTINGS' = 36, 'ALTER MOVE PARTITION' = 37, 'ALTER FETCH PARTITION' = 38, 'ALTER FREEZE PARTITION' = 39, 'ALTER DATABASE SETTINGS' = 40, 'ALTER NAMED COLLECTION' = 41, 'ALTER TABLE' = 42, 'ALTER DATABASE' = 43, 'ALTER VIEW REFRESH' = 44, 'ALTER VIEW MODIFY QUERY' = 45, 'ALTER VIEW' = 46, 'ALTER' = 47, 'CREATE DATABASE' = 48, 'CREATE TABLE' = 49, 'CREATE VIEW' = 50, 'CREATE DICTIONARY' = 51, 'CREATE TEMPORARY TABLE' = 52, 'CREATE ARBITRARY TEMPORARY TABLE' = 53, 'CREATE FUNCTION' = 54, 'CREATE NAMED COLLECTION' = 55, 'CREATE' = 56, 'DROP DATABASE' = 57, 'DROP TABLE' = 58, 'DROP VIEW' = 59, 'DROP DICTIONARY' = 60, 'DROP FUNCTION' = 61, 'DROP NAMED COLLECTION' = 62, 'DROP' = 63, 'UNDROP TABLE' = 64, 'TRUNCATE' = 65, 'OPTIMIZE' = 66, 'BACKUP' = 67, 'KILL QUERY' = 68, 'KILL TRANSACTION' = 69, 'MOVE PARTITION BETWEEN SHARDS' = 70, 'CREATE USER' = 71, 'ALTER USER' = 72, 'DROP USER' = 73, 'CREATE ROLE' = 74, 'ALTER ROLE' = 75, 'DROP ROLE' = 76, 'ROLE ADMIN' = 77, 'CREATE ROW POLICY' = 78, 'ALTER ROW POLICY' = 79, 'DROP ROW POLICY' = 80, 'CREATE QUOTA' = 81, 'ALTER QUOTA' = 82, 'DROP QUOTA' = 83, 'CREATE SETTINGS PROFILE' = 84, 'ALTER SETTINGS PROFILE' = 85, 'DROP SETTINGS PROFILE' = 86, 'SHOW USERS' = 87, 'SHOW ROLES' = 88, 'SHOW ROW POLICIES' = 89, 'SHOW QUOTAS' = 90, 'SHOW SETTINGS PROFILES' = 91, 'SHOW ACCESS' = 92, 'ACCESS MANAGEMENT' = 93, 'SHOW NAMED COLLECTIONS' = 94, 'SHOW NAMED COLLECTIONS SECRETS' = 95, 'NAMED COLLECTION CONTROL' = 96, 'SYSTEM SHUTDOWN' = 97, 'SYSTEM DROP DNS CACHE' = 98, 'SYSTEM DROP MARK CACHE' = 99, 'SYSTEM DROP UNCOMPRESSED CACHE' = 100, 'SYSTEM DROP MMAP CACHE' = 101, 'SYSTEM DROP QUERY CACHE' = 102, 'SYSTEM DROP COMPILED EXPRESSION CACHE' = 103, 'SYSTEM DROP FILESYSTEM CACHE' = 104, 'SYSTEM DROP SCHEMA CACHE' = 105, 'SYSTEM DROP S3 CLIENT CACHE' = 106, 'SYSTEM DROP CACHE' = 107, 'SYSTEM RELOAD CONFIG' = 108, 'SYSTEM RELOAD USERS' = 109, 'SYSTEM RELOAD SYMBOLS' = 110, 'SYSTEM RELOAD DICTIONARY' = 111, 'SYSTEM RELOAD MODEL' = 112, 'SYSTEM RELOAD FUNCTION' = 113, 'SYSTEM RELOAD EMBEDDED DICTIONARIES' = 114, 'SYSTEM RELOAD' = 115, 'SYSTEM RESTART DISK' = 116, 'SYSTEM MERGES' = 117, 'SYSTEM TTL MERGES' = 118, 'SYSTEM FETCHES' = 119, 'SYSTEM MOVES' = 120, 'SYSTEM DISTRIBUTED SENDS' = 121, 'SYSTEM REPLICATED SENDS' = 122, 'SYSTEM SENDS' = 123, 'SYSTEM REPLICATION QUEUES' = 124, 'SYSTEM DROP REPLICA' = 125, 'SYSTEM SYNC REPLICA' = 126, 'SYSTEM RESTART REPLICA' = 127, 'SYSTEM RESTORE REPLICA' = 128, 'SYSTEM WAIT LOADING PARTS' = 129, 'SYSTEM SYNC DATABASE REPLICA' = 130, 'SYSTEM SYNC TRANSACTION LOG' = 131, 'SYSTEM SYNC FILE CACHE' = 132, 'SYSTEM FLUSH DISTRIBUTED' = 133, 'SYSTEM FLUSH LOGS' = 134, 'SYSTEM FLUSH' = 135, 'SYSTEM THREAD FUZZER' = 136, 'SYSTEM UNFREEZE' = 137, 'SYSTEM FAILPOINT' = 138, 'SYSTEM' = 139, 'dictGet' = 140, 'displaySecretsInShowAndSelect' = 141, 'addressToLine' = 142, 'addressToLineWithInlines' = 143, 'addressToSymbol' = 144, 'demangle' = 145, 'INTROSPECTION' = 146, 'FILE' = 147, 'URL' = 148, 'REMOTE' = 149, 'MONGO' = 150, 'MEILISEARCH' = 151, 'MYSQL' = 152, 'POSTGRES' = 153, 'SQLITE' = 154, 'ODBC' = 155, 'JDBC' = 156, 'HDFS' = 157, 'S3' = 158, 'HIVE' = 159, 'SOURCES' = 160, 'CLUSTER' = 161, 'ALL' = 162, 'NONE' = 163), + `access_type` Enum16('SHOW DATABASES' = 0, 'SHOW TABLES' = 1, 'SHOW COLUMNS' = 2, 'SHOW DICTIONARIES' = 3, 'SHOW' = 4, 'SHOW FILESYSTEM CACHES' = 5, 'SELECT' = 6, 'INSERT' = 7, 'ALTER UPDATE' = 8, 'ALTER DELETE' = 9, 'ALTER ADD COLUMN' = 10, 'ALTER MODIFY COLUMN' = 11, 'ALTER DROP COLUMN' = 12, 'ALTER COMMENT COLUMN' = 13, 'ALTER CLEAR COLUMN' = 14, 'ALTER RENAME COLUMN' = 15, 'ALTER MATERIALIZE COLUMN' = 16, 'ALTER COLUMN' = 17, 'ALTER MODIFY COMMENT' = 18, 'ALTER ORDER BY' = 19, 'ALTER SAMPLE BY' = 20, 'ALTER ADD INDEX' = 21, 'ALTER DROP INDEX' = 22, 'ALTER MATERIALIZE INDEX' = 23, 'ALTER CLEAR INDEX' = 24, 'ALTER INDEX' = 25, 'ALTER ADD PROJECTION' = 26, 'ALTER DROP PROJECTION' = 27, 'ALTER MATERIALIZE PROJECTION' = 28, 'ALTER CLEAR PROJECTION' = 29, 'ALTER PROJECTION' = 30, 'ALTER ADD CONSTRAINT' = 31, 'ALTER DROP CONSTRAINT' = 32, 'ALTER CONSTRAINT' = 33, 'ALTER TTL' = 34, 'ALTER MATERIALIZE TTL' = 35, 'ALTER SETTINGS' = 36, 'ALTER MOVE PARTITION' = 37, 'ALTER FETCH PARTITION' = 38, 'ALTER FREEZE PARTITION' = 39, 'ALTER DATABASE SETTINGS' = 40, 'ALTER NAMED COLLECTION' = 41, 'ALTER TABLE' = 42, 'ALTER DATABASE' = 43, 'ALTER VIEW REFRESH' = 44, 'ALTER VIEW MODIFY QUERY' = 45, 'ALTER VIEW' = 46, 'ALTER' = 47, 'CREATE DATABASE' = 48, 'CREATE TABLE' = 49, 'CREATE VIEW' = 50, 'CREATE DICTIONARY' = 51, 'CREATE TEMPORARY TABLE' = 52, 'CREATE ARBITRARY TEMPORARY TABLE' = 53, 'CREATE FUNCTION' = 54, 'CREATE NAMED COLLECTION' = 55, 'CREATE' = 56, 'DROP DATABASE' = 57, 'DROP TABLE' = 58, 'DROP VIEW' = 59, 'DROP DICTIONARY' = 60, 'DROP FUNCTION' = 61, 'DROP NAMED COLLECTION' = 62, 'DROP' = 63, 'UNDROP TABLE' = 64, 'TRUNCATE' = 65, 'OPTIMIZE' = 66, 'BACKUP' = 67, 'KILL QUERY' = 68, 'KILL TRANSACTION' = 69, 'MOVE PARTITION BETWEEN SHARDS' = 70, 'CREATE USER' = 71, 'ALTER USER' = 72, 'DROP USER' = 73, 'CREATE ROLE' = 74, 'ALTER ROLE' = 75, 'DROP ROLE' = 76, 'ROLE ADMIN' = 77, 'CREATE ROW POLICY' = 78, 'ALTER ROW POLICY' = 79, 'DROP ROW POLICY' = 80, 'CREATE QUOTA' = 81, 'ALTER QUOTA' = 82, 'DROP QUOTA' = 83, 'CREATE SETTINGS PROFILE' = 84, 'ALTER SETTINGS PROFILE' = 85, 'DROP SETTINGS PROFILE' = 86, 'SHOW USERS' = 87, 'SHOW ROLES' = 88, 'SHOW ROW POLICIES' = 89, 'SHOW QUOTAS' = 90, 'SHOW SETTINGS PROFILES' = 91, 'SHOW ACCESS' = 92, 'ACCESS MANAGEMENT' = 93, 'SHOW NAMED COLLECTIONS' = 94, 'SHOW NAMED COLLECTIONS SECRETS' = 95, 'NAMED COLLECTION CONTROL' = 96, 'SYSTEM SHUTDOWN' = 97, 'SYSTEM DROP DNS CACHE' = 98, 'SYSTEM DROP MARK CACHE' = 99, 'SYSTEM DROP UNCOMPRESSED CACHE' = 100, 'SYSTEM DROP MMAP CACHE' = 101, 'SYSTEM DROP QUERY CACHE' = 102, 'SYSTEM DROP COMPILED EXPRESSION CACHE' = 103, 'SYSTEM DROP FILESYSTEM CACHE' = 104, 'SYSTEM DROP SCHEMA CACHE' = 105, 'SYSTEM DROP S3 CLIENT CACHE' = 106, 'SYSTEM DROP CACHE' = 107, 'SYSTEM RELOAD CONFIG' = 108, 'SYSTEM RELOAD USERS' = 109, 'SYSTEM RELOAD SYMBOLS' = 110, 'SYSTEM RELOAD DICTIONARY' = 111, 'SYSTEM RELOAD MODEL' = 112, 'SYSTEM RELOAD FUNCTION' = 113, 'SYSTEM RELOAD EMBEDDED DICTIONARIES' = 114, 'SYSTEM RELOAD' = 115, 'SYSTEM RESTART DISK' = 116, 'SYSTEM MERGES' = 117, 'SYSTEM TTL MERGES' = 118, 'SYSTEM FETCHES' = 119, 'SYSTEM MOVES' = 120, 'SYSTEM DISTRIBUTED SENDS' = 121, 'SYSTEM REPLICATED SENDS' = 122, 'SYSTEM SENDS' = 123, 'SYSTEM REPLICATION QUEUES' = 124, 'SYSTEM DROP REPLICA' = 125, 'SYSTEM SYNC REPLICA' = 126, 'SYSTEM RESTART REPLICA' = 127, 'SYSTEM RESTORE REPLICA' = 128, 'SYSTEM WAIT LOADING PARTS' = 129, 'SYSTEM SYNC DATABASE REPLICA' = 130, 'SYSTEM SYNC TRANSACTION LOG' = 131, 'SYSTEM SYNC FILE CACHE' = 132, 'SYSTEM FLUSH DISTRIBUTED' = 133, 'SYSTEM FLUSH LOGS' = 134, 'SYSTEM FLUSH' = 135, 'SYSTEM THREAD FUZZER' = 136, 'SYSTEM UNFREEZE' = 137, 'SYSTEM FAILPOINT' = 138, 'SYSTEM' = 139, 'dictGet' = 140, 'displaySecretsInShowAndSelect' = 141, 'addressToLine' = 142, 'addressToLineWithInlines' = 143, 'addressToSymbol' = 144, 'demangle' = 145, 'INTROSPECTION' = 146, 'FILE' = 147, 'URL' = 148, 'REMOTE' = 149, 'MONGO' = 150, 'Redis' = 151, 'MEILISEARCH' = 152, 'MYSQL' = 153, 'POSTGRES' = 154, 'SQLITE' = 155, 'ODBC' = 156, 'JDBC' = 157, 'HDFS' = 158, 'S3' = 159, 'HIVE' = 160, 'SOURCES' = 161, 'CLUSTER' = 162, 'ALL' = 163, 'NONE' = 164), `database` Nullable(String), `table` Nullable(String), `column` Nullable(String), diff --git a/tests/queries/0_stateless/02414_all_new_table_functions_must_be_documented.reference b/tests/queries/0_stateless/02414_all_new_table_functions_must_be_documented.reference index 2277e19cf25..4f16e57d606 100644 --- a/tests/queries/0_stateless/02414_all_new_table_functions_must_be_documented.reference +++ b/tests/queries/0_stateless/02414_all_new_table_functions_must_be_documented.reference @@ -9,6 +9,7 @@ jdbc meilisearch merge mongodb +redis null numbers numbers_mt From b35867d907d39b790c9f8b3feb709cb9e76a6434 Mon Sep 17 00:00:00 2001 From: JackyWoo Date: Wed, 24 May 2023 18:06:42 +0800 Subject: [PATCH 0270/1072] unify storage type --- src/Dictionaries/RedisDictionarySource.cpp | 10 - src/Dictionaries/RedisDictionarySource.h | 10 - src/Storages/RedisCommon.cpp | 19 +- src/Storages/RedisCommon.h | 5 +- src/Storages/StorageRedis.cpp | 8 +- .../test_storage_redis/__init__.py | 0 .../configs/named_collections.xml | 12 + .../configs_secure/config.d/ssl_conf.xml | 8 + tests/integration/test_storage_redis/test.py | 426 ++++++++++++++++++ 9 files changed, 464 insertions(+), 34 deletions(-) create mode 100644 tests/integration/test_storage_redis/__init__.py create mode 100644 tests/integration/test_storage_redis/configs/named_collections.xml create mode 100644 tests/integration/test_storage_redis/configs_secure/config.d/ssl_conf.xml create mode 100644 tests/integration/test_storage_redis/test.py diff --git a/src/Dictionaries/RedisDictionarySource.cpp b/src/Dictionaries/RedisDictionarySource.cpp index 1056383bc84..d28b7528d23 100644 --- a/src/Dictionaries/RedisDictionarySource.cpp +++ b/src/Dictionaries/RedisDictionarySource.cpp @@ -20,16 +20,6 @@ namespace DB extern const int LOGICAL_ERROR; } - static RedisStorageType parseStorageType(const String & storage_type_str) - { - if (storage_type_str == "hash_map") - return RedisStorageType::HASH_MAP; - else if (!storage_type_str.empty() && storage_type_str != "simple") - throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "Unknown storage type {} for Redis dictionary", storage_type_str); - - return RedisStorageType::SIMPLE; - } - void registerDictionarySourceRedis(DictionarySourceFactory & factory) { auto create_table_source = [=](const DictionaryStructure & dict_struct, diff --git a/src/Dictionaries/RedisDictionarySource.h b/src/Dictionaries/RedisDictionarySource.h index c7786284dc4..a55f220321d 100644 --- a/src/Dictionaries/RedisDictionarySource.h +++ b/src/Dictionaries/RedisDictionarySource.h @@ -7,16 +7,6 @@ #include "IDictionarySource.h" #include -namespace Poco -{ - namespace Redis - { - class Client; - class Array; - class Command; - } -} - namespace DB { namespace ErrorCodes diff --git a/src/Storages/RedisCommon.cpp b/src/Storages/RedisCommon.cpp index 63a8d911bf0..8cc94c45dae 100644 --- a/src/Storages/RedisCommon.cpp +++ b/src/Storages/RedisCommon.cpp @@ -9,6 +9,7 @@ namespace ErrorCodes extern const int INVALID_REDIS_TABLE_STRUCTURE; extern const int INTERNAL_REDIS_ERROR; extern const int TIMEOUT_EXCEEDED; + extern const int INVALID_REDIS_STORAGE_TYPE; } RedisColumnTypes REDIS_HASH_MAP_COLUMN_TYPES = {RedisColumnType::KEY, RedisColumnType::FIELD, RedisColumnType::VALUE}; @@ -24,27 +25,27 @@ RedisConnection::~RedisConnection() pool->returnObject(std::move(client)); } -String storageTypeToKeyType(RedisStorageType storage_type) +String serializeStorageType(RedisStorageType storage_type) { switch (storage_type) { case RedisStorageType::SIMPLE: - return "string"; + return "simple"; case RedisStorageType::HASH_MAP: - return "hash"; + return "hash_map"; default: return "none"; } } -RedisStorageType keyTypeToStorageType(const String & key_type) +RedisStorageType parseStorageType(const String & storage_type_str) { - if (key_type == "string") - return RedisStorageType::SIMPLE; - else if (key_type == "hash") + if (storage_type_str == "hash_map") return RedisStorageType::HASH_MAP; - else - return RedisStorageType::UNKNOWN; + else if (!storage_type_str.empty() && storage_type_str != "simple") + throw Exception(ErrorCodes::INVALID_REDIS_STORAGE_TYPE, "Unknown storage type {} for Redis dictionary", storage_type_str); + + return RedisStorageType::SIMPLE; } RedisConnectionPtr getRedisConnection(RedisPoolPtr pool, const RedisConfiguration & configuration) diff --git a/src/Storages/RedisCommon.h b/src/Storages/RedisCommon.h index 348c2494632..02d0b435b9d 100644 --- a/src/Storages/RedisCommon.h +++ b/src/Storages/RedisCommon.h @@ -36,8 +36,11 @@ using RedisColumnTypes = std::vector; extern RedisColumnTypes REDIS_HASH_MAP_COLUMN_TYPES; extern RedisColumnTypes REDIS_SIMPLE_COLUMN_TYPES; +/// storage type to Redis key type String storageTypeToKeyType(RedisStorageType storage_type); -RedisStorageType keyTypeToStorageType(const String & key_type); + +RedisStorageType parseStorageType(const String & storage_type_str); +String serializeStorageType(RedisStorageType storage_type); struct RedisConfiguration { diff --git a/src/Storages/StorageRedis.cpp b/src/Storages/StorageRedis.cpp index 3d7721bdc0e..819ab01d733 100644 --- a/src/Storages/StorageRedis.cpp +++ b/src/Storages/StorageRedis.cpp @@ -77,7 +77,7 @@ Pipe StorageRedis::read( { RedisCommand command_for_keys("KEYS"); /// generate keys by table name prefix -// command_for_keys << table_id.getTableName() + ":" + storageTypeToKeyType(configuration.storage_type) + ":*"; +// command_for_keys << table_id.getTableName() + ":" + serializeStorageType(configuration.storage_type) + ":*"; command_for_keys << "*"; auto all_keys = connection->client->execute(command_for_keys); @@ -178,7 +178,7 @@ RedisConfiguration StorageRedis::getConfiguration(ASTs engine_args, ContextPtr c configuration.port = static_cast(named_collection->get("port")); configuration.password = named_collection->get("password"); configuration.db_index = static_cast(named_collection->get({"db_index"})); - configuration.storage_type = keyTypeToStorageType(named_collection->getOrDefault("storage_type", "")); + configuration.storage_type = parseStorageType(named_collection->getOrDefault("storage_type", "")); configuration.pool_size = static_cast(named_collection->get("pool_size")); } else @@ -193,7 +193,7 @@ RedisConfiguration StorageRedis::getConfiguration(ASTs engine_args, ContextPtr c configuration.port = parsed_host_port.second; configuration.db_index = static_cast(checkAndGetLiteralArgument(engine_args[1], "db_index")); configuration.password = checkAndGetLiteralArgument(engine_args[2], "password"); - configuration.storage_type = keyTypeToStorageType(checkAndGetLiteralArgument(engine_args[3], "storage_type")); + configuration.storage_type = parseStorageType(checkAndGetLiteralArgument(engine_args[3], "storage_type")); configuration.pool_size = static_cast(checkAndGetLiteralArgument(engine_args[4], "pool_size")); } @@ -222,7 +222,7 @@ void registerStorageRedis(StorageFactory & factory) args.comment); }, { - .source_access_type = AccessType::Redis, + .source_access_type = AccessType::REDIS, }); } diff --git a/tests/integration/test_storage_redis/__init__.py b/tests/integration/test_storage_redis/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_storage_redis/configs/named_collections.xml b/tests/integration/test_storage_redis/configs/named_collections.xml new file mode 100644 index 00000000000..5f7db390982 --- /dev/null +++ b/tests/integration/test_storage_redis/configs/named_collections.xml @@ -0,0 +1,12 @@ + + + + root + clickhouse + mongo1 + 27017 + test + simple_table + + + diff --git a/tests/integration/test_storage_redis/configs_secure/config.d/ssl_conf.xml b/tests/integration/test_storage_redis/configs_secure/config.d/ssl_conf.xml new file mode 100644 index 00000000000..3efe98e7045 --- /dev/null +++ b/tests/integration/test_storage_redis/configs_secure/config.d/ssl_conf.xml @@ -0,0 +1,8 @@ + + + + + none + + + diff --git a/tests/integration/test_storage_redis/test.py b/tests/integration/test_storage_redis/test.py new file mode 100644 index 00000000000..6ba5520704d --- /dev/null +++ b/tests/integration/test_storage_redis/test.py @@ -0,0 +1,426 @@ +import pymongo + +import pytest +from helpers.client import QueryRuntimeException + +from helpers.cluster import ClickHouseCluster +import datetime + + +@pytest.fixture(scope="module") +def started_cluster(request): + try: + cluster = ClickHouseCluster(__file__) + node = cluster.add_instance( + "node", + main_configs=[ + "configs_secure/config.d/ssl_conf.xml", + "configs/named_collections.xml", + ], + with_mongo=True, + with_mongo_secure=request.param, + ) + cluster.start() + yield cluster + finally: + cluster.shutdown() + + +def get_mongo_connection(started_cluster, secure=False, with_credentials=True): + connection_str = "" + if with_credentials: + connection_str = "mongodb://root:clickhouse@localhost:{}".format( + started_cluster.mongo_port + ) + else: + connection_str = "mongodb://localhost:{}".format( + started_cluster.mongo_no_cred_port + ) + if secure: + connection_str += "/?tls=true&tlsAllowInvalidCertificates=true" + return pymongo.MongoClient(connection_str) + + +@pytest.mark.parametrize("started_cluster", [False], indirect=["started_cluster"]) +def test_simple_select(started_cluster): + mongo_connection = get_mongo_connection(started_cluster) + db = mongo_connection["test"] + db.add_user("root", "clickhouse") + simple_mongo_table = db["simple_table"] + data = [] + for i in range(0, 100): + data.append({"key": i, "data": hex(i * i)}) + simple_mongo_table.insert_many(data) + + node = started_cluster.instances["node"] + node.query( + "CREATE TABLE simple_mongo_table(key UInt64, data String) ENGINE = MongoDB('mongo1:27017', 'test', 'simple_table', 'root', 'clickhouse')" + ) + + assert node.query("SELECT COUNT() FROM simple_mongo_table") == "100\n" + assert ( + node.query("SELECT sum(key) FROM simple_mongo_table") + == str(sum(range(0, 100))) + "\n" + ) + + assert ( + node.query("SELECT data from simple_mongo_table where key = 42") + == hex(42 * 42) + "\n" + ) + node.query("DROP TABLE simple_mongo_table") + simple_mongo_table.drop() + + +@pytest.mark.parametrize("started_cluster", [False], indirect=["started_cluster"]) +def test_arrays(started_cluster): + mongo_connection = get_mongo_connection(started_cluster) + db = mongo_connection["test"] + db.add_user("root", "clickhouse") + arrays_mongo_table = db["arrays_table"] + data = [] + for i in range(0, 100): + data.append( + { + "key": i, + "arr_int64": [-(i + 1), -(i + 2), -(i + 3)], + "arr_int32": [-(i + 1), -(i + 2), -(i + 3)], + "arr_int16": [-(i + 1), -(i + 2), -(i + 3)], + "arr_int8": [-(i + 1), -(i + 2), -(i + 3)], + "arr_uint64": [i + 1, i + 2, i + 3], + "arr_uint32": [i + 1, i + 2, i + 3], + "arr_uint16": [i + 1, i + 2, i + 3], + "arr_uint8": [i + 1, i + 2, i + 3], + "arr_float32": [i + 1.125, i + 2.5, i + 3.750], + "arr_float64": [i + 1.125, i + 2.5, i + 3.750], + "arr_date": [ + datetime.datetime(2002, 10, 27), + datetime.datetime(2024, 1, 8), + ], + "arr_datetime": [ + datetime.datetime(2023, 3, 31, 6, 3, 12), + datetime.datetime(1999, 2, 28, 12, 46, 34), + ], + "arr_string": [str(i + 1), str(i + 2), str(i + 3)], + "arr_uuid": [ + "f0e77736-91d1-48ce-8f01-15123ca1c7ed", + "93376a07-c044-4281-a76e-ad27cf6973c5", + ], + "arr_arr_bool": [ + [True, False, True], + [True], + [], + None, + [False], + [None], + ], + "arr_empty": [], + "arr_null": None, + "arr_nullable": None, + } + ) + + arrays_mongo_table.insert_many(data) + + node = started_cluster.instances["node"] + node.query( + "CREATE TABLE arrays_mongo_table(" + "key UInt64," + "arr_int64 Array(Int64)," + "arr_int32 Array(Int32)," + "arr_int16 Array(Int16)," + "arr_int8 Array(Int8)," + "arr_uint64 Array(UInt64)," + "arr_uint32 Array(UInt32)," + "arr_uint16 Array(UInt16)," + "arr_uint8 Array(UInt8)," + "arr_float32 Array(Float32)," + "arr_float64 Array(Float64)," + "arr_date Array(Date)," + "arr_datetime Array(DateTime)," + "arr_string Array(String)," + "arr_uuid Array(UUID)," + "arr_arr_bool Array(Array(Bool))," + "arr_empty Array(UInt64)," + "arr_null Array(UInt64)," + "arr_arr_null Array(Array(UInt64))," + "arr_nullable Array(Nullable(UInt64))" + ") ENGINE = MongoDB('mongo1:27017', 'test', 'arrays_table', 'root', 'clickhouse')" + ) + + assert node.query("SELECT COUNT() FROM arrays_mongo_table") == "100\n" + + for column_name in ["arr_int64", "arr_int32", "arr_int16", "arr_int8"]: + assert ( + node.query(f"SELECT {column_name} FROM arrays_mongo_table WHERE key = 42") + == "[-43,-44,-45]\n" + ) + + for column_name in ["arr_uint64", "arr_uint32", "arr_uint16", "arr_uint8"]: + assert ( + node.query(f"SELECT {column_name} FROM arrays_mongo_table WHERE key = 42") + == "[43,44,45]\n" + ) + + for column_name in ["arr_float32", "arr_float64"]: + assert ( + node.query(f"SELECT {column_name} FROM arrays_mongo_table WHERE key = 42") + == "[43.125,44.5,45.75]\n" + ) + + assert ( + node.query(f"SELECT arr_date FROM arrays_mongo_table WHERE key = 42") + == "['2002-10-27','2024-01-08']\n" + ) + + assert ( + node.query(f"SELECT arr_datetime FROM arrays_mongo_table WHERE key = 42") + == "['2023-03-31 06:03:12','1999-02-28 12:46:34']\n" + ) + + assert ( + node.query(f"SELECT arr_string FROM arrays_mongo_table WHERE key = 42") + == "['43','44','45']\n" + ) + + assert ( + node.query(f"SELECT arr_uuid FROM arrays_mongo_table WHERE key = 42") + == "['f0e77736-91d1-48ce-8f01-15123ca1c7ed','93376a07-c044-4281-a76e-ad27cf6973c5']\n" + ) + + assert ( + node.query(f"SELECT arr_arr_bool FROM arrays_mongo_table WHERE key = 42") + == "[[true,false,true],[true],[],[],[false],[false]]\n" + ) + + assert ( + node.query(f"SELECT arr_empty FROM arrays_mongo_table WHERE key = 42") == "[]\n" + ) + + assert ( + node.query(f"SELECT arr_null FROM arrays_mongo_table WHERE key = 42") == "[]\n" + ) + + assert ( + node.query(f"SELECT arr_arr_null FROM arrays_mongo_table WHERE key = 42") + == "[]\n" + ) + + assert ( + node.query(f"SELECT arr_nullable FROM arrays_mongo_table WHERE key = 42") + == "[]\n" + ) + + node.query("DROP TABLE arrays_mongo_table") + arrays_mongo_table.drop() + + +@pytest.mark.parametrize("started_cluster", [False], indirect=["started_cluster"]) +def test_complex_data_type(started_cluster): + mongo_connection = get_mongo_connection(started_cluster) + db = mongo_connection["test"] + db.add_user("root", "clickhouse") + incomplete_mongo_table = db["complex_table"] + data = [] + for i in range(0, 100): + data.append({"key": i, "data": hex(i * i), "dict": {"a": i, "b": str(i)}}) + incomplete_mongo_table.insert_many(data) + + node = started_cluster.instances["node"] + node.query( + "CREATE TABLE incomplete_mongo_table(key UInt64, data String) ENGINE = MongoDB('mongo1:27017', 'test', 'complex_table', 'root', 'clickhouse')" + ) + + assert node.query("SELECT COUNT() FROM incomplete_mongo_table") == "100\n" + assert ( + node.query("SELECT sum(key) FROM incomplete_mongo_table") + == str(sum(range(0, 100))) + "\n" + ) + + assert ( + node.query("SELECT data from incomplete_mongo_table where key = 42") + == hex(42 * 42) + "\n" + ) + node.query("DROP TABLE incomplete_mongo_table") + incomplete_mongo_table.drop() + + +@pytest.mark.parametrize("started_cluster", [False], indirect=["started_cluster"]) +def test_incorrect_data_type(started_cluster): + mongo_connection = get_mongo_connection(started_cluster) + db = mongo_connection["test"] + db.add_user("root", "clickhouse") + strange_mongo_table = db["strange_table"] + data = [] + for i in range(0, 100): + data.append({"key": i, "data": hex(i * i), "aaaa": "Hello"}) + strange_mongo_table.insert_many(data) + + node = started_cluster.instances["node"] + node.query( + "CREATE TABLE strange_mongo_table(key String, data String) ENGINE = MongoDB('mongo1:27017', 'test', 'strange_table', 'root', 'clickhouse')" + ) + + with pytest.raises(QueryRuntimeException): + node.query("SELECT COUNT() FROM strange_mongo_table") + + with pytest.raises(QueryRuntimeException): + node.query("SELECT uniq(key) FROM strange_mongo_table") + + node.query( + "CREATE TABLE strange_mongo_table2(key UInt64, data String, bbbb String) ENGINE = MongoDB('mongo1:27017', 'test', 'strange_table', 'root', 'clickhouse')" + ) + + node.query("DROP TABLE strange_mongo_table") + node.query("DROP TABLE strange_mongo_table2") + strange_mongo_table.drop() + + +@pytest.mark.parametrize("started_cluster", [True], indirect=["started_cluster"]) +def test_secure_connection(started_cluster): + mongo_connection = get_mongo_connection(started_cluster, secure=True) + db = mongo_connection["test"] + db.add_user("root", "clickhouse") + simple_mongo_table = db["simple_table"] + data = [] + for i in range(0, 100): + data.append({"key": i, "data": hex(i * i)}) + simple_mongo_table.insert_many(data) + + node = started_cluster.instances["node"] + node.query( + "CREATE TABLE simple_mongo_table(key UInt64, data String) ENGINE = MongoDB('mongo1:27017', 'test', 'simple_table', 'root', 'clickhouse', 'ssl=true')" + ) + + assert node.query("SELECT COUNT() FROM simple_mongo_table") == "100\n" + assert ( + node.query("SELECT sum(key) FROM simple_mongo_table") + == str(sum(range(0, 100))) + "\n" + ) + + assert ( + node.query("SELECT data from simple_mongo_table where key = 42") + == hex(42 * 42) + "\n" + ) + node.query("DROP TABLE simple_mongo_table") + simple_mongo_table.drop() + + +@pytest.mark.parametrize("started_cluster", [False], indirect=["started_cluster"]) +def test_predefined_connection_configuration(started_cluster): + mongo_connection = get_mongo_connection(started_cluster) + db = mongo_connection["test"] + db.add_user("root", "clickhouse") + simple_mongo_table = db["simple_table"] + data = [] + for i in range(0, 100): + data.append({"key": i, "data": hex(i * i)}) + simple_mongo_table.insert_many(data) + + node = started_cluster.instances["node"] + node.query("drop table if exists simple_mongo_table") + node.query( + "create table simple_mongo_table(key UInt64, data String) engine = MongoDB(mongo1)" + ) + assert node.query("SELECT count() FROM simple_mongo_table") == "100\n" + simple_mongo_table.drop() + + +@pytest.mark.parametrize("started_cluster", [False], indirect=["started_cluster"]) +def test_no_credentials(started_cluster): + mongo_connection = get_mongo_connection(started_cluster, with_credentials=False) + db = mongo_connection["test"] + simple_mongo_table = db["simple_table"] + data = [] + for i in range(0, 100): + data.append({"key": i, "data": hex(i * i)}) + simple_mongo_table.insert_many(data) + + node = started_cluster.instances["node"] + node.query( + "create table simple_mongo_table_2(key UInt64, data String) engine = MongoDB('mongo2:27017', 'test', 'simple_table', '', '')" + ) + assert node.query("SELECT count() FROM simple_mongo_table_2") == "100\n" + simple_mongo_table.drop() + + +@pytest.mark.parametrize("started_cluster", [False], indirect=["started_cluster"]) +def test_auth_source(started_cluster): + mongo_connection = get_mongo_connection(started_cluster, with_credentials=False) + admin_db = mongo_connection["admin"] + admin_db.add_user( + "root", + "clickhouse", + roles=[{"role": "userAdminAnyDatabase", "db": "admin"}, "readWriteAnyDatabase"], + ) + simple_mongo_table = admin_db["simple_table"] + data = [] + for i in range(0, 50): + data.append({"key": i, "data": hex(i * i)}) + simple_mongo_table.insert_many(data) + db = mongo_connection["test"] + simple_mongo_table = db["simple_table"] + data = [] + for i in range(0, 100): + data.append({"key": i, "data": hex(i * i)}) + simple_mongo_table.insert_many(data) + + node = started_cluster.instances["node"] + node.query( + "create table simple_mongo_table_fail(key UInt64, data String) engine = MongoDB('mongo2:27017', 'test', 'simple_table', 'root', 'clickhouse')" + ) + node.query_and_get_error("SELECT count() FROM simple_mongo_table_fail") + node.query( + "create table simple_mongo_table_ok(key UInt64, data String) engine = MongoDB('mongo2:27017', 'test', 'simple_table', 'root', 'clickhouse', 'authSource=admin')" + ) + assert node.query("SELECT count() FROM simple_mongo_table_ok") == "100\n" + simple_mongo_table.drop() + + +@pytest.mark.parametrize("started_cluster", [False], indirect=["started_cluster"]) +def test_missing_columns(started_cluster): + mongo_connection = get_mongo_connection(started_cluster) + db = mongo_connection["test"] + db.add_user("root", "clickhouse") + simple_mongo_table = db["simple_table"] + data = [] + for i in range(0, 10): + data.append({"key": i, "data": hex(i * i)}) + for i in range(0, 10): + data.append({"key": i}) + simple_mongo_table.insert_many(data) + + node = started_cluster.instances["node"] + node.query("drop table if exists simple_mongo_table") + node.query( + "create table simple_mongo_table(key UInt64, data Nullable(String)) engine = MongoDB(mongo1)" + ) + result = node.query("SELECT count() FROM simple_mongo_table WHERE isNull(data)") + assert result == "10\n" + simple_mongo_table.drop() + + +@pytest.mark.parametrize("started_cluster", [False], indirect=["started_cluster"]) +def test_simple_insert_select(started_cluster): + mongo_connection = get_mongo_connection(started_cluster) + db = mongo_connection["test"] + db.add_user("root", "clickhouse") + simple_mongo_table = db["simple_table"] + + node = started_cluster.instances["node"] + node.query("DROP TABLE IF EXISTS simple_mongo_table") + node.query( + "CREATE TABLE simple_mongo_table(key UInt64, data String) ENGINE = MongoDB('mongo1:27017', 'test', 'simple_table', 'root', 'clickhouse')" + ) + node.query("INSERT INTO simple_mongo_table SELECT 1, 'kek'") + + assert ( + node.query("SELECT data from simple_mongo_table where key = 1").strip() == "kek" + ) + node.query("INSERT INTO simple_mongo_table(key) SELECT 12") + assert int(node.query("SELECT count() from simple_mongo_table")) == 2 + assert ( + node.query("SELECT data from simple_mongo_table where key = 12").strip() == "" + ) + + node.query("DROP TABLE simple_mongo_table") + simple_mongo_table.drop() From 412d9ba259cb91243d28765326d65384a60e18cc Mon Sep 17 00:00:00 2001 From: JackyWoo Date: Thu, 25 May 2023 12:33:07 +0800 Subject: [PATCH 0271/1072] add tests for redis storage --- src/Storages/RedisCommon.cpp | 15 + src/Storages/RedisCommon.h | 2 +- .../configs/named_collections.xml | 12 - tests/integration/test_storage_redis/test.py | 443 ++---------------- .../test_table_function_redis/__init__.py | 0 .../configs_secure/config.d/ssl_conf.xml | 0 .../test_table_function_redis/test.py | 276 +++++++++++ 7 files changed, 329 insertions(+), 419 deletions(-) delete mode 100644 tests/integration/test_storage_redis/configs/named_collections.xml create mode 100644 tests/integration/test_table_function_redis/__init__.py rename tests/integration/{test_storage_redis => test_table_function_redis}/configs_secure/config.d/ssl_conf.xml (100%) create mode 100644 tests/integration/test_table_function_redis/test.py diff --git a/src/Storages/RedisCommon.cpp b/src/Storages/RedisCommon.cpp index 8cc94c45dae..fc789057019 100644 --- a/src/Storages/RedisCommon.cpp +++ b/src/Storages/RedisCommon.cpp @@ -25,6 +25,21 @@ RedisConnection::~RedisConnection() pool->returnObject(std::move(client)); } +String storageTypeToKeyType(RedisStorageType type) +{ + switch (type) + { + case RedisStorageType::SIMPLE: + return "string"; + case RedisStorageType::HASH_MAP: + return "hash"; + default: + return "none"; + } + + UNREACHABLE(); +} + String serializeStorageType(RedisStorageType storage_type) { switch (storage_type) diff --git a/src/Storages/RedisCommon.h b/src/Storages/RedisCommon.h index 02d0b435b9d..d68f2567248 100644 --- a/src/Storages/RedisCommon.h +++ b/src/Storages/RedisCommon.h @@ -37,7 +37,7 @@ extern RedisColumnTypes REDIS_HASH_MAP_COLUMN_TYPES; extern RedisColumnTypes REDIS_SIMPLE_COLUMN_TYPES; /// storage type to Redis key type -String storageTypeToKeyType(RedisStorageType storage_type); +String storageTypeToKeyType(RedisStorageType type); RedisStorageType parseStorageType(const String & storage_type_str); String serializeStorageType(RedisStorageType storage_type); diff --git a/tests/integration/test_storage_redis/configs/named_collections.xml b/tests/integration/test_storage_redis/configs/named_collections.xml deleted file mode 100644 index 5f7db390982..00000000000 --- a/tests/integration/test_storage_redis/configs/named_collections.xml +++ /dev/null @@ -1,12 +0,0 @@ - - - - root - clickhouse - mongo1 - 27017 - test - simple_table - - - diff --git a/tests/integration/test_storage_redis/test.py b/tests/integration/test_storage_redis/test.py index 6ba5520704d..4220563c229 100644 --- a/tests/integration/test_storage_redis/test.py +++ b/tests/integration/test_storage_redis/test.py @@ -1,426 +1,57 @@ -import pymongo - +import redis import pytest -from helpers.client import QueryRuntimeException - from helpers.cluster import ClickHouseCluster -import datetime + +cluster = ClickHouseCluster(__file__) + +node = cluster.add_instance("node", with_redis=True) @pytest.fixture(scope="module") -def started_cluster(request): +def started_cluster(): try: - cluster = ClickHouseCluster(__file__) - node = cluster.add_instance( - "node", - main_configs=[ - "configs_secure/config.d/ssl_conf.xml", - "configs/named_collections.xml", - ], - with_mongo=True, - with_mongo_secure=request.param, - ) cluster.start() yield cluster finally: cluster.shutdown() -def get_mongo_connection(started_cluster, secure=False, with_credentials=True): - connection_str = "" - if with_credentials: - connection_str = "mongodb://root:clickhouse@localhost:{}".format( - started_cluster.mongo_port - ) - else: - connection_str = "mongodb://localhost:{}".format( - started_cluster.mongo_no_cred_port - ) - if secure: - connection_str += "/?tls=true&tlsAllowInvalidCertificates=true" - return pymongo.MongoClient(connection_str) +def get_redis_connection(db_id=0): + client = redis.Redis( + host=node.name, port=started_cluster.redis_port, password="clickhouse", db=db_id + ) + return client -@pytest.mark.parametrize("started_cluster", [False], indirect=["started_cluster"]) -def test_simple_select(started_cluster): - mongo_connection = get_mongo_connection(started_cluster) - db = mongo_connection["test"] - db.add_user("root", "clickhouse") - simple_mongo_table = db["simple_table"] - data = [] - for i in range(0, 100): - data.append({"key": i, "data": hex(i * i)}) - simple_mongo_table.insert_many(data) +def get_address(): + return node.name + started_cluster.redis_port - node = started_cluster.instances["node"] + +@pytest.mark.parametrize("started_cluster") +def test_storage_simple_select(started_cluster): + client = get_redis_connection() + address = get_address() + + data = {} + for i in range(100): + data['key{}'.format(i)] = 'value{}'.format(i) + + client.mset(data) + + # create table node.query( - "CREATE TABLE simple_mongo_table(key UInt64, data String) ENGINE = MongoDB('mongo1:27017', 'test', 'simple_table', 'root', 'clickhouse')" + f""" + CREATE TABLE test_storage_simple_select( + k String, + v String + ) Engine=Redis('{address}', 0, '','simple', 10) + """ ) - assert node.query("SELECT COUNT() FROM simple_mongo_table") == "100\n" - assert ( - node.query("SELECT sum(key) FROM simple_mongo_table") - == str(sum(range(0, 100))) + "\n" - ) + select_query = "SELECT k, v from test_storage_simple_select where k='0' FORMAT Values" + assert (node.query(select_query) == "('0','0')") - assert ( - node.query("SELECT data from simple_mongo_table where key = 42") - == hex(42 * 42) + "\n" - ) - node.query("DROP TABLE simple_mongo_table") - simple_mongo_table.drop() + select_query = "SELECT * from test_storage_simple_select FORMAT Values" + assert (len(node.query(select_query)) == 100) + assert (node.query(select_query)[0] == "('0','0')") - -@pytest.mark.parametrize("started_cluster", [False], indirect=["started_cluster"]) -def test_arrays(started_cluster): - mongo_connection = get_mongo_connection(started_cluster) - db = mongo_connection["test"] - db.add_user("root", "clickhouse") - arrays_mongo_table = db["arrays_table"] - data = [] - for i in range(0, 100): - data.append( - { - "key": i, - "arr_int64": [-(i + 1), -(i + 2), -(i + 3)], - "arr_int32": [-(i + 1), -(i + 2), -(i + 3)], - "arr_int16": [-(i + 1), -(i + 2), -(i + 3)], - "arr_int8": [-(i + 1), -(i + 2), -(i + 3)], - "arr_uint64": [i + 1, i + 2, i + 3], - "arr_uint32": [i + 1, i + 2, i + 3], - "arr_uint16": [i + 1, i + 2, i + 3], - "arr_uint8": [i + 1, i + 2, i + 3], - "arr_float32": [i + 1.125, i + 2.5, i + 3.750], - "arr_float64": [i + 1.125, i + 2.5, i + 3.750], - "arr_date": [ - datetime.datetime(2002, 10, 27), - datetime.datetime(2024, 1, 8), - ], - "arr_datetime": [ - datetime.datetime(2023, 3, 31, 6, 3, 12), - datetime.datetime(1999, 2, 28, 12, 46, 34), - ], - "arr_string": [str(i + 1), str(i + 2), str(i + 3)], - "arr_uuid": [ - "f0e77736-91d1-48ce-8f01-15123ca1c7ed", - "93376a07-c044-4281-a76e-ad27cf6973c5", - ], - "arr_arr_bool": [ - [True, False, True], - [True], - [], - None, - [False], - [None], - ], - "arr_empty": [], - "arr_null": None, - "arr_nullable": None, - } - ) - - arrays_mongo_table.insert_many(data) - - node = started_cluster.instances["node"] - node.query( - "CREATE TABLE arrays_mongo_table(" - "key UInt64," - "arr_int64 Array(Int64)," - "arr_int32 Array(Int32)," - "arr_int16 Array(Int16)," - "arr_int8 Array(Int8)," - "arr_uint64 Array(UInt64)," - "arr_uint32 Array(UInt32)," - "arr_uint16 Array(UInt16)," - "arr_uint8 Array(UInt8)," - "arr_float32 Array(Float32)," - "arr_float64 Array(Float64)," - "arr_date Array(Date)," - "arr_datetime Array(DateTime)," - "arr_string Array(String)," - "arr_uuid Array(UUID)," - "arr_arr_bool Array(Array(Bool))," - "arr_empty Array(UInt64)," - "arr_null Array(UInt64)," - "arr_arr_null Array(Array(UInt64))," - "arr_nullable Array(Nullable(UInt64))" - ") ENGINE = MongoDB('mongo1:27017', 'test', 'arrays_table', 'root', 'clickhouse')" - ) - - assert node.query("SELECT COUNT() FROM arrays_mongo_table") == "100\n" - - for column_name in ["arr_int64", "arr_int32", "arr_int16", "arr_int8"]: - assert ( - node.query(f"SELECT {column_name} FROM arrays_mongo_table WHERE key = 42") - == "[-43,-44,-45]\n" - ) - - for column_name in ["arr_uint64", "arr_uint32", "arr_uint16", "arr_uint8"]: - assert ( - node.query(f"SELECT {column_name} FROM arrays_mongo_table WHERE key = 42") - == "[43,44,45]\n" - ) - - for column_name in ["arr_float32", "arr_float64"]: - assert ( - node.query(f"SELECT {column_name} FROM arrays_mongo_table WHERE key = 42") - == "[43.125,44.5,45.75]\n" - ) - - assert ( - node.query(f"SELECT arr_date FROM arrays_mongo_table WHERE key = 42") - == "['2002-10-27','2024-01-08']\n" - ) - - assert ( - node.query(f"SELECT arr_datetime FROM arrays_mongo_table WHERE key = 42") - == "['2023-03-31 06:03:12','1999-02-28 12:46:34']\n" - ) - - assert ( - node.query(f"SELECT arr_string FROM arrays_mongo_table WHERE key = 42") - == "['43','44','45']\n" - ) - - assert ( - node.query(f"SELECT arr_uuid FROM arrays_mongo_table WHERE key = 42") - == "['f0e77736-91d1-48ce-8f01-15123ca1c7ed','93376a07-c044-4281-a76e-ad27cf6973c5']\n" - ) - - assert ( - node.query(f"SELECT arr_arr_bool FROM arrays_mongo_table WHERE key = 42") - == "[[true,false,true],[true],[],[],[false],[false]]\n" - ) - - assert ( - node.query(f"SELECT arr_empty FROM arrays_mongo_table WHERE key = 42") == "[]\n" - ) - - assert ( - node.query(f"SELECT arr_null FROM arrays_mongo_table WHERE key = 42") == "[]\n" - ) - - assert ( - node.query(f"SELECT arr_arr_null FROM arrays_mongo_table WHERE key = 42") - == "[]\n" - ) - - assert ( - node.query(f"SELECT arr_nullable FROM arrays_mongo_table WHERE key = 42") - == "[]\n" - ) - - node.query("DROP TABLE arrays_mongo_table") - arrays_mongo_table.drop() - - -@pytest.mark.parametrize("started_cluster", [False], indirect=["started_cluster"]) -def test_complex_data_type(started_cluster): - mongo_connection = get_mongo_connection(started_cluster) - db = mongo_connection["test"] - db.add_user("root", "clickhouse") - incomplete_mongo_table = db["complex_table"] - data = [] - for i in range(0, 100): - data.append({"key": i, "data": hex(i * i), "dict": {"a": i, "b": str(i)}}) - incomplete_mongo_table.insert_many(data) - - node = started_cluster.instances["node"] - node.query( - "CREATE TABLE incomplete_mongo_table(key UInt64, data String) ENGINE = MongoDB('mongo1:27017', 'test', 'complex_table', 'root', 'clickhouse')" - ) - - assert node.query("SELECT COUNT() FROM incomplete_mongo_table") == "100\n" - assert ( - node.query("SELECT sum(key) FROM incomplete_mongo_table") - == str(sum(range(0, 100))) + "\n" - ) - - assert ( - node.query("SELECT data from incomplete_mongo_table where key = 42") - == hex(42 * 42) + "\n" - ) - node.query("DROP TABLE incomplete_mongo_table") - incomplete_mongo_table.drop() - - -@pytest.mark.parametrize("started_cluster", [False], indirect=["started_cluster"]) -def test_incorrect_data_type(started_cluster): - mongo_connection = get_mongo_connection(started_cluster) - db = mongo_connection["test"] - db.add_user("root", "clickhouse") - strange_mongo_table = db["strange_table"] - data = [] - for i in range(0, 100): - data.append({"key": i, "data": hex(i * i), "aaaa": "Hello"}) - strange_mongo_table.insert_many(data) - - node = started_cluster.instances["node"] - node.query( - "CREATE TABLE strange_mongo_table(key String, data String) ENGINE = MongoDB('mongo1:27017', 'test', 'strange_table', 'root', 'clickhouse')" - ) - - with pytest.raises(QueryRuntimeException): - node.query("SELECT COUNT() FROM strange_mongo_table") - - with pytest.raises(QueryRuntimeException): - node.query("SELECT uniq(key) FROM strange_mongo_table") - - node.query( - "CREATE TABLE strange_mongo_table2(key UInt64, data String, bbbb String) ENGINE = MongoDB('mongo1:27017', 'test', 'strange_table', 'root', 'clickhouse')" - ) - - node.query("DROP TABLE strange_mongo_table") - node.query("DROP TABLE strange_mongo_table2") - strange_mongo_table.drop() - - -@pytest.mark.parametrize("started_cluster", [True], indirect=["started_cluster"]) -def test_secure_connection(started_cluster): - mongo_connection = get_mongo_connection(started_cluster, secure=True) - db = mongo_connection["test"] - db.add_user("root", "clickhouse") - simple_mongo_table = db["simple_table"] - data = [] - for i in range(0, 100): - data.append({"key": i, "data": hex(i * i)}) - simple_mongo_table.insert_many(data) - - node = started_cluster.instances["node"] - node.query( - "CREATE TABLE simple_mongo_table(key UInt64, data String) ENGINE = MongoDB('mongo1:27017', 'test', 'simple_table', 'root', 'clickhouse', 'ssl=true')" - ) - - assert node.query("SELECT COUNT() FROM simple_mongo_table") == "100\n" - assert ( - node.query("SELECT sum(key) FROM simple_mongo_table") - == str(sum(range(0, 100))) + "\n" - ) - - assert ( - node.query("SELECT data from simple_mongo_table where key = 42") - == hex(42 * 42) + "\n" - ) - node.query("DROP TABLE simple_mongo_table") - simple_mongo_table.drop() - - -@pytest.mark.parametrize("started_cluster", [False], indirect=["started_cluster"]) -def test_predefined_connection_configuration(started_cluster): - mongo_connection = get_mongo_connection(started_cluster) - db = mongo_connection["test"] - db.add_user("root", "clickhouse") - simple_mongo_table = db["simple_table"] - data = [] - for i in range(0, 100): - data.append({"key": i, "data": hex(i * i)}) - simple_mongo_table.insert_many(data) - - node = started_cluster.instances["node"] - node.query("drop table if exists simple_mongo_table") - node.query( - "create table simple_mongo_table(key UInt64, data String) engine = MongoDB(mongo1)" - ) - assert node.query("SELECT count() FROM simple_mongo_table") == "100\n" - simple_mongo_table.drop() - - -@pytest.mark.parametrize("started_cluster", [False], indirect=["started_cluster"]) -def test_no_credentials(started_cluster): - mongo_connection = get_mongo_connection(started_cluster, with_credentials=False) - db = mongo_connection["test"] - simple_mongo_table = db["simple_table"] - data = [] - for i in range(0, 100): - data.append({"key": i, "data": hex(i * i)}) - simple_mongo_table.insert_many(data) - - node = started_cluster.instances["node"] - node.query( - "create table simple_mongo_table_2(key UInt64, data String) engine = MongoDB('mongo2:27017', 'test', 'simple_table', '', '')" - ) - assert node.query("SELECT count() FROM simple_mongo_table_2") == "100\n" - simple_mongo_table.drop() - - -@pytest.mark.parametrize("started_cluster", [False], indirect=["started_cluster"]) -def test_auth_source(started_cluster): - mongo_connection = get_mongo_connection(started_cluster, with_credentials=False) - admin_db = mongo_connection["admin"] - admin_db.add_user( - "root", - "clickhouse", - roles=[{"role": "userAdminAnyDatabase", "db": "admin"}, "readWriteAnyDatabase"], - ) - simple_mongo_table = admin_db["simple_table"] - data = [] - for i in range(0, 50): - data.append({"key": i, "data": hex(i * i)}) - simple_mongo_table.insert_many(data) - db = mongo_connection["test"] - simple_mongo_table = db["simple_table"] - data = [] - for i in range(0, 100): - data.append({"key": i, "data": hex(i * i)}) - simple_mongo_table.insert_many(data) - - node = started_cluster.instances["node"] - node.query( - "create table simple_mongo_table_fail(key UInt64, data String) engine = MongoDB('mongo2:27017', 'test', 'simple_table', 'root', 'clickhouse')" - ) - node.query_and_get_error("SELECT count() FROM simple_mongo_table_fail") - node.query( - "create table simple_mongo_table_ok(key UInt64, data String) engine = MongoDB('mongo2:27017', 'test', 'simple_table', 'root', 'clickhouse', 'authSource=admin')" - ) - assert node.query("SELECT count() FROM simple_mongo_table_ok") == "100\n" - simple_mongo_table.drop() - - -@pytest.mark.parametrize("started_cluster", [False], indirect=["started_cluster"]) -def test_missing_columns(started_cluster): - mongo_connection = get_mongo_connection(started_cluster) - db = mongo_connection["test"] - db.add_user("root", "clickhouse") - simple_mongo_table = db["simple_table"] - data = [] - for i in range(0, 10): - data.append({"key": i, "data": hex(i * i)}) - for i in range(0, 10): - data.append({"key": i}) - simple_mongo_table.insert_many(data) - - node = started_cluster.instances["node"] - node.query("drop table if exists simple_mongo_table") - node.query( - "create table simple_mongo_table(key UInt64, data Nullable(String)) engine = MongoDB(mongo1)" - ) - result = node.query("SELECT count() FROM simple_mongo_table WHERE isNull(data)") - assert result == "10\n" - simple_mongo_table.drop() - - -@pytest.mark.parametrize("started_cluster", [False], indirect=["started_cluster"]) -def test_simple_insert_select(started_cluster): - mongo_connection = get_mongo_connection(started_cluster) - db = mongo_connection["test"] - db.add_user("root", "clickhouse") - simple_mongo_table = db["simple_table"] - - node = started_cluster.instances["node"] - node.query("DROP TABLE IF EXISTS simple_mongo_table") - node.query( - "CREATE TABLE simple_mongo_table(key UInt64, data String) ENGINE = MongoDB('mongo1:27017', 'test', 'simple_table', 'root', 'clickhouse')" - ) - node.query("INSERT INTO simple_mongo_table SELECT 1, 'kek'") - - assert ( - node.query("SELECT data from simple_mongo_table where key = 1").strip() == "kek" - ) - node.query("INSERT INTO simple_mongo_table(key) SELECT 12") - assert int(node.query("SELECT count() from simple_mongo_table")) == 2 - assert ( - node.query("SELECT data from simple_mongo_table where key = 12").strip() == "" - ) - - node.query("DROP TABLE simple_mongo_table") - simple_mongo_table.drop() diff --git a/tests/integration/test_table_function_redis/__init__.py b/tests/integration/test_table_function_redis/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_storage_redis/configs_secure/config.d/ssl_conf.xml b/tests/integration/test_table_function_redis/configs_secure/config.d/ssl_conf.xml similarity index 100% rename from tests/integration/test_storage_redis/configs_secure/config.d/ssl_conf.xml rename to tests/integration/test_table_function_redis/configs_secure/config.d/ssl_conf.xml diff --git a/tests/integration/test_table_function_redis/test.py b/tests/integration/test_table_function_redis/test.py new file mode 100644 index 00000000000..e0ad71b0079 --- /dev/null +++ b/tests/integration/test_table_function_redis/test.py @@ -0,0 +1,276 @@ +import pymongo + +import pytest +from helpers.client import QueryRuntimeException + +from helpers.cluster import ClickHouseCluster + + +@pytest.fixture(scope="module") +def started_cluster(request): + try: + cluster = ClickHouseCluster(__file__) + node = cluster.add_instance( + "node", + with_mongo=True, + main_configs=[ + "configs_secure/config.d/ssl_conf.xml", + ], + with_mongo_secure=request.param, + ) + cluster.start() + yield cluster + finally: + cluster.shutdown() + + +def get_mongo_connection(started_cluster, secure=False, with_credentials=True): + connection_str = "" + if with_credentials: + connection_str = "mongodb://root:clickhouse@localhost:{}".format( + started_cluster.mongo_port + ) + else: + connection_str = "mongodb://localhost:{}".format( + started_cluster.mongo_no_cred_port + ) + if secure: + connection_str += "/?tls=true&tlsAllowInvalidCertificates=true" + return pymongo.MongoClient(connection_str) + + +@pytest.mark.parametrize("started_cluster", [False], indirect=["started_cluster"]) +def test_simple_select(started_cluster): + mongo_connection = get_mongo_connection(started_cluster) + db = mongo_connection["test"] + db.add_user("root", "clickhouse") + simple_mongo_table = db["simple_table"] + + node = started_cluster.instances["node"] + for i in range(0, 100): + node.query( + "INSERT INTO FUNCTION mongodb('mongo1:27017', 'test', 'simple_table', 'root', 'clickhouse', structure='key UInt64, data String') (key, data) VALUES ({}, '{}')".format( + i, hex(i * i) + ) + ) + assert ( + node.query( + "SELECT COUNT() FROM mongodb('mongo1:27017', 'test', 'simple_table', 'root', 'clickhouse', structure='key UInt64, data String')" + ) + == "100\n" + ) + assert ( + node.query( + "SELECT sum(key) FROM mongodb('mongo1:27017', 'test', 'simple_table', 'root', 'clickhouse', structure='key UInt64, data String')" + ) + == str(sum(range(0, 100))) + "\n" + ) + assert ( + node.query( + "SELECT sum(key) FROM mongodb('mongo1:27017', 'test', 'simple_table', 'root', 'clickhouse', 'key UInt64, data String')" + ) + == str(sum(range(0, 100))) + "\n" + ) + + assert ( + node.query( + "SELECT data from mongodb('mongo1:27017', 'test', 'simple_table', 'root', 'clickhouse', structure='key UInt64, data String') where key = 42" + ) + == hex(42 * 42) + "\n" + ) + simple_mongo_table.drop() + + +@pytest.mark.parametrize("started_cluster", [False], indirect=["started_cluster"]) +def test_complex_data_type(started_cluster): + mongo_connection = get_mongo_connection(started_cluster) + db = mongo_connection["test"] + db.add_user("root", "clickhouse") + incomplete_mongo_table = db["complex_table"] + data = [] + for i in range(0, 100): + data.append({"key": i, "data": hex(i * i), "dict": {"a": i, "b": str(i)}}) + incomplete_mongo_table.insert_many(data) + + node = started_cluster.instances["node"] + + assert ( + node.query( + "SELECT COUNT() FROM mongodb('mongo1:27017', 'test', 'complex_table', 'root', 'clickhouse', structure='key UInt64, data String, dict Map(UInt64, String)')" + ) + == "100\n" + ) + assert ( + node.query( + "SELECT sum(key) FROM mongodb('mongo1:27017', 'test', 'complex_table', 'root', 'clickhouse', structure='key UInt64, data String, dict Map(UInt64, String)')" + ) + == str(sum(range(0, 100))) + "\n" + ) + + assert ( + node.query( + "SELECT data from mongodb('mongo1:27017', 'test', 'complex_table', 'root', 'clickhouse', structure='key UInt64, data String, dict Map(UInt64, String)') where key = 42" + ) + == hex(42 * 42) + "\n" + ) + incomplete_mongo_table.drop() + + +@pytest.mark.parametrize("started_cluster", [False], indirect=["started_cluster"]) +def test_incorrect_data_type(started_cluster): + mongo_connection = get_mongo_connection(started_cluster) + db = mongo_connection["test"] + db.add_user("root", "clickhouse") + strange_mongo_table = db["strange_table"] + data = [] + for i in range(0, 100): + data.append({"key": i, "data": hex(i * i), "aaaa": "Hello"}) + strange_mongo_table.insert_many(data) + + node = started_cluster.instances["node"] + + with pytest.raises(QueryRuntimeException): + node.query( + "SELECT aaaa FROM mongodb('mongo1:27017', 'test', 'strange_table', 'root', 'clickhouse', structure='key UInt64, data String')" + ) + + strange_mongo_table.drop() + + +@pytest.mark.parametrize("started_cluster", [True], indirect=["started_cluster"]) +def test_secure_connection(started_cluster): + mongo_connection = get_mongo_connection(started_cluster, secure=True) + db = mongo_connection["test"] + db.add_user("root", "clickhouse") + simple_mongo_table = db["simple_table"] + data = [] + for i in range(0, 100): + data.append({"key": i, "data": hex(i * i)}) + simple_mongo_table.insert_many(data) + + node = started_cluster.instances["node"] + + assert ( + node.query( + "SELECT COUNT() FROM mongodb('mongo1:27017', 'test', 'simple_table', 'root', 'clickhouse', structure='key UInt64, data String', options='ssl=true')" + ) + == "100\n" + ) + assert ( + node.query( + "SELECT sum(key) FROM mongodb('mongo1:27017', 'test', 'simple_table', 'root', 'clickhouse', structure='key UInt64, data String', options='ssl=true')" + ) + == str(sum(range(0, 100))) + "\n" + ) + assert ( + node.query( + "SELECT sum(key) FROM mongodb('mongo1:27017', 'test', 'simple_table', 'root', 'clickhouse', 'key UInt64, data String', 'ssl=true')" + ) + == str(sum(range(0, 100))) + "\n" + ) + + assert ( + node.query( + "SELECT data from mongodb('mongo1:27017', 'test', 'simple_table', 'root', 'clickhouse', structure='key UInt64, data String', options='ssl=true') where key = 42" + ) + == hex(42 * 42) + "\n" + ) + simple_mongo_table.drop() + + +@pytest.mark.parametrize("started_cluster", [False], indirect=["started_cluster"]) +def test_predefined_connection_configuration(started_cluster): + mongo_connection = get_mongo_connection(started_cluster) + db = mongo_connection["test"] + db.add_user("root", "clickhouse") + simple_mongo_table = db["simple_table"] + data = [] + for i in range(0, 100): + data.append({"key": i, "data": hex(i * i)}) + simple_mongo_table.insert_many(data) + + node = started_cluster.instances["node"] + assert ( + node.query( + "SELECT count() FROM mongodb('mongo1:27017', 'test', 'simple_table', 'root', 'clickhouse', structure='key UInt64, data String')" + ) + == "100\n" + ) + simple_mongo_table.drop() + + +@pytest.mark.parametrize("started_cluster", [False], indirect=["started_cluster"]) +def test_no_credentials(started_cluster): + mongo_connection = get_mongo_connection(started_cluster, with_credentials=False) + db = mongo_connection["test"] + simple_mongo_table = db["simple_table"] + data = [] + for i in range(0, 100): + data.append({"key": i, "data": hex(i * i)}) + simple_mongo_table.insert_many(data) + + node = started_cluster.instances["node"] + assert ( + node.query( + "SELECT count() FROM mongodb('mongo2:27017', 'test', 'simple_table', '', '', structure='key UInt64, data String')" + ) + == "100\n" + ) + simple_mongo_table.drop() + + +@pytest.mark.parametrize("started_cluster", [False], indirect=["started_cluster"]) +def test_auth_source(started_cluster): + mongo_connection = get_mongo_connection(started_cluster, with_credentials=False) + admin_db = mongo_connection["admin"] + admin_db.add_user( + "root", + "clickhouse", + roles=[{"role": "userAdminAnyDatabase", "db": "admin"}, "readWriteAnyDatabase"], + ) + simple_mongo_table = admin_db["simple_table"] + data = [] + for i in range(0, 50): + data.append({"key": i, "data": hex(i * i)}) + simple_mongo_table.insert_many(data) + db = mongo_connection["test"] + simple_mongo_table = db["simple_table"] + data = [] + for i in range(0, 100): + data.append({"key": i, "data": hex(i * i)}) + simple_mongo_table.insert_many(data) + + node = started_cluster.instances["node"] + + node.query_and_get_error( + "SELECT count() FROM mongodb('mongo2:27017', 'test', 'simple_table', 'root', 'clickhouse', structure='key UInt64, data String')" + ) + + assert ( + node.query( + "SELECT count() FROM mongodb('mongo2:27017', 'test', 'simple_table', 'root', 'clickhouse', structure='key UInt64, data String', options='authSource=admin')" + ) + == "100\n" + ) + simple_mongo_table.drop() + + +@pytest.mark.parametrize("started_cluster", [False], indirect=["started_cluster"]) +def test_missing_columns(started_cluster): + mongo_connection = get_mongo_connection(started_cluster) + db = mongo_connection["test"] + db.add_user("root", "clickhouse") + simple_mongo_table = db["simple_table"] + data = [] + for i in range(0, 10): + data.append({"key": i, "data": hex(i * i)}) + for i in range(0, 10): + data.append({"key": i}) + simple_mongo_table.insert_many(data) + + node = started_cluster.instances["node"] + result = node.query( + "SELECT count() FROM mongodb('mongo1:27017', 'test', 'simple_table', 'root', 'clickhouse', structure='key UInt64, data Nullable(String)') WHERE isNull(data)" + ) + assert result == "10\n" + simple_mongo_table.drop() From 3281aec3357777845f3899d159c2282f07715073 Mon Sep 17 00:00:00 2001 From: JackyWoo Date: Thu, 25 May 2023 17:12:56 +0800 Subject: [PATCH 0272/1072] make some redis engine args optional --- src/Storages/StorageRedis.cpp | 30 +++++++++++++++++++++--------- src/Storages/StorageRedis.h | 2 +- 2 files changed, 22 insertions(+), 10 deletions(-) diff --git a/src/Storages/StorageRedis.cpp b/src/Storages/StorageRedis.cpp index 819ab01d733..0cc0e566d5c 100644 --- a/src/Storages/StorageRedis.cpp +++ b/src/Storages/StorageRedis.cpp @@ -166,6 +166,10 @@ SinkToStoragePtr StorageRedis::write( RedisConfiguration StorageRedis::getConfiguration(ASTs engine_args, ContextPtr context) { RedisConfiguration configuration; + configuration.db_index = 0; + configuration.password = ""; + configuration.storage_type = RedisStorageType::SIMPLE; + configuration.pool_size = 10; if (auto named_collection = tryGetNamedCollectionWithOverrides(engine_args, context)) { @@ -175,11 +179,15 @@ RedisConfiguration StorageRedis::getConfiguration(ASTs engine_args, ContextPtr c {}); configuration.host = named_collection->getAny({"host", "hostname"}); - configuration.port = static_cast(named_collection->get("port")); - configuration.password = named_collection->get("password"); - configuration.db_index = static_cast(named_collection->get({"db_index"})); - configuration.storage_type = parseStorageType(named_collection->getOrDefault("storage_type", "")); - configuration.pool_size = static_cast(named_collection->get("pool_size")); + configuration.port = static_cast(named_collection->getOrDefault("port", 6379)); + if (engine_args.size() > 1) + configuration.password = named_collection->get("password"); + if (engine_args.size() > 2) + configuration.db_index = static_cast(named_collection->get("db_index")); + if (engine_args.size() > 3) + configuration.storage_type = parseStorageType(named_collection->get("storage_type")); + if (engine_args.size() > 4) + configuration.pool_size = static_cast(named_collection->get("pool_size")); } else { @@ -191,10 +199,14 @@ RedisConfiguration StorageRedis::getConfiguration(ASTs engine_args, ContextPtr c configuration.host = parsed_host_port.first; configuration.port = parsed_host_port.second; - configuration.db_index = static_cast(checkAndGetLiteralArgument(engine_args[1], "db_index")); - configuration.password = checkAndGetLiteralArgument(engine_args[2], "password"); - configuration.storage_type = parseStorageType(checkAndGetLiteralArgument(engine_args[3], "storage_type")); - configuration.pool_size = static_cast(checkAndGetLiteralArgument(engine_args[4], "pool_size")); + if (engine_args.size() > 1) + configuration.db_index = static_cast(checkAndGetLiteralArgument(engine_args[1], "db_index")); + if (engine_args.size() > 2) + configuration.password = checkAndGetLiteralArgument(engine_args[2], "password"); + if (engine_args.size() > 3) + configuration.storage_type = parseStorageType(checkAndGetLiteralArgument(engine_args[3], "storage_type")); + if (engine_args.size() > 4) + configuration.pool_size = static_cast(checkAndGetLiteralArgument(engine_args[4], "pool_size")); } if (configuration.storage_type == RedisStorageType::UNKNOWN) diff --git a/src/Storages/StorageRedis.h b/src/Storages/StorageRedis.h index 2c6c6193982..1ae90b2d1ba 100644 --- a/src/Storages/StorageRedis.h +++ b/src/Storages/StorageRedis.h @@ -7,7 +7,7 @@ namespace DB { /* Implements storage in the Redis. - * Use ENGINE = Redis(host:port, db_index, password, storage_type, conn_pool_size); + * Use ENGINE = Redis(host:port, db_index, password, storage_type, pool_size); * Read only. * * Note If storage_type is From 3c2b44747299b7b87d7e1e1c4219409ade1d7c34 Mon Sep 17 00:00:00 2001 From: JackyWoo Date: Thu, 25 May 2023 17:29:22 +0800 Subject: [PATCH 0273/1072] move get configuration to RedisCommon --- src/Storages/RedisCommon.cpp | 57 ++++++++++++++++++++++- src/Storages/RedisCommon.h | 3 ++ src/Storages/StorageRedis.cpp | 54 --------------------- src/Storages/StorageRedis.h | 2 - src/TableFunctions/TableFunctionRedis.cpp | 2 +- 5 files changed, 59 insertions(+), 59 deletions(-) diff --git a/src/Storages/RedisCommon.cpp b/src/Storages/RedisCommon.cpp index fc789057019..0d33b9c7aa3 100644 --- a/src/Storages/RedisCommon.cpp +++ b/src/Storages/RedisCommon.cpp @@ -151,13 +151,13 @@ RedisArrayPtr getRedisHashMapKeys(const RedisConnectionPtr & connection, RedisAr RedisColumnType getRedisColumnType(RedisStorageType storage_type, const Names & all_columns, const String & column) { - String redis_col_key = all_columns.at(0); + const String & redis_col_key = all_columns.at(0); if (column == redis_col_key) return RedisColumnType::KEY; if (storage_type == RedisStorageType::HASH_MAP) { - String redis_col_field = all_columns.at(1); + const String & redis_col_field = all_columns.at(1); if (column == redis_col_field) return RedisColumnType::FIELD; else @@ -169,6 +169,59 @@ RedisColumnType getRedisColumnType(RedisStorageType storage_type, const Names & } } +RedisConfiguration getRedisConfiguration(const ASTs & engine_args, ContextPtr context) +{ + RedisConfiguration configuration; + configuration.db_index = 0; + configuration.password = ""; + configuration.storage_type = RedisStorageType::SIMPLE; + configuration.pool_size = 10; + + if (auto named_collection = tryGetNamedCollectionWithOverrides(engine_args, context)) + { + validateNamedCollection( + *named_collection, + ValidateKeysMultiset{"host", "port", "hostname", "password", "db_index", "storage_type", "pool_size"}, + {}); + + configuration.host = named_collection->getAny({"host", "hostname"}); + configuration.port = static_cast(named_collection->getOrDefault("port", 6379)); + if (engine_args.size() > 1) + configuration.password = named_collection->get("password"); + if (engine_args.size() > 2) + configuration.db_index = static_cast(named_collection->get("db_index")); + if (engine_args.size() > 3) + configuration.storage_type = parseStorageType(named_collection->get("storage_type")); + if (engine_args.size() > 4) + configuration.pool_size = static_cast(named_collection->get("pool_size")); + } + else + { + for (auto & engine_arg : engine_args) + engine_arg = evaluateConstantExpressionOrIdentifierAsLiteral(engine_arg, context); + + /// 6379 is the default Redis port. + auto parsed_host_port = parseAddress(checkAndGetLiteralArgument(engine_args[0], "host:port"), 6379); + + configuration.host = parsed_host_port.first; + configuration.port = parsed_host_port.second; + if (engine_args.size() > 1) + configuration.db_index = static_cast(checkAndGetLiteralArgument(engine_args[1], "db_index")); + if (engine_args.size() > 2) + configuration.password = checkAndGetLiteralArgument(engine_args[2], "password"); + if (engine_args.size() > 3) + configuration.storage_type = parseStorageType(checkAndGetLiteralArgument(engine_args[3], "storage_type")); + if (engine_args.size() > 4) + configuration.pool_size = static_cast(checkAndGetLiteralArgument(engine_args[4], "pool_size")); + } + + if (configuration.storage_type == RedisStorageType::UNKNOWN) + throw Exception(ErrorCodes::INVALID_REDIS_STORAGE_TYPE, "Invalid Redis storage type"); + + context->getRemoteHostFilter().checkHostAndPort(configuration.host, toString(configuration.port)); + return configuration; +} + void checkRedisTableStructure(const ColumnsDescription & columns, const RedisConfiguration & configuration) { /// TODO check data type diff --git a/src/Storages/RedisCommon.h b/src/Storages/RedisCommon.h index d68f2567248..30e771d2471 100644 --- a/src/Storages/RedisCommon.h +++ b/src/Storages/RedisCommon.h @@ -83,6 +83,9 @@ RedisArrayPtr getRedisHashMapKeys(const RedisConnectionPtr & connection, RedisAr /// HASH_MAP: all_columns must have 2 items and the first one is Redis key the second is field, the third is value. RedisColumnType getRedisColumnType(RedisStorageType storage_type, const Names & all_columns, const String & column); +/// parse redis table engine/function configuration from engine_args +RedisConfiguration getRedisConfiguration(const ASTs & engine_args, ContextPtr context); + /// checking Redis table/table-function when creating void checkRedisTableStructure(const ColumnsDescription & columns, const RedisConfiguration & configuration); diff --git a/src/Storages/StorageRedis.cpp b/src/Storages/StorageRedis.cpp index 0cc0e566d5c..5010aada8c4 100644 --- a/src/Storages/StorageRedis.cpp +++ b/src/Storages/StorageRedis.cpp @@ -162,60 +162,6 @@ SinkToStoragePtr StorageRedis::write( throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Write is unsupported for StorageRedis"); } -/// TODO make "password", "db_index", "storage_type", "pool_size" optional -RedisConfiguration StorageRedis::getConfiguration(ASTs engine_args, ContextPtr context) -{ - RedisConfiguration configuration; - configuration.db_index = 0; - configuration.password = ""; - configuration.storage_type = RedisStorageType::SIMPLE; - configuration.pool_size = 10; - - if (auto named_collection = tryGetNamedCollectionWithOverrides(engine_args, context)) - { - validateNamedCollection( - *named_collection, - ValidateKeysMultiset{"host", "port", "hostname", "password", "db_index", "storage_type", "pool_size"}, - {}); - - configuration.host = named_collection->getAny({"host", "hostname"}); - configuration.port = static_cast(named_collection->getOrDefault("port", 6379)); - if (engine_args.size() > 1) - configuration.password = named_collection->get("password"); - if (engine_args.size() > 2) - configuration.db_index = static_cast(named_collection->get("db_index")); - if (engine_args.size() > 3) - configuration.storage_type = parseStorageType(named_collection->get("storage_type")); - if (engine_args.size() > 4) - configuration.pool_size = static_cast(named_collection->get("pool_size")); - } - else - { - for (auto & engine_arg : engine_args) - engine_arg = evaluateConstantExpressionOrIdentifierAsLiteral(engine_arg, context); - - /// 6379 is the default Redis port. - auto parsed_host_port = parseAddress(checkAndGetLiteralArgument(engine_args[0], "host:port"), 6379); - - configuration.host = parsed_host_port.first; - configuration.port = parsed_host_port.second; - if (engine_args.size() > 1) - configuration.db_index = static_cast(checkAndGetLiteralArgument(engine_args[1], "db_index")); - if (engine_args.size() > 2) - configuration.password = checkAndGetLiteralArgument(engine_args[2], "password"); - if (engine_args.size() > 3) - configuration.storage_type = parseStorageType(checkAndGetLiteralArgument(engine_args[3], "storage_type")); - if (engine_args.size() > 4) - configuration.pool_size = static_cast(checkAndGetLiteralArgument(engine_args[4], "pool_size")); - } - - if (configuration.storage_type == RedisStorageType::UNKNOWN) - throw Exception(ErrorCodes::INVALID_REDIS_STORAGE_TYPE, "Invalid Redis storage type"); - - context->getRemoteHostFilter().checkHostAndPort(configuration.host, toString(configuration.port)); - return configuration; -} - void registerStorageRedis(StorageFactory & factory) { factory.registerStorage( diff --git a/src/Storages/StorageRedis.h b/src/Storages/StorageRedis.h index 1ae90b2d1ba..619a83f3851 100644 --- a/src/Storages/StorageRedis.h +++ b/src/Storages/StorageRedis.h @@ -40,8 +40,6 @@ public: const StorageMetadataPtr & /*metadata_snapshot*/, ContextPtr context) override; - static RedisConfiguration getConfiguration(ASTs engine_args, ContextPtr context); - private: StorageID table_id; RedisConfiguration configuration; diff --git a/src/TableFunctions/TableFunctionRedis.cpp b/src/TableFunctions/TableFunctionRedis.cpp index 9e4a39b1b85..cd08837aae2 100644 --- a/src/TableFunctions/TableFunctionRedis.cpp +++ b/src/TableFunctions/TableFunctionRedis.cpp @@ -74,7 +74,7 @@ void TableFunctionRedis::parseArguments(const ASTPtr & ast_function, ContextPtr "Table function 'Redis' requires from 5 parameters: " "redis('host:port', db_index, 'password', 'storage_type', 'pool_size')"); } - configuration = StorageRedis::getConfiguration(args, context); + configuration = getRedisConfiguration(args, context); } From ff961834d674a8ce099da43811eb2eedf3b0d011 Mon Sep 17 00:00:00 2001 From: JackyWoo Date: Thu, 25 May 2023 17:29:49 +0800 Subject: [PATCH 0274/1072] add tests to redis engine --- tests/integration/test_storage_redis/test.py | 151 +++++++++++++++++-- 1 file changed, 138 insertions(+), 13 deletions(-) diff --git a/tests/integration/test_storage_redis/test.py b/tests/integration/test_storage_redis/test.py index 4220563c229..af01c2e9ff1 100644 --- a/tests/integration/test_storage_redis/test.py +++ b/tests/integration/test_storage_redis/test.py @@ -1,6 +1,11 @@ +import time + import redis import pytest + +from helpers.client import QueryRuntimeException from helpers.cluster import ClickHouseCluster +from helpers.test_tools import TSV cluster = ClickHouseCluster(__file__) @@ -18,40 +23,160 @@ def started_cluster(): def get_redis_connection(db_id=0): client = redis.Redis( - host=node.name, port=started_cluster.redis_port, password="clickhouse", db=db_id + host='localhost', port=cluster.redis_port, password="clickhouse", db=db_id ) return client -def get_address(): - return node.name + started_cluster.redis_port +def get_address_for_ch(): + return cluster.redis_host + ':6379' + + +def drop_table(table): + node.query(f"DROP TABLE IF EXISTS {table} SYNC"); -@pytest.mark.parametrize("started_cluster") def test_storage_simple_select(started_cluster): client = get_redis_connection() - address = get_address() + address = get_address_for_ch() + + # clean all + client.flushall() + drop_table('test_storage_simple_select') data = {} for i in range(100): - data['key{}'.format(i)] = 'value{}'.format(i) + data[str(i)] = str(i) client.mset(data) + client.close() # create table node.query( f""" CREATE TABLE test_storage_simple_select( k String, - v String - ) Engine=Redis('{address}', 0, '','simple', 10) + v UInt32 + ) Engine=Redis('{address}', 0, 'clickhouse') """ ) - select_query = "SELECT k, v from test_storage_simple_select where k='0' FORMAT Values" - assert (node.query(select_query) == "('0','0')") + response = TSV.toMat(node.query("SELECT k, v from test_storage_simple_select where k='0' FORMAT TSV")) + assert (len(response) == 1) + assert (response[0] == ['0', '0']) - select_query = "SELECT * from test_storage_simple_select FORMAT Values" - assert (len(node.query(select_query)) == 100) - assert (node.query(select_query)[0] == "('0','0')") + response = TSV.toMat(node.query("SELECT * from test_storage_simple_select order by k FORMAT TSV")) + assert (len(response) == 100) + assert (response[0] == ['0', '0']) + + +def test_storage_hash_map_select(started_cluster): + client = get_redis_connection() + address = get_address_for_ch() + + # clean all + client.flushall() + drop_table('test_storage_hash_map_select') + + key = 'k' + data = {} + for i in range(100): + data[str(i)] = str(i) + + client.hset(key, mapping=data) + client.close() + + # create table + node.query( + f""" + CREATE TABLE test_storage_hash_map_select( + k String, + f String, + v UInt32 + ) Engine=Redis('{address}', 0, 'clickhouse','hash_map') + """ + ) + + response = TSV.toMat(node.query("SELECT k, f, v from test_storage_hash_map_select where f='0' FORMAT TSV")) + assert (len(response) == 1) + assert (response[0] == ['k', '0', '0']) + + response = TSV.toMat(node.query("SELECT * from test_storage_hash_map_select FORMAT TSV")) + assert (len(response) == 100) + assert (response[0] == ['k', '0', '0']) + + +def test_create_table(started_cluster): + address = get_address_for_ch() + + # simple creation + drop_table('test_create_table') + node.query( + f""" + CREATE TABLE test_create_table( + k String, + v UInt32 + ) Engine=Redis('{address}') + """ + ) + + # simple creation with full engine args + drop_table('test_create_table') + node.query( + f""" + CREATE TABLE test_create_table( + k String, + v UInt32 + ) Engine=Redis('{address}', 0, 'clickhouse','simple', 10) + """ + ) + + drop_table('test_create_table') + node.query( + f""" + CREATE TABLE test_create_table( + k String, + f String, + v UInt32 + ) Engine=Redis('{address}', 0, 'clickhouse','hash_map', 10) + """ + ) + + # illegal columns + drop_table('test_create_table') + with pytest.raises(QueryRuntimeException): + node.query( + f""" + CREATE TABLE test_create_table( + k String, + f String, + v UInt32 + ) Engine=Redis('{address}', 0, 'clickhouse','simple', 10) + """ + ) + + drop_table('test_create_table') + with pytest.raises(QueryRuntimeException): + node.query( + f""" + CREATE TABLE test_create_table( + k String, + f String, + v UInt32, + n UInt32 + ) Engine=Redis('{address}', 0, 'clickhouse','hash_map', 10) + """ + ) + + # illegal storage type + drop_table('test_create_table') + with pytest.raises(QueryRuntimeException): + node.query( + f""" + CREATE TABLE test_create_table( + k String, + v UInt32 + ) Engine=Redis('{address}', 0, 'clickhouse','not_exist', 10) + """ + ) From 8c822a7edfb919844a94a46485dfeebbf9caad57 Mon Sep 17 00:00:00 2001 From: JackyWoo Date: Fri, 26 May 2023 10:34:37 +0800 Subject: [PATCH 0275/1072] add tests to redis engine --- src/Storages/RedisCommon.cpp | 6 +- src/Storages/RedisCommon.h | 2 +- src/Storages/StorageRedis.cpp | 2 +- src/TableFunctions/TableFunctionRedis.cpp | 40 +- src/TableFunctions/TableFunctionRedis.h | 1 + tests/integration/test_storage_redis/test.py | 9 +- .../configs_secure/config.d/ssl_conf.xml | 8 - .../test_table_function_redis/test.py | 385 ++++++------------ 8 files changed, 166 insertions(+), 287 deletions(-) delete mode 100644 tests/integration/test_table_function_redis/configs_secure/config.d/ssl_conf.xml diff --git a/src/Storages/RedisCommon.cpp b/src/Storages/RedisCommon.cpp index 0d33b9c7aa3..86312f49f41 100644 --- a/src/Storages/RedisCommon.cpp +++ b/src/Storages/RedisCommon.cpp @@ -1,5 +1,9 @@ #include "RedisCommon.h" #include +#include +#include +#include +#include namespace DB { @@ -169,7 +173,7 @@ RedisColumnType getRedisColumnType(RedisStorageType storage_type, const Names & } } -RedisConfiguration getRedisConfiguration(const ASTs & engine_args, ContextPtr context) +RedisConfiguration getRedisConfiguration(ASTs & engine_args, ContextPtr context) { RedisConfiguration configuration; configuration.db_index = 0; diff --git a/src/Storages/RedisCommon.h b/src/Storages/RedisCommon.h index 30e771d2471..c378006f7a5 100644 --- a/src/Storages/RedisCommon.h +++ b/src/Storages/RedisCommon.h @@ -84,7 +84,7 @@ RedisArrayPtr getRedisHashMapKeys(const RedisConnectionPtr & connection, RedisAr RedisColumnType getRedisColumnType(RedisStorageType storage_type, const Names & all_columns, const String & column); /// parse redis table engine/function configuration from engine_args -RedisConfiguration getRedisConfiguration(const ASTs & engine_args, ContextPtr context); +RedisConfiguration getRedisConfiguration(ASTs & engine_args, ContextPtr context); /// checking Redis table/table-function when creating void checkRedisTableStructure(const ColumnsDescription & columns, const RedisConfiguration & configuration); diff --git a/src/Storages/StorageRedis.cpp b/src/Storages/StorageRedis.cpp index 5010aada8c4..cd1cd06b4c4 100644 --- a/src/Storages/StorageRedis.cpp +++ b/src/Storages/StorageRedis.cpp @@ -168,7 +168,7 @@ void registerStorageRedis(StorageFactory & factory) "Redis", [](const StorageFactory::Arguments & args) { - auto configuration = StorageRedis::getConfiguration(args.engine_args, args.getLocalContext()); + auto configuration = getRedisConfiguration(args.engine_args, args.getLocalContext()); checkRedisTableStructure(args.columns, configuration); diff --git a/src/TableFunctions/TableFunctionRedis.cpp b/src/TableFunctions/TableFunctionRedis.cpp index cd08837aae2..f90a30af8a1 100644 --- a/src/TableFunctions/TableFunctionRedis.cpp +++ b/src/TableFunctions/TableFunctionRedis.cpp @@ -11,6 +11,7 @@ #include #include #include +#include namespace DB @@ -43,19 +44,6 @@ StoragePtr TableFunctionRedis::executeImpl( /// TODO support user customized table structure ColumnsDescription TableFunctionRedis::getActualTableStructure(ContextPtr context) const { - /// generate table structure by storage type. - String structure; - switch (configuration->storage_type) - { - case RedisStorageType::SIMPLE: - structure = "key String, value String"; - break; - case RedisStorageType::HASH_MAP: - structure = "key String, field String, value String"; - break; - case RedisStorageType::UNKNOWN: - throw Exception(ErrorCodes::INVALID_REDIS_STORAGE_TYPE, "Invalid Redis storage type."); - } return parseColumnsListFromString(structure, context); } @@ -66,15 +54,25 @@ void TableFunctionRedis::parseArguments(const ASTPtr & ast_function, ContextPtr throw Exception(ErrorCodes::BAD_ARGUMENTS, "Table function 'redis' must have arguments."); ASTs & args = func_args.arguments->children; - - if (args.size() != 5) - { - throw Exception( - ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, - "Table function 'Redis' requires from 5 parameters: " - "redis('host:port', db_index, 'password', 'storage_type', 'pool_size')"); - } configuration = getRedisConfiguration(args, context); + + if (args.size() > 5) + structure = checkAndGetLiteralArgument(args[5], "structure"); + + if (structure.empty()) + { + switch (configuration->storage_type) + { + case RedisStorageType::SIMPLE: + structure = "key String, value String"; + break; + case RedisStorageType::HASH_MAP: + structure = "key String, field String, value String"; + break; + case RedisStorageType::UNKNOWN: + throw Exception(ErrorCodes::INVALID_REDIS_STORAGE_TYPE, "Invalid Redis storage type."); + } + } } diff --git a/src/TableFunctions/TableFunctionRedis.h b/src/TableFunctions/TableFunctionRedis.h index 5c6f483fda7..1328d54a2a6 100644 --- a/src/TableFunctions/TableFunctionRedis.h +++ b/src/TableFunctions/TableFunctionRedis.h @@ -24,6 +24,7 @@ private: void parseArguments(const ASTPtr & ast_function, ContextPtr context) override; std::optional configuration; + String structure; }; } diff --git a/tests/integration/test_storage_redis/test.py b/tests/integration/test_storage_redis/test.py index af01c2e9ff1..d4fbdaddd7f 100644 --- a/tests/integration/test_storage_redis/test.py +++ b/tests/integration/test_storage_redis/test.py @@ -1,5 +1,6 @@ import time +## sudo -H pip install redis import redis import pytest @@ -61,11 +62,11 @@ def test_storage_simple_select(started_cluster): """ ) - response = TSV.toMat(node.query("SELECT k, v from test_storage_simple_select where k='0' FORMAT TSV")) + response = TSV.toMat(node.query("SELECT k, v FROM test_storage_simple_select WHERE k='0' FORMAT TSV")) assert (len(response) == 1) assert (response[0] == ['0', '0']) - response = TSV.toMat(node.query("SELECT * from test_storage_simple_select order by k FORMAT TSV")) + response = TSV.toMat(node.query("SELECT * FROM test_storage_simple_select ORDER BY k FORMAT TSV")) assert (len(response) == 100) assert (response[0] == ['0', '0']) @@ -97,11 +98,11 @@ def test_storage_hash_map_select(started_cluster): """ ) - response = TSV.toMat(node.query("SELECT k, f, v from test_storage_hash_map_select where f='0' FORMAT TSV")) + response = TSV.toMat(node.query("SELECT k, f, v FROM test_storage_hash_map_select WHERE f='0' FORMAT TSV")) assert (len(response) == 1) assert (response[0] == ['k', '0', '0']) - response = TSV.toMat(node.query("SELECT * from test_storage_hash_map_select FORMAT TSV")) + response = TSV.toMat(node.query("SELECT * FROM test_storage_hash_map_select ORDER BY f FORMAT TSV")) assert (len(response) == 100) assert (response[0] == ['k', '0', '0']) diff --git a/tests/integration/test_table_function_redis/configs_secure/config.d/ssl_conf.xml b/tests/integration/test_table_function_redis/configs_secure/config.d/ssl_conf.xml deleted file mode 100644 index 3efe98e7045..00000000000 --- a/tests/integration/test_table_function_redis/configs_secure/config.d/ssl_conf.xml +++ /dev/null @@ -1,8 +0,0 @@ - - - - - none - - - diff --git a/tests/integration/test_table_function_redis/test.py b/tests/integration/test_table_function_redis/test.py index e0ad71b0079..e53022095c9 100644 --- a/tests/integration/test_table_function_redis/test.py +++ b/tests/integration/test_table_function_redis/test.py @@ -1,276 +1,159 @@ -import pymongo +import time +import redis import pytest -from helpers.client import QueryRuntimeException +from helpers.client import QueryRuntimeException from helpers.cluster import ClickHouseCluster +from helpers.test_tools import TSV + +cluster = ClickHouseCluster(__file__) + +node = cluster.add_instance("node", with_redis=True) @pytest.fixture(scope="module") -def started_cluster(request): +def started_cluster(): try: - cluster = ClickHouseCluster(__file__) - node = cluster.add_instance( - "node", - with_mongo=True, - main_configs=[ - "configs_secure/config.d/ssl_conf.xml", - ], - with_mongo_secure=request.param, - ) cluster.start() yield cluster finally: cluster.shutdown() -def get_mongo_connection(started_cluster, secure=False, with_credentials=True): - connection_str = "" - if with_credentials: - connection_str = "mongodb://root:clickhouse@localhost:{}".format( - started_cluster.mongo_port - ) - else: - connection_str = "mongodb://localhost:{}".format( - started_cluster.mongo_no_cred_port - ) - if secure: - connection_str += "/?tls=true&tlsAllowInvalidCertificates=true" - return pymongo.MongoClient(connection_str) - - -@pytest.mark.parametrize("started_cluster", [False], indirect=["started_cluster"]) -def test_simple_select(started_cluster): - mongo_connection = get_mongo_connection(started_cluster) - db = mongo_connection["test"] - db.add_user("root", "clickhouse") - simple_mongo_table = db["simple_table"] - - node = started_cluster.instances["node"] - for i in range(0, 100): - node.query( - "INSERT INTO FUNCTION mongodb('mongo1:27017', 'test', 'simple_table', 'root', 'clickhouse', structure='key UInt64, data String') (key, data) VALUES ({}, '{}')".format( - i, hex(i * i) - ) - ) - assert ( - node.query( - "SELECT COUNT() FROM mongodb('mongo1:27017', 'test', 'simple_table', 'root', 'clickhouse', structure='key UInt64, data String')" - ) - == "100\n" +def get_redis_connection(db_id=0): + client = redis.Redis( + host='localhost', port=cluster.redis_port, password="clickhouse", db=db_id ) - assert ( - node.query( - "SELECT sum(key) FROM mongodb('mongo1:27017', 'test', 'simple_table', 'root', 'clickhouse', structure='key UInt64, data String')" - ) - == str(sum(range(0, 100))) + "\n" - ) - assert ( - node.query( - "SELECT sum(key) FROM mongodb('mongo1:27017', 'test', 'simple_table', 'root', 'clickhouse', 'key UInt64, data String')" - ) - == str(sum(range(0, 100))) + "\n" - ) - - assert ( - node.query( - "SELECT data from mongodb('mongo1:27017', 'test', 'simple_table', 'root', 'clickhouse', structure='key UInt64, data String') where key = 42" - ) - == hex(42 * 42) + "\n" - ) - simple_mongo_table.drop() + return client -@pytest.mark.parametrize("started_cluster", [False], indirect=["started_cluster"]) -def test_complex_data_type(started_cluster): - mongo_connection = get_mongo_connection(started_cluster) - db = mongo_connection["test"] - db.add_user("root", "clickhouse") - incomplete_mongo_table = db["complex_table"] - data = [] - for i in range(0, 100): - data.append({"key": i, "data": hex(i * i), "dict": {"a": i, "b": str(i)}}) - incomplete_mongo_table.insert_many(data) - - node = started_cluster.instances["node"] - - assert ( - node.query( - "SELECT COUNT() FROM mongodb('mongo1:27017', 'test', 'complex_table', 'root', 'clickhouse', structure='key UInt64, data String, dict Map(UInt64, String)')" - ) - == "100\n" - ) - assert ( - node.query( - "SELECT sum(key) FROM mongodb('mongo1:27017', 'test', 'complex_table', 'root', 'clickhouse', structure='key UInt64, data String, dict Map(UInt64, String)')" - ) - == str(sum(range(0, 100))) + "\n" - ) - - assert ( - node.query( - "SELECT data from mongodb('mongo1:27017', 'test', 'complex_table', 'root', 'clickhouse', structure='key UInt64, data String, dict Map(UInt64, String)') where key = 42" - ) - == hex(42 * 42) + "\n" - ) - incomplete_mongo_table.drop() +def get_address_for_ch(): + return cluster.redis_host + ':6379' -@pytest.mark.parametrize("started_cluster", [False], indirect=["started_cluster"]) -def test_incorrect_data_type(started_cluster): - mongo_connection = get_mongo_connection(started_cluster) - db = mongo_connection["test"] - db.add_user("root", "clickhouse") - strange_mongo_table = db["strange_table"] - data = [] - for i in range(0, 100): - data.append({"key": i, "data": hex(i * i), "aaaa": "Hello"}) - strange_mongo_table.insert_many(data) +def test_storage_simple(started_cluster): + client = get_redis_connection() + address = get_address_for_ch() - node = started_cluster.instances["node"] + # clean all + client.flushall() + data = {} + for i in range(100): + data[str(i)] = str(i) + + client.mset(data) + client.close() + + response = TSV.toMat(node.query( + f""" + SELECT + key, value + FROM + redis('{address}', 0, 'clickhouse') + WHERE + key='0' + FORMAT TSV + """)) + + assert (len(response) == 1) + assert (response[0] == ['0', '0']) + + response = TSV.toMat(node.query( + f""" + SELECT + * + FROM + redis('{address}', 0, 'clickhouse') + ORDER BY + key + FORMAT TSV + """)) + + assert (len(response) == 100) + assert (response[0] == ['0', '0']) + + +def test_storage_hash_map(started_cluster): + client = get_redis_connection() + address = get_address_for_ch() + + # clean all + client.flushall() + + key = 'k' + data = {} + for i in range(100): + data[str(i)] = str(i) + + client.hset(key, mapping=data) + client.close() + + response = TSV.toMat(node.query( + f""" + SELECT + key, field, value + FROM + redis('{address}', 0, 'clickhouse','hash_map') + WHERE + field='0' + FORMAT TSV + """)) + + assert (len(response) == 1) + assert (response[0] == ['k', '0', '0']) + + response = TSV.toMat(node.query( + f""" + SELECT + * + FROM + redis('{address}', 0, 'clickhouse','hash_map') + ORDER BY + field + FORMAT TSV + """)) + + assert (len(response) == 100) + assert (response[0] == ['k', '0', '0']) + + +def test_customized_table_structure(started_cluster): + address = get_address_for_ch() + + node.query( + f""" + SELECT + * + FROM + redis('{address}', 0, 'clickhouse', "simple", 10, "k String, v UInt8") + """) + + node.query( + f""" + SELECT + * + FROM + redis('{address}', 0, 'clickhouse', "hash_map", 10, "k String, f UInt8, v String") + """) + + # illegal columns with pytest.raises(QueryRuntimeException): node.query( - "SELECT aaaa FROM mongodb('mongo1:27017', 'test', 'strange_table', 'root', 'clickhouse', structure='key UInt64, data String')" - ) + f""" + SELECT + * + FROM + redis('{address}', 0, 'clickhouse', "hash_map", 10, "k String, v String") + """) - strange_mongo_table.drop() - - -@pytest.mark.parametrize("started_cluster", [True], indirect=["started_cluster"]) -def test_secure_connection(started_cluster): - mongo_connection = get_mongo_connection(started_cluster, secure=True) - db = mongo_connection["test"] - db.add_user("root", "clickhouse") - simple_mongo_table = db["simple_table"] - data = [] - for i in range(0, 100): - data.append({"key": i, "data": hex(i * i)}) - simple_mongo_table.insert_many(data) - - node = started_cluster.instances["node"] - - assert ( + # illegal data type + with pytest.raises(QueryRuntimeException): node.query( - "SELECT COUNT() FROM mongodb('mongo1:27017', 'test', 'simple_table', 'root', 'clickhouse', structure='key UInt64, data String', options='ssl=true')" - ) - == "100\n" - ) - assert ( - node.query( - "SELECT sum(key) FROM mongodb('mongo1:27017', 'test', 'simple_table', 'root', 'clickhouse', structure='key UInt64, data String', options='ssl=true')" - ) - == str(sum(range(0, 100))) + "\n" - ) - assert ( - node.query( - "SELECT sum(key) FROM mongodb('mongo1:27017', 'test', 'simple_table', 'root', 'clickhouse', 'key UInt64, data String', 'ssl=true')" - ) - == str(sum(range(0, 100))) + "\n" - ) - - assert ( - node.query( - "SELECT data from mongodb('mongo1:27017', 'test', 'simple_table', 'root', 'clickhouse', structure='key UInt64, data String', options='ssl=true') where key = 42" - ) - == hex(42 * 42) + "\n" - ) - simple_mongo_table.drop() - - -@pytest.mark.parametrize("started_cluster", [False], indirect=["started_cluster"]) -def test_predefined_connection_configuration(started_cluster): - mongo_connection = get_mongo_connection(started_cluster) - db = mongo_connection["test"] - db.add_user("root", "clickhouse") - simple_mongo_table = db["simple_table"] - data = [] - for i in range(0, 100): - data.append({"key": i, "data": hex(i * i)}) - simple_mongo_table.insert_many(data) - - node = started_cluster.instances["node"] - assert ( - node.query( - "SELECT count() FROM mongodb('mongo1:27017', 'test', 'simple_table', 'root', 'clickhouse', structure='key UInt64, data String')" - ) - == "100\n" - ) - simple_mongo_table.drop() - - -@pytest.mark.parametrize("started_cluster", [False], indirect=["started_cluster"]) -def test_no_credentials(started_cluster): - mongo_connection = get_mongo_connection(started_cluster, with_credentials=False) - db = mongo_connection["test"] - simple_mongo_table = db["simple_table"] - data = [] - for i in range(0, 100): - data.append({"key": i, "data": hex(i * i)}) - simple_mongo_table.insert_many(data) - - node = started_cluster.instances["node"] - assert ( - node.query( - "SELECT count() FROM mongodb('mongo2:27017', 'test', 'simple_table', '', '', structure='key UInt64, data String')" - ) - == "100\n" - ) - simple_mongo_table.drop() - - -@pytest.mark.parametrize("started_cluster", [False], indirect=["started_cluster"]) -def test_auth_source(started_cluster): - mongo_connection = get_mongo_connection(started_cluster, with_credentials=False) - admin_db = mongo_connection["admin"] - admin_db.add_user( - "root", - "clickhouse", - roles=[{"role": "userAdminAnyDatabase", "db": "admin"}, "readWriteAnyDatabase"], - ) - simple_mongo_table = admin_db["simple_table"] - data = [] - for i in range(0, 50): - data.append({"key": i, "data": hex(i * i)}) - simple_mongo_table.insert_many(data) - db = mongo_connection["test"] - simple_mongo_table = db["simple_table"] - data = [] - for i in range(0, 100): - data.append({"key": i, "data": hex(i * i)}) - simple_mongo_table.insert_many(data) - - node = started_cluster.instances["node"] - - node.query_and_get_error( - "SELECT count() FROM mongodb('mongo2:27017', 'test', 'simple_table', 'root', 'clickhouse', structure='key UInt64, data String')" - ) - - assert ( - node.query( - "SELECT count() FROM mongodb('mongo2:27017', 'test', 'simple_table', 'root', 'clickhouse', structure='key UInt64, data String', options='authSource=admin')" - ) - == "100\n" - ) - simple_mongo_table.drop() - - -@pytest.mark.parametrize("started_cluster", [False], indirect=["started_cluster"]) -def test_missing_columns(started_cluster): - mongo_connection = get_mongo_connection(started_cluster) - db = mongo_connection["test"] - db.add_user("root", "clickhouse") - simple_mongo_table = db["simple_table"] - data = [] - for i in range(0, 10): - data.append({"key": i, "data": hex(i * i)}) - for i in range(0, 10): - data.append({"key": i}) - simple_mongo_table.insert_many(data) - - node = started_cluster.instances["node"] - result = node.query( - "SELECT count() FROM mongodb('mongo1:27017', 'test', 'simple_table', 'root', 'clickhouse', structure='key UInt64, data Nullable(String)') WHERE isNull(data)" - ) - assert result == "10\n" - simple_mongo_table.drop() + f""" + SELECT + * + FROM + redis('{address}', 0, 'clickhouse', "simple", 10, "k Ss, v String") + """) From 357df40c8f6af947223fc54360340e86016e4eae Mon Sep 17 00:00:00 2001 From: JackyWoo Date: Mon, 29 May 2023 15:22:29 +0800 Subject: [PATCH 0276/1072] fix tests --- src/Dictionaries/RedisSource.cpp | 8 ++- src/Storages/StorageRedis.cpp | 14 ++++- tests/integration/test_storage_redis/test.py | 1 + .../test_table_function_redis/test.py | 60 +++++++++++++++++++ 4 files changed, 78 insertions(+), 5 deletions(-) diff --git a/src/Dictionaries/RedisSource.cpp b/src/Dictionaries/RedisSource.cpp index 27125077c10..261242c627f 100644 --- a/src/Dictionaries/RedisSource.cpp +++ b/src/Dictionaries/RedisSource.cpp @@ -27,14 +27,18 @@ namespace DB const RedisStorageType & storage_type_, const DB::Block & sample_block, size_t max_block_size_) - : ISource(sample_block), max_block_size(max_block_size_)// TODO + : ISource(sample_block) + , connection(std::move(connection_)) + , keys(keys_) + , storage_type(storage_type_) + , max_block_size{max_block_size_} { RedisColumnTypes columns_types_; if (storage_type_ == RedisStorageType::HASH_MAP) columns_types_ = REDIS_HASH_MAP_COLUMN_TYPES; else columns_types_ = REDIS_SIMPLE_COLUMN_TYPES; - RedisSource(std::move(connection_), keys_, storage_type_, sample_block, columns_types_, max_block_size_); + description.init(sample_block); } RedisSource::RedisSource( diff --git a/src/Storages/StorageRedis.cpp b/src/Storages/StorageRedis.cpp index cd1cd06b4c4..e670012d060 100644 --- a/src/Storages/StorageRedis.cpp +++ b/src/Storages/StorageRedis.cpp @@ -75,9 +75,8 @@ Pipe StorageRedis::read( if (all_scan) { + /// TODO use scan to avoid performance issue RedisCommand command_for_keys("KEYS"); - /// generate keys by table name prefix -// command_for_keys << table_id.getTableName() + ":" + serializeStorageType(configuration.storage_type) + ":*"; command_for_keys << "*"; auto all_keys = connection->client->execute(command_for_keys); @@ -136,7 +135,16 @@ Pipe StorageRedis::read( RedisArray keys; for (size_t pos=begin; posat(pos).get()); + { + if (WhichDataType(*primary_key_data_type).isStringOrFixedString()) + { + keys.add(fields->at(pos).get()); + } + else + { + keys.add(toString(fields->at(pos))); /// TODO redis source deserialize + } + } if (configuration.storage_type == RedisStorageType::HASH_MAP) { diff --git a/tests/integration/test_storage_redis/test.py b/tests/integration/test_storage_redis/test.py index d4fbdaddd7f..19e7b4e5340 100644 --- a/tests/integration/test_storage_redis/test.py +++ b/tests/integration/test_storage_redis/test.py @@ -181,3 +181,4 @@ def test_create_table(started_cluster): """ ) + diff --git a/tests/integration/test_table_function_redis/test.py b/tests/integration/test_table_function_redis/test.py index e53022095c9..7c342690027 100644 --- a/tests/integration/test_table_function_redis/test.py +++ b/tests/integration/test_table_function_redis/test.py @@ -157,3 +157,63 @@ def test_customized_table_structure(started_cluster): FROM redis('{address}', 0, 'clickhouse', "simple", 10, "k Ss, v String") """) + + +def test_data_type(started_cluster): + client = get_redis_connection() + address = get_address_for_ch() + + # string + client.flushall() + client.set('0', '0') + + response = TSV.toMat(node.query( + f""" + SELECT + * + FROM + redis('{address}', 0, 'clickhouse', 'simple', 10, "k String, v UInt8") + WHERE + k='0' + FORMAT TSV + """)) + + assert (len(response) == 1) + assert (response[0] == ['0', '0']) + + # number + client.flushall() + client.set('0', '0') + + response = TSV.toMat(node.query( + f""" + SELECT + * + FROM + redis('{address}', 0, 'clickhouse', 'simple', 10, "k UInt8, v UInt8") + WHERE + k=0 + FORMAT TSV + """)) + + assert (len(response) == 1) + assert (response[0] == ['0', '0']) + + # datetime + client.flushall() + client.set('2023-06-01 00:00:00', '0') + + response = TSV.toMat(node.query( + f""" + SELECT + * + FROM + redis('{address}', 0, 'clickhouse', 'simple', 10, "k DateTime, v UInt8") + WHERE + k='2023-06-01 00:00:00' + FORMAT TSV + """)) + + # TODO open + # assert (len(response) == 1) + # assert (response[0] == ['2023-06-01 00:00:00', '0']) From f4f939162dcd3b6814a9e4a288f5e0a0538ae283 Mon Sep 17 00:00:00 2001 From: JackyWoo Date: Tue, 30 May 2023 20:31:23 +0800 Subject: [PATCH 0277/1072] new redis engine schema design --- src/Dictionaries/RedisDictionarySource.cpp | 4 +- src/Dictionaries/RedisSource.cpp | 58 +-- src/Dictionaries/RedisSource.h | 10 - src/Storages/KVStorageUtils.cpp | 2 +- src/Storages/RedisCommon.cpp | 91 +--- src/Storages/RedisCommon.h | 33 +- src/Storages/StorageFactory.h | 3 +- src/Storages/StorageRedis.cpp | 425 +++++++++++++----- src/Storages/StorageRedis.h | 43 +- src/TableFunctions/TableFunctionRedis.cpp | 60 ++- src/TableFunctions/TableFunctionRedis.h | 6 +- tests/integration/test_storage_redis/test.py | 101 +++-- .../test_table_function_redis/test.py | 141 +++--- 13 files changed, 535 insertions(+), 442 deletions(-) diff --git a/src/Dictionaries/RedisDictionarySource.cpp b/src/Dictionaries/RedisDictionarySource.cpp index d28b7528d23..c52c3425d1b 100644 --- a/src/Dictionaries/RedisDictionarySource.cpp +++ b/src/Dictionaries/RedisDictionarySource.cpp @@ -40,10 +40,10 @@ namespace DB { .host = host, .port = static_cast(port), - .db_index = config.getUInt(redis_config_prefix + ".db_index", 0), + .db_index = config.getUInt(redis_config_prefix + ".db_index", DEFAULT_REDIS_DB_INDEX), .password = config.getString(redis_config_prefix + ".password", ""), .storage_type = parseStorageType(config.getString(redis_config_prefix + ".storage_type", "")), - .pool_size = config.getUInt(redis_config_prefix + ".pool_size", 16), + .pool_size = config.getUInt(redis_config_prefix + ".pool_size", DEFAULT_REDIS_POOL_SIZE), }; return std::make_unique(dict_struct, configuration, sample_block); diff --git a/src/Dictionaries/RedisSource.cpp b/src/Dictionaries/RedisSource.cpp index 261242c627f..5d8a475cad4 100644 --- a/src/Dictionaries/RedisSource.cpp +++ b/src/Dictionaries/RedisSource.cpp @@ -21,7 +21,7 @@ namespace DB } - RedisSource::RedisSource( + RedisSource::RedisSource( RedisConnectionPtr connection_, const RedisArray & keys_, const RedisStorageType & storage_type_, @@ -32,28 +32,6 @@ namespace DB , keys(keys_) , storage_type(storage_type_) , max_block_size{max_block_size_} - { - RedisColumnTypes columns_types_; - if (storage_type_ == RedisStorageType::HASH_MAP) - columns_types_ = REDIS_HASH_MAP_COLUMN_TYPES; - else - columns_types_ = REDIS_SIMPLE_COLUMN_TYPES; - description.init(sample_block); - } - - RedisSource::RedisSource( - RedisConnectionPtr connection_, - const RedisArray & keys_, - const RedisStorageType & storage_type_, - const DB::Block & sample_block, - const RedisColumnTypes & columns_types_, - size_t max_block_size_) - : ISource(sample_block) - , connection(std::move(connection_)) - , keys(keys_) - , storage_type(storage_type_) - , max_block_size{max_block_size_} - , columns_types(columns_types_) { description.init(sample_block); } @@ -192,27 +170,15 @@ namespace DB const auto & primary_key = keys_array.get(0); for (size_t i = 0; i < values.size(); ++i) { - const auto & value = values.get(i); const auto & secondary_key = keys_array.get(i + 1); + const auto & value = values.get(i); /// null string means 'no value for requested key' if (!value.isNull()) { - for (size_t idx=0; idx serializeKeysToRawString(const ColumnWithTypeAndName & return result; } -/// In current implementation rocks db can have key with only one column. +/// In current implementation rocks db/redis can have key with only one column. size_t getPrimaryKeyPos(const Block & header, const Names & primary_key) { if (primary_key.size() != 1) diff --git a/src/Storages/RedisCommon.cpp b/src/Storages/RedisCommon.cpp index 86312f49f41..ba7c02fdac5 100644 --- a/src/Storages/RedisCommon.cpp +++ b/src/Storages/RedisCommon.cpp @@ -1,9 +1,7 @@ #include "RedisCommon.h" #include #include -#include #include -#include namespace DB { @@ -13,12 +11,10 @@ namespace ErrorCodes extern const int INVALID_REDIS_TABLE_STRUCTURE; extern const int INTERNAL_REDIS_ERROR; extern const int TIMEOUT_EXCEEDED; + extern const int BAD_ARGUMENTS; extern const int INVALID_REDIS_STORAGE_TYPE; } -RedisColumnTypes REDIS_HASH_MAP_COLUMN_TYPES = {RedisColumnType::KEY, RedisColumnType::FIELD, RedisColumnType::VALUE}; -RedisColumnTypes REDIS_SIMPLE_COLUMN_TYPES = {RedisColumnType::KEY, RedisColumnType::VALUE}; - RedisConnection::RedisConnection(RedisPoolPtr pool_, RedisClientPtr client_) : pool(std::move(pool_)), client(std::move(client_)) { @@ -153,89 +149,4 @@ RedisArrayPtr getRedisHashMapKeys(const RedisConnectionPtr & connection, RedisAr return hkeys; } -RedisColumnType getRedisColumnType(RedisStorageType storage_type, const Names & all_columns, const String & column) -{ - const String & redis_col_key = all_columns.at(0); - if (column == redis_col_key) - return RedisColumnType::KEY; - - if (storage_type == RedisStorageType::HASH_MAP) - { - const String & redis_col_field = all_columns.at(1); - if (column == redis_col_field) - return RedisColumnType::FIELD; - else - return RedisColumnType::VALUE; - } - else - { - return RedisColumnType::VALUE; - } -} - -RedisConfiguration getRedisConfiguration(ASTs & engine_args, ContextPtr context) -{ - RedisConfiguration configuration; - configuration.db_index = 0; - configuration.password = ""; - configuration.storage_type = RedisStorageType::SIMPLE; - configuration.pool_size = 10; - - if (auto named_collection = tryGetNamedCollectionWithOverrides(engine_args, context)) - { - validateNamedCollection( - *named_collection, - ValidateKeysMultiset{"host", "port", "hostname", "password", "db_index", "storage_type", "pool_size"}, - {}); - - configuration.host = named_collection->getAny({"host", "hostname"}); - configuration.port = static_cast(named_collection->getOrDefault("port", 6379)); - if (engine_args.size() > 1) - configuration.password = named_collection->get("password"); - if (engine_args.size() > 2) - configuration.db_index = static_cast(named_collection->get("db_index")); - if (engine_args.size() > 3) - configuration.storage_type = parseStorageType(named_collection->get("storage_type")); - if (engine_args.size() > 4) - configuration.pool_size = static_cast(named_collection->get("pool_size")); - } - else - { - for (auto & engine_arg : engine_args) - engine_arg = evaluateConstantExpressionOrIdentifierAsLiteral(engine_arg, context); - - /// 6379 is the default Redis port. - auto parsed_host_port = parseAddress(checkAndGetLiteralArgument(engine_args[0], "host:port"), 6379); - - configuration.host = parsed_host_port.first; - configuration.port = parsed_host_port.second; - if (engine_args.size() > 1) - configuration.db_index = static_cast(checkAndGetLiteralArgument(engine_args[1], "db_index")); - if (engine_args.size() > 2) - configuration.password = checkAndGetLiteralArgument(engine_args[2], "password"); - if (engine_args.size() > 3) - configuration.storage_type = parseStorageType(checkAndGetLiteralArgument(engine_args[3], "storage_type")); - if (engine_args.size() > 4) - configuration.pool_size = static_cast(checkAndGetLiteralArgument(engine_args[4], "pool_size")); - } - - if (configuration.storage_type == RedisStorageType::UNKNOWN) - throw Exception(ErrorCodes::INVALID_REDIS_STORAGE_TYPE, "Invalid Redis storage type"); - - context->getRemoteHostFilter().checkHostAndPort(configuration.host, toString(configuration.port)); - return configuration; -} - -void checkRedisTableStructure(const ColumnsDescription & columns, const RedisConfiguration & configuration) -{ - /// TODO check data type - if (configuration.storage_type == RedisStorageType::HASH_MAP && columns.size() != 3) - throw Exception(ErrorCodes::INVALID_REDIS_TABLE_STRUCTURE, - "Redis hash table must have 3 columns, but found {}", columns.size()); - - if (configuration.storage_type == RedisStorageType::SIMPLE && columns.size() != 2) - throw Exception(ErrorCodes::INVALID_REDIS_TABLE_STRUCTURE, - "Redis string table must have 2 columns, but found {}", columns.size()); -} - } diff --git a/src/Storages/RedisCommon.h b/src/Storages/RedisCommon.h index c378006f7a5..cb551a9a11a 100644 --- a/src/Storages/RedisCommon.h +++ b/src/Storages/RedisCommon.h @@ -8,6 +8,7 @@ #include #include #include +#include namespace DB { @@ -21,20 +22,6 @@ enum class RedisStorageType UNKNOWN }; -enum class RedisColumnType -{ - /// Redis key - KEY, - /// Redis hash field - FIELD, - /// Redis value - VALUE -}; - -using RedisColumnTypes = std::vector; - -extern RedisColumnTypes REDIS_HASH_MAP_COLUMN_TYPES; -extern RedisColumnTypes REDIS_SIMPLE_COLUMN_TYPES; /// storage type to Redis key type String storageTypeToKeyType(RedisStorageType type); @@ -52,6 +39,10 @@ struct RedisConfiguration uint32_t pool_size; }; +static uint32_t DEFAULT_REDIS_DB_INDEX = 0; +static uint32_t DEFAULT_REDIS_POOL_SIZE = 16; +static String DEFAULT_REDIS_PASSWORD; + using RedisArray = Poco::Redis::Array; using RedisArrayPtr = std::shared_ptr; using RedisCommand = Poco::Redis::Command; @@ -61,6 +52,9 @@ using RedisClientPtr = std::unique_ptr; using RedisPool = BorrowedObjectPool; using RedisPoolPtr = std::shared_ptr; +/// Redis scan interator +using RedisIterator = int64_t; + struct RedisConnection { RedisConnection(RedisPoolPtr pool_, RedisClientPtr client_); @@ -78,15 +72,4 @@ RedisConnectionPtr getRedisConnection(RedisPoolPtr pool, const RedisConfiguratio /// eg: keys -> [key1, key2] and get [[key1, field1, field2], [key2, field1, field2]] RedisArrayPtr getRedisHashMapKeys(const RedisConnectionPtr & connection, RedisArray & keys); -/// Get RedisColumnType of a column, If storage_type is -/// SIMPLE: all_columns must have 2 items and the first one is Redis key the second one is value -/// HASH_MAP: all_columns must have 2 items and the first one is Redis key the second is field, the third is value. -RedisColumnType getRedisColumnType(RedisStorageType storage_type, const Names & all_columns, const String & column); - -/// parse redis table engine/function configuration from engine_args -RedisConfiguration getRedisConfiguration(ASTs & engine_args, ContextPtr context); - -/// checking Redis table/table-function when creating -void checkRedisTableStructure(const ColumnsDescription & columns, const RedisConfiguration & configuration); - } diff --git a/src/Storages/StorageFactory.h b/src/Storages/StorageFactory.h index 77309541374..f1c1c237393 100644 --- a/src/Storages/StorageFactory.h +++ b/src/Storages/StorageFactory.h @@ -2,6 +2,7 @@ #include #include +#include #include #include #include @@ -14,8 +15,6 @@ namespace DB { class Context; -class ASTCreateQuery; -class ASTStorage; struct StorageID; diff --git a/src/Storages/StorageRedis.cpp b/src/Storages/StorageRedis.cpp index e670012d060..ceed448b4a7 100644 --- a/src/Storages/StorageRedis.cpp +++ b/src/Storages/StorageRedis.cpp @@ -1,11 +1,9 @@ #include #include #include -#include #include #include -#include #include #include #include @@ -13,32 +11,142 @@ #include #include #include +#include +#include +#include +#include +#include namespace DB { namespace ErrorCodes { - extern const int INVALID_REDIS_STORAGE_TYPE; extern const int NOT_IMPLEMENTED; } +class RedisDataSource : public ISource +{ +public: + RedisDataSource( + StorageRedis & storage_, + const Block & header, + FieldVectorPtr keys_, + FieldVector::const_iterator begin_, + FieldVector::const_iterator end_, + const size_t max_block_size_) + : ISource(header) + , storage(storage_) + , primary_key_pos(getPrimaryKeyPos(header, storage.getPrimaryKey())) + , keys(keys_) + , begin(begin_) + , end(end_) + , it(begin) + , max_block_size(max_block_size_) + { + } + + RedisDataSource( + StorageRedis & storage_, + const Block & header, + const size_t max_block_size_, + const String & pattern_ = "*") + : ISource(header) + , storage(storage_) + , primary_key_pos(getPrimaryKeyPos(header, storage.getPrimaryKey())) + , iterator(-1) + , pattern(pattern_) + , max_block_size(max_block_size_) + { + } + + String getName() const override { return storage.getName(); } + + Chunk generate() override + { + if (keys) + return generateWithKeys(); + return generateFullScan(); + } + + Chunk generateWithKeys() + { + const auto & sample_block = getPort().getHeader(); + if (it >= end) + { + it = {}; + return {}; + } + + const auto & key_column_type = sample_block.getByName(storage.getPrimaryKey().at(0)).type; + auto raw_keys = serializeKeysToRawString(it, end, key_column_type, max_block_size); + return storage.getBySerializedKeys(raw_keys, nullptr); + } + + Chunk generateFullScan() + { + /// redis scan ending + if (iterator == 0) + return {}; + + RedisArray scan_keys; + RedisIterator next_iterator; + + std::tie(next_iterator, scan_keys) = storage.scan(iterator == -1 ? 0 : iterator, pattern, max_block_size); + iterator = next_iterator; + + /// redis scan can return nothing + if (scan_keys.isNull() || scan_keys.size() == 0) + return generateFullScan(); + + const auto & sample_block = getPort().getHeader(); + MutableColumns columns = sample_block.cloneEmptyColumns(); + + RedisArray values = storage.multiGet(scan_keys); + for (size_t i = 0; i(i).isNull(); i++) + { + fillColumns(scan_keys.get(i).value(), + values.get(i).value(), + primary_key_pos, sample_block, columns + ); + } + + Block block = sample_block.cloneWithColumns(std::move(columns)); + return Chunk(block.getColumns(), block.rows()); + } + +private: + StorageRedis & storage; + + size_t primary_key_pos; + + /// For key scan + FieldVectorPtr keys = nullptr; + FieldVector::const_iterator begin; + FieldVector::const_iterator end; + FieldVector::const_iterator it; + + /// For full scan + RedisIterator iterator; + String pattern; + + const size_t max_block_size; +}; + StorageRedis::StorageRedis( const StorageID & table_id_, const RedisConfiguration & configuration_, - const ColumnsDescription & columns_, - const ConstraintsDescription & constraints_, - const String & comment_) + ContextPtr context_, + const StorageInMemoryMetadata & storage_metadata, + const String & primary_key_) : IStorage(table_id_) + , WithContext(context_->getGlobalContext()) , table_id(table_id_) , configuration(configuration_) , log(&Poco::Logger::get("StorageRedis")) + , primary_key(primary_key_) { pool = std::make_shared(configuration.pool_size); - StorageInMemoryMetadata storage_metadata; - storage_metadata.setColumns(columns_); - storage_metadata.setConstraints(constraints_); - storage_metadata.setComment(comment_); setInMemoryMetadata(storage_metadata); } @@ -46,84 +154,37 @@ Pipe StorageRedis::read( const Names & column_names, const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, - ContextPtr context, + ContextPtr context_, QueryProcessingStage::Enum /*processed_stage*/, size_t max_block_size, size_t num_streams) { - auto connection = getRedisConnection(pool, configuration); storage_snapshot->check(column_names); - Block sample_block; - RedisColumnTypes redis_types; - auto all_columns = storage_snapshot->metadata->getColumns().getNamesOfPhysical(); - - for (const String & column_name : column_names) - { - auto column_data = storage_snapshot->metadata->getColumns().getPhysical(column_name); - sample_block.insert({column_data.type, column_data.name}); - redis_types.push_back(getRedisColumnType(configuration.storage_type, all_columns, column_name)); - } - - FieldVectorPtr fields; + FieldVectorPtr keys; bool all_scan = false; - String primary_key = all_columns.at(0); - auto primary_key_data_type = sample_block.getByName(primary_key).type; + Block header = storage_snapshot->metadata->getSampleBlock(); + auto primary_key_data_type = header.getByName(primary_key).type; - std::tie(fields, all_scan) = getFilterKeys(primary_key, primary_key_data_type, query_info, context); + std::tie(keys, all_scan) = getFilterKeys(primary_key, primary_key_data_type, query_info, context_); if (all_scan) { - /// TODO use scan to avoid performance issue - RedisCommand command_for_keys("KEYS"); - command_for_keys << "*"; - - auto all_keys = connection->client->execute(command_for_keys); - - if (all_keys.isNull() || all_keys.size() == 0) - return {}; - - Pipes pipes; - - size_t num_keys = all_keys.size(); - size_t num_threads = std::min(num_streams, all_keys.size()); - - num_threads = std::min(num_threads, configuration.pool_size); - assert(num_keys <= std::numeric_limits::max()); - - for (size_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) - { - size_t begin = num_keys * thread_idx / num_threads; - size_t end = num_keys * (thread_idx + 1) / num_threads; - - RedisArray keys; - for (size_t pos=begin; pos(pos)); - - if (configuration.storage_type == RedisStorageType::HASH_MAP) - { - keys = *getRedisHashMapKeys(connection, keys); - } - - delete connection.release(); - - /// TODO reduce keys copy - pipes.emplace_back(std::make_shared( - getRedisConnection(pool, configuration), keys, - configuration.storage_type, sample_block, redis_types, max_block_size)); - } - return Pipe::unitePipes(std::move(pipes)); + return Pipe(std::make_shared(*this, header, max_block_size)); } else { - if (fields->empty()) + if (keys->empty()) return {}; Pipes pipes; - size_t num_keys = fields->size(); - size_t num_threads = std::min(num_streams, fields->size()); + ::sort(keys->begin(), keys->end()); + keys->erase(std::unique(keys->begin(), keys->end()), keys->end()); + + size_t num_keys = keys->size(); + size_t num_threads = std::min(num_streams, keys->size()); num_threads = std::min(num_threads, configuration.pool_size); assert(num_keys <= std::numeric_limits::max()); @@ -133,34 +194,191 @@ Pipe StorageRedis::read( size_t begin = num_keys * thread_idx / num_threads; size_t end = num_keys * (thread_idx + 1) / num_threads; - RedisArray keys; - for (size_t pos=begin; posat(pos).get()); - } - else - { - keys.add(toString(fields->at(pos))); /// TODO redis source deserialize - } - } - - if (configuration.storage_type == RedisStorageType::HASH_MAP) - { - keys = *getRedisHashMapKeys(connection, keys); - } - - delete connection.release(); - - pipes.emplace_back(std::make_shared( - getRedisConnection(pool, configuration), keys, - configuration.storage_type, sample_block, redis_types, max_block_size)); + pipes.emplace_back(std::make_shared( + *this, header, keys, keys->begin() + begin, keys->begin() + end, max_block_size)); } return Pipe::unitePipes(std::move(pipes)); } } +namespace +{ + // host:port, db_index, password, pool_size + RedisConfiguration getRedisConfiguration(ASTs & engine_args, ContextPtr context) + { + RedisConfiguration configuration; + + if (engine_args.size() < 1) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Bad arguments count when creating Redis table engine"); + + if (auto named_collection = tryGetNamedCollectionWithOverrides(engine_args, context)) + { + validateNamedCollection( + *named_collection, + ValidateKeysMultiset{"host", "port", "hostname", "password", "db_index", "pool_size"}, + {}); + + configuration.host = named_collection->getAny({"host", "hostname"}); + configuration.port = static_cast(named_collection->getOrDefault("port", 6379)); + configuration.password = named_collection->getOrDefault("password", DEFAULT_REDIS_PASSWORD); + configuration.db_index = static_cast(named_collection->getOrDefault("db_index", DEFAULT_REDIS_DB_INDEX)); + configuration.pool_size = static_cast(named_collection->getOrDefault("pool_size", DEFAULT_REDIS_POOL_SIZE)); + } + else + { + for (auto & engine_arg : engine_args) + engine_arg = evaluateConstantExpressionOrIdentifierAsLiteral(engine_arg, context); + + /// 6379 is the default Redis port. + auto parsed_host_port = parseAddress(checkAndGetLiteralArgument(engine_args[0], "host:port"), 6379); + configuration.host = parsed_host_port.first; + configuration.port = parsed_host_port.second; + + if (engine_args.size() > 1) + configuration.db_index = static_cast(checkAndGetLiteralArgument(engine_args[1], "db_index")); + else + configuration.db_index = DEFAULT_REDIS_DB_INDEX; + if (engine_args.size() > 2) + configuration.password = checkAndGetLiteralArgument(engine_args[2], "password"); + else + configuration.password = DEFAULT_REDIS_PASSWORD; + if (engine_args.size() > 3) + configuration.pool_size = static_cast(checkAndGetLiteralArgument(engine_args[3], "pool_size")); + else + configuration.pool_size = DEFAULT_REDIS_POOL_SIZE; + } + + context->getRemoteHostFilter().checkHostAndPort(configuration.host, toString(configuration.port)); + return configuration; + } + + StoragePtr createStorageRedis(const StorageFactory::Arguments & args) + { + auto configuration = getRedisConfiguration(args.engine_args, args.getLocalContext()); + + StorageInMemoryMetadata metadata; + metadata.setColumns(args.columns); + metadata.setConstraints(args.constraints); + metadata.setComment(args.comment); + + if (!args.storage_def->primary_key) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "StorageRedis must require one column in primary key"); + + auto primary_key_desc = KeyDescription::getKeyFromAST(args.storage_def->primary_key->ptr(), metadata.columns, args.getContext()); + auto primary_key_names = primary_key_desc.expression->getRequiredColumns(); + + if (primary_key_names.size() != 1) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, "StorageRedis must require one column in primary key"); + } + + return std::make_shared( + args.table_id, + configuration, + args.getContext(), + metadata, + primary_key_names[0]); + } +} + +Chunk StorageRedis::getBySerializedKeys( + const std::vector & keys, + PaddedPODArray * null_map) const +{ + RedisArray redis_keys; + for (const auto & key : keys) + redis_keys.add(key); + return getBySerializedKeys(redis_keys, null_map); +} + +Chunk StorageRedis::getBySerializedKeys( + const RedisArray & keys, + PaddedPODArray * null_map) const +{ + Block sample_block = getInMemoryMetadataPtr()->getSampleBlock(); + + size_t primary_key_pos = getPrimaryKeyPos(sample_block, getPrimaryKey()); + MutableColumns columns = sample_block.cloneEmptyColumns(); + + RedisArray values = multiGet(keys); + if (values.isNull() || values.size() == 0) + return {}; + + if (null_map) + { + null_map->clear(); + null_map->resize_fill(keys.size(), 1); + } + + for (size_t i = 0; i < values.size(); ++i) + { + if (!values.get(i).isNull()) + { + fillColumns(keys.get(i).value(), + values.get(i).value(), + primary_key_pos, sample_block, columns + ); + } + else /// key not found + { + if (null_map) + { + (*null_map)[i] = 0; + for (size_t col_idx = 0; col_idx < sample_block.columns(); ++col_idx) + { + columns[col_idx]->insert(sample_block.getByPosition(col_idx).type->getDefault()); + } + } + } + } + + size_t num_rows = columns.at(0)->size(); + return Chunk(std::move(columns), num_rows); +} + +std::pair StorageRedis::scan(RedisIterator iterator, const String & pattern, const uint64_t max_count) +{ + auto connection = getRedisConnection(pool, configuration); + RedisCommand scan("SCAN"); + scan << toString(iterator) << "MATCH" << pattern << "COUNT" << toString(max_count); + + const auto & result = connection->client->execute(scan); + RedisIterator next = parse(result.get(0).value()); + + return {next, result.get(1)}; +} + +RedisArray StorageRedis::multiGet(const RedisArray & keys) const +{ + auto connection = getRedisConnection(pool, configuration); + + RedisCommand cmd_mget("MGET"); + for (size_t i = 0; i < keys.size(); ++i) + cmd_mget.add(keys.get(i)); + + return connection->client->execute(cmd_mget); +} + +Chunk StorageRedis::getByKeys( + const ColumnsWithTypeAndName & keys, + PaddedPODArray & null_map, + const Names &) const +{ + if (keys.size() != 1) + throw Exception(ErrorCodes::LOGICAL_ERROR, "StorageRedis supports only one key, got: {}", keys.size()); + + auto raw_keys = serializeKeysToRawString(keys[0]); + + if (raw_keys.size() != keys[0].column->size()) + throw DB::Exception(ErrorCodes::LOGICAL_ERROR, "Assertion failed: {} != {}", raw_keys.size(), keys[0].column->size()); + + return getBySerializedKeys(raw_keys, &null_map); +} + +Block StorageRedis::getSampleBlock(const Names &) const +{ + return getInMemoryMetadataPtr()->getSampleBlock(); +} SinkToStoragePtr StorageRedis::write( const ASTPtr & /*query*/, @@ -172,24 +390,13 @@ SinkToStoragePtr StorageRedis::write( void registerStorageRedis(StorageFactory & factory) { - factory.registerStorage( - "Redis", - [](const StorageFactory::Arguments & args) - { - auto configuration = getRedisConfiguration(args.engine_args, args.getLocalContext()); + StorageFactory::StorageFeatures features{ + .supports_sort_order = true, + .supports_parallel_insert = true, + .source_access_type = AccessType::REDIS, + }; - checkRedisTableStructure(args.columns, configuration); - - return std::make_shared( - args.table_id, - configuration, - args.columns, - args.constraints, - args.comment); - }, - { - .source_access_type = AccessType::REDIS, - }); + factory.registerStorage("Redis", createStorageRedis, features); } } diff --git a/src/Storages/StorageRedis.h b/src/Storages/StorageRedis.h index 619a83f3851..4a0418e6091 100644 --- a/src/Storages/StorageRedis.h +++ b/src/Storages/StorageRedis.h @@ -2,27 +2,24 @@ #include #include -#include +#include +#include +#include namespace DB { /* Implements storage in the Redis. - * Use ENGINE = Redis(host:port, db_index, password, storage_type, pool_size); - * Read only. - * - * Note If storage_type is - * SIMPLE: there should be 2 columns and the first one is key in Redis, the second one is value. - * HASH_MAP: there should be 3 columns and the first one is key in Redis and the second is the field of Redis Map. + * Use ENGINE = Redis(host:port[, db_index[, password[, pool_size]]]) PRIMARY KEY(key); */ -class StorageRedis : public IStorage +class StorageRedis : public IStorage, public IKeyValueEntity, WithContext { public: StorageRedis( const StorageID & table_id_, const RedisConfiguration & configuration_, - const ColumnsDescription & columns_, - const ConstraintsDescription & constraints_, - const String & comment_); + ContextPtr context_, + const StorageInMemoryMetadata & storage_metadata, + const String & primary_key_); std::string getName() const override { return "Redis"; } @@ -30,7 +27,7 @@ public: const Names & column_names, const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, - ContextPtr context, + ContextPtr context_, QueryProcessingStage::Enum processed_stage, size_t max_block_size, size_t num_streams) override; @@ -40,12 +37,34 @@ public: const StorageMetadataPtr & /*metadata_snapshot*/, ContextPtr context) override; + Names getPrimaryKey() const override { return {primary_key}; } + + /// Return chunk with data for given serialized keys. + /// If out_null_map is passed, fill it with 1/0 depending on key was/wasn't found. Result chunk may contain default values. + /// If out_null_map is not passed. Not found rows excluded from result chunk. + Chunk getBySerializedKeys( + const std::vector & keys, + PaddedPODArray * out_null_map) const; + + Chunk getBySerializedKeys( + const RedisArray & keys, + PaddedPODArray * out_null_map) const; + + std::pair scan(RedisIterator iterator, const String & pattern, const uint64_t max_count); + + RedisArray multiGet(const RedisArray & keys) const; + + Chunk getByKeys(const ColumnsWithTypeAndName & keys, PaddedPODArray & null_map, const Names &) const override; + + Block getSampleBlock(const Names &) const override; private: StorageID table_id; RedisConfiguration configuration; Poco::Logger * log; RedisPoolPtr pool; + + const String primary_key; }; } diff --git a/src/TableFunctions/TableFunctionRedis.cpp b/src/TableFunctions/TableFunctionRedis.cpp index f90a30af8a1..3db174fbcd8 100644 --- a/src/TableFunctions/TableFunctionRedis.cpp +++ b/src/TableFunctions/TableFunctionRedis.cpp @@ -1,6 +1,7 @@ #include #include +#include #include @@ -12,6 +13,8 @@ #include #include #include +#include + namespace DB @@ -21,7 +24,6 @@ namespace ErrorCodes { extern const int BAD_ARGUMENTS; extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; - extern const int INVALID_REDIS_STORAGE_TYPE; } @@ -29,19 +31,16 @@ StoragePtr TableFunctionRedis::executeImpl( const ASTPtr & /*ast_function*/, ContextPtr context, const String & table_name, ColumnsDescription /*cached_columns*/) const { auto columns = getActualTableStructure(context); - checkRedisTableStructure(columns, *configuration); + + StorageInMemoryMetadata metadata; + metadata.setColumns(columns); auto storage = std::make_shared( - StorageID(toString(configuration->db_index), table_name), - *configuration, - columns, - ConstraintsDescription(), - String{}); + StorageID(toString(configuration.db_index), table_name), configuration, context, metadata, primary_key); storage->startup(); return storage; } -/// TODO support user customized table structure ColumnsDescription TableFunctionRedis::getActualTableStructure(ContextPtr context) const { return parseColumnsListFromString(structure, context); @@ -54,25 +53,38 @@ void TableFunctionRedis::parseArguments(const ASTPtr & ast_function, ContextPtr throw Exception(ErrorCodes::BAD_ARGUMENTS, "Table function 'redis' must have arguments."); ASTs & args = func_args.arguments->children; - configuration = getRedisConfiguration(args, context); + if (args.size() < 3) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Bad arguments count when creating Redis table function"); + + for (auto & arg : args) + arg = evaluateConstantExpressionOrIdentifierAsLiteral(arg, context); + + auto parsed_host_port = parseAddress(checkAndGetLiteralArgument(args[0], "host:port"), 6379); + configuration.host = parsed_host_port.first; + configuration.port = parsed_host_port.second; + + primary_key = checkAndGetLiteralArgument(args[1], "key"); + structure = checkAndGetLiteralArgument(args[2], "structure"); + + if (args.size() > 3) + configuration.db_index = static_cast(checkAndGetLiteralArgument(args[3], "db_index")); + else + configuration.db_index = DEFAULT_REDIS_DB_INDEX; + if (args.size() > 4) + configuration.password = checkAndGetLiteralArgument(args[4], "password"); + else + configuration.password = DEFAULT_REDIS_PASSWORD; if (args.size() > 5) - structure = checkAndGetLiteralArgument(args[5], "structure"); + configuration.pool_size = static_cast(checkAndGetLiteralArgument(args[5], "pool_size")); + else + configuration.pool_size = DEFAULT_REDIS_POOL_SIZE; - if (structure.empty()) - { - switch (configuration->storage_type) - { - case RedisStorageType::SIMPLE: - structure = "key String, value String"; - break; - case RedisStorageType::HASH_MAP: - structure = "key String, field String, value String"; - break; - case RedisStorageType::UNKNOWN: - throw Exception(ErrorCodes::INVALID_REDIS_STORAGE_TYPE, "Invalid Redis storage type."); - } - } + context->getRemoteHostFilter().checkHostAndPort(configuration.host, toString(configuration.port)); + + auto columns = parseColumnsListFromString(structure, context); + if (!columns.has(primary_key)) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Bad arguments redis table function structure should contains key."); } diff --git a/src/TableFunctions/TableFunctionRedis.h b/src/TableFunctions/TableFunctionRedis.h index 1328d54a2a6..b985a89e3d7 100644 --- a/src/TableFunctions/TableFunctionRedis.h +++ b/src/TableFunctions/TableFunctionRedis.h @@ -7,6 +7,9 @@ namespace DB { +/* Implements Redis table function. + * Use redis(host:port, key, structure[, db_index[, password[, pool_size]]]); + */ class TableFunctionRedis : public ITableFunction { public: @@ -23,8 +26,9 @@ private: ColumnsDescription getActualTableStructure(ContextPtr context) const override; void parseArguments(const ASTPtr & ast_function, ContextPtr context) override; - std::optional configuration; + RedisConfiguration configuration; String structure; + String primary_key; }; } diff --git a/tests/integration/test_storage_redis/test.py b/tests/integration/test_storage_redis/test.py index 19e7b4e5340..1f65a9df2f3 100644 --- a/tests/integration/test_storage_redis/test.py +++ b/tests/integration/test_storage_redis/test.py @@ -3,6 +3,8 @@ import time ## sudo -H pip install redis import redis import pytest +import struct +import sys from helpers.client import QueryRuntimeException from helpers.cluster import ClickHouseCluster @@ -37,17 +39,50 @@ def drop_table(table): node.query(f"DROP TABLE IF EXISTS {table} SYNC"); -def test_storage_simple_select(started_cluster): +# see SerializationString.serializeBinary +def serialize_binary_for_string(x): + var_uint_max = (1 << 63) - 1 + buf = bytearray() + # write length + length = len(x) + # length = (length << 1) ^ (length >> 63) + if length > var_uint_max: + raise ValueError("Value too large for varint encoding") + for i in range(9): + byte = length & 0x7F + if length > 0x7F: + byte |= 0x80 + buf += (bytes([byte])) + length >>= 7 + if not length: + break + # write data + buf += x.encode('utf-8') + return bytes(buf) + + +# see SerializationNumber.serializeBinary +def serialize_binary_for_uint32(x): + buf = bytearray() + packed_num = struct.pack('I', x) + buf += packed_num + if sys.byteorder != 'little': + buf.reverse() + return bytes(buf) + + +def test_simple_select(started_cluster): client = get_redis_connection() address = get_address_for_ch() # clean all client.flushall() - drop_table('test_storage_simple_select') + drop_table('test_simple_select') data = {} for i in range(100): - data[str(i)] = str(i) + packed = serialize_binary_for_string(str(i)) + data[packed] = packed client.mset(data) client.close() @@ -55,56 +90,55 @@ def test_storage_simple_select(started_cluster): # create table node.query( f""" - CREATE TABLE test_storage_simple_select( + CREATE TABLE test_simple_select( k String, - v UInt32 - ) Engine=Redis('{address}', 0, 'clickhouse') + v String + ) Engine=Redis('{address}', 0, 'clickhouse') PRIMARY KEY (k) """ ) - response = TSV.toMat(node.query("SELECT k, v FROM test_storage_simple_select WHERE k='0' FORMAT TSV")) + response = TSV.toMat(node.query("SELECT k, v FROM test_simple_select WHERE k='0' FORMAT TSV")) assert (len(response) == 1) assert (response[0] == ['0', '0']) - response = TSV.toMat(node.query("SELECT * FROM test_storage_simple_select ORDER BY k FORMAT TSV")) + response = TSV.toMat(node.query("SELECT * FROM test_simple_select ORDER BY k FORMAT TSV")) assert (len(response) == 100) assert (response[0] == ['0', '0']) -def test_storage_hash_map_select(started_cluster): +def test_select_int(started_cluster): client = get_redis_connection() address = get_address_for_ch() # clean all client.flushall() - drop_table('test_storage_hash_map_select') + drop_table('test_select_int') - key = 'k' data = {} for i in range(100): - data[str(i)] = str(i) + packed = serialize_binary_for_uint32(i) + data[packed] = packed - client.hset(key, mapping=data) + client.mset(data) client.close() # create table node.query( f""" - CREATE TABLE test_storage_hash_map_select( - k String, - f String, + CREATE TABLE test_select_int( + k UInt32, v UInt32 - ) Engine=Redis('{address}', 0, 'clickhouse','hash_map') + ) Engine=Redis('{address}', 0, 'clickhouse') PRIMARY KEY (k) """ ) - response = TSV.toMat(node.query("SELECT k, f, v FROM test_storage_hash_map_select WHERE f='0' FORMAT TSV")) + response = TSV.toMat(node.query("SELECT k, v FROM test_select_int WHERE k=0 FORMAT TSV")) assert (len(response) == 1) - assert (response[0] == ['k', '0', '0']) + assert (response[0] == ['0', '0']) - response = TSV.toMat(node.query("SELECT * FROM test_storage_hash_map_select ORDER BY f FORMAT TSV")) + response = TSV.toMat(node.query("SELECT * FROM test_select_int ORDER BY k FORMAT TSV")) assert (len(response) == 100) - assert (response[0] == ['k', '0', '0']) + assert (response[0] == ['0', '0']) def test_create_table(started_cluster): @@ -117,7 +151,7 @@ def test_create_table(started_cluster): CREATE TABLE test_create_table( k String, v UInt32 - ) Engine=Redis('{address}') + ) Engine=Redis('{address}') PRIMARY KEY (k) """ ) @@ -128,7 +162,7 @@ def test_create_table(started_cluster): CREATE TABLE test_create_table( k String, v UInt32 - ) Engine=Redis('{address}', 0, 'clickhouse','simple', 10) + ) Engine=Redis('{address}', 0, 'clickhouse', 10) PRIMARY KEY (k) """ ) @@ -139,11 +173,10 @@ def test_create_table(started_cluster): k String, f String, v UInt32 - ) Engine=Redis('{address}', 0, 'clickhouse','hash_map', 10) + ) Engine=Redis('{address}', 0, 'clickhouse', 10) PRIMARY KEY (k) """ ) - # illegal columns drop_table('test_create_table') with pytest.raises(QueryRuntimeException): node.query( @@ -152,7 +185,7 @@ def test_create_table(started_cluster): k String, f String, v UInt32 - ) Engine=Redis('{address}', 0, 'clickhouse','simple', 10) + ) Engine=Redis('{address}', 0, 'clickhouse', 10) PRIMARY KEY () """ ) @@ -163,22 +196,8 @@ def test_create_table(started_cluster): CREATE TABLE test_create_table( k String, f String, - v UInt32, - n UInt32 - ) Engine=Redis('{address}', 0, 'clickhouse','hash_map', 10) - """ - ) - - # illegal storage type - drop_table('test_create_table') - with pytest.raises(QueryRuntimeException): - node.query( - f""" - CREATE TABLE test_create_table( - k String, v UInt32 - ) Engine=Redis('{address}', 0, 'clickhouse','not_exist', 10) + ) Engine=Redis('{address}', 0, 'clickhouse', 10) """ ) - diff --git a/tests/integration/test_table_function_redis/test.py b/tests/integration/test_table_function_redis/test.py index 7c342690027..111276ec6dc 100644 --- a/tests/integration/test_table_function_redis/test.py +++ b/tests/integration/test_table_function_redis/test.py @@ -1,7 +1,9 @@ -import time +import datetime import redis import pytest +import sys +import struct from helpers.client import QueryRuntimeException from helpers.cluster import ClickHouseCluster @@ -32,7 +34,39 @@ def get_address_for_ch(): return cluster.redis_host + ':6379' -def test_storage_simple(started_cluster): +# see SerializationString.serializeBinary +def serialize_binary_for_string(x): + var_uint_max = (1 << 63) - 1 + buf = bytearray() + # write length + length = len(x) + # length = (length << 1) ^ (length >> 63) + if length > var_uint_max: + raise ValueError("Value too large for varint encoding") + for i in range(9): + byte = length & 0x7F + if length > 0x7F: + byte |= 0x80 + buf += (bytes([byte])) + length >>= 7 + if not length: + break + # write data + buf += x.encode('utf-8') + return bytes(buf) + + +# see SerializationNumber.serializeBinary +def serialize_binary_for_uint32(x): + buf = bytearray() + packed_num = struct.pack('I', x) + buf += packed_num + if sys.byteorder != 'little': + buf.reverse() + return bytes(buf) + + +def test_simple_select(started_cluster): client = get_redis_connection() address = get_address_for_ch() @@ -41,7 +75,8 @@ def test_storage_simple(started_cluster): data = {} for i in range(100): - data[str(i)] = str(i) + packed = serialize_binary_for_string(str(i)) + data[packed] = packed client.mset(data) client.close() @@ -51,7 +86,7 @@ def test_storage_simple(started_cluster): SELECT key, value FROM - redis('{address}', 0, 'clickhouse') + redis('{address}', 'key', 'key String, value String', 0, 'clickhouse', 10) WHERE key='0' FORMAT TSV @@ -65,7 +100,7 @@ def test_storage_simple(started_cluster): SELECT * FROM - redis('{address}', 0, 'clickhouse') + redis('{address}', 'key', 'key String, value String', 0, 'clickhouse', 10) ORDER BY key FORMAT TSV @@ -75,79 +110,22 @@ def test_storage_simple(started_cluster): assert (response[0] == ['0', '0']) -def test_storage_hash_map(started_cluster): +def test_create_table(started_cluster): client = get_redis_connection() address = get_address_for_ch() # clean all client.flushall() - - key = 'k' - data = {} - for i in range(100): - data[str(i)] = str(i) - - client.hset(key, mapping=data) client.close() - response = TSV.toMat(node.query( - f""" - SELECT - key, field, value - FROM - redis('{address}', 0, 'clickhouse','hash_map') - WHERE - field='0' - FORMAT TSV - """)) - - assert (len(response) == 1) - assert (response[0] == ['k', '0', '0']) - - response = TSV.toMat(node.query( - f""" - SELECT - * - FROM - redis('{address}', 0, 'clickhouse','hash_map') - ORDER BY - field - FORMAT TSV - """)) - - assert (len(response) == 100) - assert (response[0] == ['k', '0', '0']) - - -def test_customized_table_structure(started_cluster): - address = get_address_for_ch() - node.query( f""" SELECT * FROM - redis('{address}', 0, 'clickhouse', "simple", 10, "k String, v UInt8") + redis('{address}', 'k', 'k String, v UInt32', 0, 'clickhouse', 10) """) - node.query( - f""" - SELECT - * - FROM - redis('{address}', 0, 'clickhouse', "hash_map", 10, "k String, f UInt8, v String") - """) - - # illegal columns - with pytest.raises(QueryRuntimeException): - node.query( - f""" - SELECT - * - FROM - redis('{address}', 0, 'clickhouse', "hash_map", 10, "k String, v String") - """) - # illegal data type with pytest.raises(QueryRuntimeException): node.query( @@ -155,7 +133,17 @@ def test_customized_table_structure(started_cluster): SELECT * FROM - redis('{address}', 0, 'clickhouse', "simple", 10, "k Ss, v String") + redis('{address}', 'k', 'k not_exist_type, v String', 0, 'clickhouse', 10) + """) + + # illegal key + with pytest.raises(QueryRuntimeException): + node.query( + f""" + SELECT + * + FROM + redis('{address}', 'not_exist_key', 'k not_exist_type, v String', 0, 'clickhouse', 10) """) @@ -165,14 +153,15 @@ def test_data_type(started_cluster): # string client.flushall() - client.set('0', '0') + value = serialize_binary_for_string('0') + client.set(value, value) response = TSV.toMat(node.query( f""" SELECT * FROM - redis('{address}', 0, 'clickhouse', 'simple', 10, "k String, v UInt8") + redis('{address}', 'k', 'k String, v String', 0, 'clickhouse', 10) WHERE k='0' FORMAT TSV @@ -183,14 +172,15 @@ def test_data_type(started_cluster): # number client.flushall() - client.set('0', '0') + value = serialize_binary_for_uint32(0) + client.set(value, value) response = TSV.toMat(node.query( f""" SELECT * FROM - redis('{address}', 0, 'clickhouse', 'simple', 10, "k UInt8, v UInt8") + redis('{address}', 'k', 'k UInt32, v UInt32', 0, 'clickhouse', 10) WHERE k=0 FORMAT TSV @@ -201,19 +191,22 @@ def test_data_type(started_cluster): # datetime client.flushall() - client.set('2023-06-01 00:00:00', '0') + # clickhouse store datatime as uint32 in internal + dt = datetime.datetime(2023, 6, 1, 0, 0, 0) + seconds_since_epoch = dt.timestamp() + value = serialize_binary_for_uint32(int(seconds_since_epoch)) + client.set(value, value) response = TSV.toMat(node.query( f""" SELECT * FROM - redis('{address}', 0, 'clickhouse', 'simple', 10, "k DateTime, v UInt8") + redis('{address}', 'k', 'k DateTime, v DateTime', 0, 'clickhouse', 10) WHERE k='2023-06-01 00:00:00' FORMAT TSV """)) - # TODO open - # assert (len(response) == 1) - # assert (response[0] == ['2023-06-01 00:00:00', '0']) + assert (len(response) == 1) + assert (response[0] == ['2023-06-01 00:00:00', '2023-06-01 00:00:00']) From 1df1dfc3e54b9e9b2b0f05a516ffc83ff3147c76 Mon Sep 17 00:00:00 2001 From: JackyWoo Date: Wed, 31 May 2023 14:31:06 +0800 Subject: [PATCH 0278/1072] add update/delete/insert to redis storage --- src/Storages/RedisCommon.h | 4 +- src/Storages/StorageRedis.cpp | 198 ++++++++++++++++++- src/Storages/StorageRedis.h | 14 +- tests/integration/test_storage_redis/test.py | 135 ++++++++++++- 4 files changed, 344 insertions(+), 7 deletions(-) diff --git a/src/Storages/RedisCommon.h b/src/Storages/RedisCommon.h index cb551a9a11a..49c21c3277f 100644 --- a/src/Storages/RedisCommon.h +++ b/src/Storages/RedisCommon.h @@ -43,10 +43,12 @@ static uint32_t DEFAULT_REDIS_DB_INDEX = 0; static uint32_t DEFAULT_REDIS_POOL_SIZE = 16; static String DEFAULT_REDIS_PASSWORD; +using RedisCommand = Poco::Redis::Command; using RedisArray = Poco::Redis::Array; using RedisArrayPtr = std::shared_ptr; -using RedisCommand = Poco::Redis::Command; using RedisBulkString = Poco::Redis::BulkString; +using RedisSimpleString = String; +using RedisInteger = Int64; using RedisClientPtr = std::unique_ptr; using RedisPool = BorrowedObjectPool; diff --git a/src/Storages/StorageRedis.cpp b/src/Storages/StorageRedis.cpp index ceed448b4a7..f9a25470e2d 100644 --- a/src/Storages/StorageRedis.cpp +++ b/src/Storages/StorageRedis.cpp @@ -13,9 +13,12 @@ #include #include #include +#include #include #include #include +#include +#include namespace DB { @@ -23,6 +26,7 @@ namespace DB namespace ErrorCodes { extern const int NOT_IMPLEMENTED; + extern const int INTERNAL_REDIS_ERROR; } class RedisDataSource : public ISource @@ -133,6 +137,64 @@ private: const size_t max_block_size; }; + +class RedisSink : public SinkToStorage +{ +public: + RedisSink( + StorageRedis & storage_, + const StorageMetadataPtr & metadata_snapshot_); + + void consume(Chunk chunk) override; + String getName() const override { return "RedisSink"; } + +private: + StorageRedis & storage; + StorageMetadataPtr metadata_snapshot; + size_t primary_key_pos = 0; +}; + +RedisSink::RedisSink( + StorageRedis & storage_, + const StorageMetadataPtr & metadata_snapshot_) + : SinkToStorage(metadata_snapshot_->getSampleBlock()) + , storage(storage_) + , metadata_snapshot(metadata_snapshot_) +{ + for (const auto & column : getHeader()) + { + if (column.name == storage.getPrimaryKey()[0]) + break; + ++primary_key_pos; + } +} + +void RedisSink::consume(Chunk chunk) +{ + auto rows = chunk.getNumRows(); + auto block = getHeader().cloneWithColumns(chunk.detachColumns()); + + WriteBufferFromOwnString wb_key; + WriteBufferFromOwnString wb_value; + + RedisArray data; + for (size_t i = 0; i < rows; ++i) + { + wb_key.restart(); + wb_value.restart(); + + size_t idx = 0; + for (const auto & elem : block) + { + elem.type->getDefaultSerialization()->serializeBinary(*elem.column, i, idx == primary_key_pos ? wb_key : wb_value, {}); + ++idx; + } + data.add(wb_key.str()); + data.add(wb_value.str()); + } + storage.multiSet(data); +} + StorageRedis::StorageRedis( const StorageID & table_id_, const RedisConfiguration & configuration_, @@ -336,7 +398,7 @@ Chunk StorageRedis::getBySerializedKeys( return Chunk(std::move(columns), num_rows); } -std::pair StorageRedis::scan(RedisIterator iterator, const String & pattern, const uint64_t max_count) +std::pair StorageRedis::scan(RedisIterator iterator, const String & pattern, uint64_t max_count) { auto connection = getRedisConnection(pool, configuration); RedisCommand scan("SCAN"); @@ -359,6 +421,36 @@ RedisArray StorageRedis::multiGet(const RedisArray & keys) const return connection->client->execute(cmd_mget); } +void StorageRedis::multiSet(const RedisArray & data) const +{ + auto connection = getRedisConnection(pool, configuration); + + RedisCommand cmd_mget("MSET"); + for (size_t i = 0; i < data.size(); ++i) + cmd_mget.add(data.get(i)); + + auto ret = connection->client->execute(cmd_mget); + if (ret != "OK") + throw Exception(ErrorCodes::INTERNAL_REDIS_ERROR, "Fail to write to redis table {}, for {}", + table_id.getFullNameNotQuoted(), ret); +} + +RedisInteger StorageRedis::multiDelete(const RedisArray & keys) const +{ + auto connection = getRedisConnection(pool, configuration); + + RedisCommand cmd("DEL"); + for (size_t i = 0; i < keys.size(); ++i) + cmd.add(keys.get(i)); + + auto ret = connection->client->execute(cmd); + if (ret != static_cast(keys.size())) + LOG_DEBUG(log, "Try to delete {} rows but actually deleted {} rows from redis table {}.", + keys.size(), ret, table_id.getFullNameNotQuoted()); + + return ret; +} + Chunk StorageRedis::getByKeys( const ColumnsWithTypeAndName & keys, PaddedPODArray & null_map, @@ -382,10 +474,110 @@ Block StorageRedis::getSampleBlock(const Names &) const SinkToStoragePtr StorageRedis::write( const ASTPtr & /*query*/, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageMetadataPtr & metadata_snapshot, ContextPtr /*context*/) { - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Write is unsupported for StorageRedis"); + return std::make_shared(*this, metadata_snapshot); +} + +/// TODO use scan to reduce latency +void StorageRedis::truncate(const ASTPtr &, const StorageMetadataPtr &, ContextPtr, TableExclusiveLockHolder &) +{ + auto connection = getRedisConnection(pool, configuration); + + RedisCommand cmd_flush_db("FLUSHDB"); + cmd_flush_db << toString(configuration.db_index); + auto ret = connection->client->execute(cmd_flush_db); + + if (ret.isNull() || ret.value() != "OK") + throw Exception(ErrorCodes::INTERNAL_REDIS_ERROR, "Fail to truncate redis table {}, for {}", table_id.getFullNameNotQuoted(), ret.value()); +} + +void StorageRedis::checkMutationIsPossible(const MutationCommands & commands, const Settings & /* settings */) const +{ + if (commands.empty()) + return; + + if (commands.size() > 1) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Mutations cannot be combined for StorageRedis"); + + const auto command_type = commands.front().type; + if (command_type != MutationCommand::Type::UPDATE && command_type != MutationCommand::Type::DELETE) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Only DELETE and UPDATE mutation supported for StorageRedis"); +} + +void StorageRedis::mutate(const MutationCommands & commands, ContextPtr context_) +{ + if (commands.empty()) + return; + + assert(commands.size() == 1); + + auto metadata_snapshot = getInMemoryMetadataPtr(); + auto storage = getStorageID(); + auto storage_ptr = DatabaseCatalog::instance().getTable(storage, context_); + + if (commands.front().type == MutationCommand::Type::DELETE) + { + auto interpreter = std::make_unique( + storage_ptr, + metadata_snapshot, + commands, + context_, + /*can_execute_*/ true, + /*return_all_columns_*/ true, + /*return_mutated_rows*/ true); + auto pipeline = QueryPipelineBuilder::getPipeline(interpreter->execute()); + PullingPipelineExecutor executor(pipeline); + + auto sink = std::make_shared(*this, metadata_snapshot); + + auto header = interpreter->getUpdatedHeader(); + auto primary_key_pos = header.getPositionByName(primary_key); + + Block block; + while (executor.pull(block)) + { + auto & column_type_name = block.getByPosition(primary_key_pos); + + auto column = column_type_name.column; + auto size = column->size(); + + RedisArray keys; + WriteBufferFromOwnString wb_key; + for (size_t i = 0; i < size; ++i) + { + wb_key.restart(); + column_type_name.type->getDefaultSerialization()->serializeBinary(*column, i, wb_key, {}); + keys.add(wb_key.str()); + } + multiDelete(keys); + } + return; + } + + assert(commands.front().type == MutationCommand::Type::UPDATE); + if (commands.front().column_to_update_expression.contains(primary_key)) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Primary key cannot be updated (cannot update column {})", primary_key); + + auto interpreter = std::make_unique( + storage_ptr, + metadata_snapshot, + commands, + context_, + /*can_execute_*/ true, + /*return_all_columns*/ true, + /*return_mutated_rows*/ true); + auto pipeline = QueryPipelineBuilder::getPipeline(interpreter->execute()); + PullingPipelineExecutor executor(pipeline); + + auto sink = std::make_shared(*this, metadata_snapshot); + + Block block; + while (executor.pull(block)) + { + sink->consume(Chunk{block.getColumns(), block.rows()}); + } } void registerStorageRedis(StorageFactory & factory) diff --git a/src/Storages/StorageRedis.h b/src/Storages/StorageRedis.h index 4a0418e6091..a4ab9a6aa4e 100644 --- a/src/Storages/StorageRedis.h +++ b/src/Storages/StorageRedis.h @@ -5,6 +5,7 @@ #include #include #include +#include namespace DB { @@ -37,6 +38,14 @@ public: const StorageMetadataPtr & /*metadata_snapshot*/, ContextPtr context) override; + void truncate(const ASTPtr &, + const StorageMetadataPtr & metadata_snapshot, + ContextPtr, + TableExclusiveLockHolder &) override; + + void checkMutationIsPossible(const MutationCommands & commands, const Settings & settings) const override; + void mutate(const MutationCommands &, ContextPtr) override; + Names getPrimaryKey() const override { return {primary_key}; } /// Return chunk with data for given serialized keys. @@ -50,13 +59,16 @@ public: const RedisArray & keys, PaddedPODArray * out_null_map) const; - std::pair scan(RedisIterator iterator, const String & pattern, const uint64_t max_count); + std::pair scan(RedisIterator iterator, const String & pattern, uint64_t max_count); RedisArray multiGet(const RedisArray & keys) const; + void multiSet(const RedisArray & data) const; + RedisInteger multiDelete(const RedisArray & keys) const; Chunk getByKeys(const ColumnsWithTypeAndName & keys, PaddedPODArray & null_map, const Names &) const override; Block getSampleBlock(const Names &) const override; + private: StorageID table_id; RedisConfiguration configuration; diff --git a/tests/integration/test_storage_redis/test.py b/tests/integration/test_storage_redis/test.py index 1f65a9df2f3..e77de99c649 100644 --- a/tests/integration/test_storage_redis/test.py +++ b/tests/integration/test_storage_redis/test.py @@ -1,5 +1,3 @@ -import time - ## sudo -H pip install redis import redis import pytest @@ -201,3 +199,136 @@ def test_create_table(started_cluster): """ ) + +def test_simple_insert(started_cluster): + client = get_redis_connection() + address = get_address_for_ch() + + # clean all + client.flushall() + drop_table('test_simple_insert') + + node.query( + f""" + CREATE TABLE test_simple_insert( + k UInt32, + m DateTime, + n String + ) Engine=Redis('{address}', 0, 'clickhouse') PRIMARY KEY (k) + """ + ) + + node.query( + """ + INSERT INTO test_simple_insert Values + (1, '2023-06-01 00:00:00', 'lili'), (2, '2023-06-02 00:00:00', 'lucy') + """ + ) + + response = node.query("SELECT COUNT(*) FROM test_simple_insert FORMAT Values") + assert (response == '(2)') + + response = TSV.toMat(node.query("SELECT k, m, n FROM test_simple_insert WHERE k=1 FORMAT TSV")) + assert (len(response) == 1) + assert (response[0] == ['1', '2023-06-01 00:00:00', 'lili']) + + response = TSV.toMat(node.query("SELECT k, m, n FROM test_simple_insert WHERE m='2023-06-01 00:00:00' FORMAT TSV")) + assert (len(response) == 1) + assert (response[0] == ['1', '2023-06-01 00:00:00', 'lili']) + + response = TSV.toMat(node.query("SELECT k, m, n FROM test_simple_insert WHERE n='lili' FORMAT TSV")) + assert (len(response) == 1) + assert (response[0] == ['1', '2023-06-01 00:00:00', 'lili']) + + +def test_update(started_cluster): + client = get_redis_connection() + address = get_address_for_ch() + # clean all + client.flushall() + drop_table('test_update') + + node.query( + f""" + CREATE TABLE test_update( + k UInt32, + m DateTime, + n String + ) Engine=Redis('{address}', 0, 'clickhouse') PRIMARY KEY (k) + """ + ) + + node.query( + """ + INSERT INTO test_update Values + (1, '2023-06-01 00:00:00', 'lili'), (2, '2023-06-02 00:00:00', 'lucy') + """ + ) + + response = node.query( + """ + ALTER TABLE test_update UPDATE m='2023-06-03 00:00:00' WHERE k=1 + """ + ) + + print("update response: ", response) + + response = TSV.toMat(node.query("SELECT k, m, n FROM test_update WHERE k=1 FORMAT TSV")) + assert (len(response) == 1) + assert (response[0] == ['1', '2023-06-03 00:00:00', 'lili']) + + # can not update key + with pytest.raises(QueryRuntimeException): + node.query( + """ + ALTER TABLE test_update UPDATE k=2 WHERE k=1 + """ + ) + + +def test_delete(started_cluster): + client = get_redis_connection() + address = get_address_for_ch() + + # clean all + client.flushall() + drop_table('test_delete') + + node.query( + f""" + CREATE TABLE test_delete( + k UInt32, + m DateTime, + n String + ) Engine=Redis('{address}', 0, 'clickhouse') PRIMARY KEY (k) + """ + ) + + node.query( + """ + INSERT INTO test_delete Values + (1, '2023-06-01 00:00:00', 'lili'), (2, '2023-06-02 00:00:00', 'lucy') + """ + ) + + response = node.query( + """ + ALTER TABLE test_delete DELETE WHERE k=1 + """ + ) + + print("delete response: ", response) + + response = TSV.toMat(node.query("SELECT k, m, n FROM test_delete FORMAT TSV")) + assert (len(response) == 1) + assert (response[0] == ['2', '2023-06-02 00:00:00', 'lucy']) + + response = node.query( + """ + ALTER TABLE test_delete DELETE WHERE m='2023-06-02 00:00:00' + """ + ) + + response = TSV.toMat(node.query("SELECT k, m, n FROM test_delete FORMAT TSV")) + assert (len(response) == 0) + From 010670457359862da5622baa24d50f0d5bf42557 Mon Sep 17 00:00:00 2001 From: JackyWoo Date: Wed, 31 May 2023 16:31:06 +0800 Subject: [PATCH 0279/1072] add truncate to redis storage --- src/Storages/StorageRedis.cpp | 11 +++--- tests/integration/test_storage_redis/test.py | 37 ++++++++++++++++++++ 2 files changed, 43 insertions(+), 5 deletions(-) diff --git a/src/Storages/StorageRedis.cpp b/src/Storages/StorageRedis.cpp index f9a25470e2d..dd33f6e6839 100644 --- a/src/Storages/StorageRedis.cpp +++ b/src/Storages/StorageRedis.cpp @@ -87,6 +87,7 @@ public: return storage.getBySerializedKeys(raw_keys, nullptr); } + /// TODO scan may get duplicated keys Chunk generateFullScan() { /// redis scan ending @@ -480,17 +481,16 @@ SinkToStoragePtr StorageRedis::write( return std::make_shared(*this, metadata_snapshot); } -/// TODO use scan to reduce latency void StorageRedis::truncate(const ASTPtr &, const StorageMetadataPtr &, ContextPtr, TableExclusiveLockHolder &) { auto connection = getRedisConnection(pool, configuration); RedisCommand cmd_flush_db("FLUSHDB"); - cmd_flush_db << toString(configuration.db_index); - auto ret = connection->client->execute(cmd_flush_db); + cmd_flush_db.add("ASYNC"); + auto ret = connection->client->execute(cmd_flush_db); - if (ret.isNull() || ret.value() != "OK") - throw Exception(ErrorCodes::INTERNAL_REDIS_ERROR, "Fail to truncate redis table {}, for {}", table_id.getFullNameNotQuoted(), ret.value()); + if (ret != "OK") + throw Exception(ErrorCodes::INTERNAL_REDIS_ERROR, "Fail to truncate redis table {}, for {}", table_id.getFullNameNotQuoted(), ret); } void StorageRedis::checkMutationIsPossible(const MutationCommands & commands, const Settings & /* settings */) const @@ -580,6 +580,7 @@ void StorageRedis::mutate(const MutationCommands & commands, ContextPtr context_ } } +/// TODO support ttl void registerStorageRedis(StorageFactory & factory) { StorageFactory::StorageFeatures features{ diff --git a/tests/integration/test_storage_redis/test.py b/tests/integration/test_storage_redis/test.py index e77de99c649..ad1b0ada068 100644 --- a/tests/integration/test_storage_redis/test.py +++ b/tests/integration/test_storage_redis/test.py @@ -332,3 +332,40 @@ def test_delete(started_cluster): response = TSV.toMat(node.query("SELECT k, m, n FROM test_delete FORMAT TSV")) assert (len(response) == 0) + +def test_truncate(started_cluster): + client = get_redis_connection() + address = get_address_for_ch() + # clean all + client.flushall() + drop_table('test_truncate') + + node.query( + f""" + CREATE TABLE test_truncate( + k UInt32, + m DateTime, + n String + ) Engine=Redis('{address}', 0, 'clickhouse') PRIMARY KEY (k) + """ + ) + + node.query( + """ + INSERT INTO test_truncate Values + (1, '2023-06-01 00:00:00', 'lili'), (2, '2023-06-02 00:00:00', 'lucy') + """ + ) + + response = node.query( + """ + TRUNCATE TABLE test_truncate + """ + ) + + print("truncate table response: ", response) + + response = TSV.toMat(node.query("SELECT COUNT(*) FROM test_truncate FORMAT TSV")) + assert (len(response) == 1) + assert (response[0] == ['0']) + From bcf22c1ec79fad247be8c48f4bada508ef0d8063 Mon Sep 17 00:00:00 2001 From: JackyWoo Date: Wed, 31 May 2023 18:15:38 +0800 Subject: [PATCH 0280/1072] fix code style --- src/Storages/RedisCommon.cpp | 2 - src/Storages/RedisCommon.h | 2 +- src/Storages/StorageRedis.cpp | 3 +- src/TableFunctions/TableFunctionRedis.cpp | 3 - tests/integration/test_storage_redis/test.py | 119 ++++++++++-------- .../test_table_function_redis/test.py | 34 ++--- 6 files changed, 89 insertions(+), 74 deletions(-) diff --git a/src/Storages/RedisCommon.cpp b/src/Storages/RedisCommon.cpp index ba7c02fdac5..a0534a9e23b 100644 --- a/src/Storages/RedisCommon.cpp +++ b/src/Storages/RedisCommon.cpp @@ -8,10 +8,8 @@ namespace DB namespace ErrorCodes { - extern const int INVALID_REDIS_TABLE_STRUCTURE; extern const int INTERNAL_REDIS_ERROR; extern const int TIMEOUT_EXCEEDED; - extern const int BAD_ARGUMENTS; extern const int INVALID_REDIS_STORAGE_TYPE; } diff --git a/src/Storages/RedisCommon.h b/src/Storages/RedisCommon.h index 49c21c3277f..cf39be20ba9 100644 --- a/src/Storages/RedisCommon.h +++ b/src/Storages/RedisCommon.h @@ -54,7 +54,7 @@ using RedisClientPtr = std::unique_ptr; using RedisPool = BorrowedObjectPool; using RedisPoolPtr = std::shared_ptr; -/// Redis scan interator +/// Redis scan iterator using RedisIterator = int64_t; struct RedisConnection diff --git a/src/Storages/StorageRedis.cpp b/src/Storages/StorageRedis.cpp index dd33f6e6839..b17528c7eae 100644 --- a/src/Storages/StorageRedis.cpp +++ b/src/Storages/StorageRedis.cpp @@ -25,7 +25,8 @@ namespace DB namespace ErrorCodes { - extern const int NOT_IMPLEMENTED; + extern const int BAD_ARGUMENTS; + extern const int LOGICAL_ERROR; extern const int INTERNAL_REDIS_ERROR; } diff --git a/src/TableFunctions/TableFunctionRedis.cpp b/src/TableFunctions/TableFunctionRedis.cpp index 3db174fbcd8..bf147c08776 100644 --- a/src/TableFunctions/TableFunctionRedis.cpp +++ b/src/TableFunctions/TableFunctionRedis.cpp @@ -16,17 +16,14 @@ #include - namespace DB { namespace ErrorCodes { extern const int BAD_ARGUMENTS; - extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; } - StoragePtr TableFunctionRedis::executeImpl( const ASTPtr & /*ast_function*/, ContextPtr context, const String & table_name, ColumnsDescription /*cached_columns*/) const { diff --git a/tests/integration/test_storage_redis/test.py b/tests/integration/test_storage_redis/test.py index ad1b0ada068..66d34ebc711 100644 --- a/tests/integration/test_storage_redis/test.py +++ b/tests/integration/test_storage_redis/test.py @@ -24,17 +24,17 @@ def started_cluster(): def get_redis_connection(db_id=0): client = redis.Redis( - host='localhost', port=cluster.redis_port, password="clickhouse", db=db_id + host="localhost", port=cluster.redis_port, password="clickhouse", db=db_id ) return client def get_address_for_ch(): - return cluster.redis_host + ':6379' + return cluster.redis_host + ":6379" def drop_table(table): - node.query(f"DROP TABLE IF EXISTS {table} SYNC"); + node.query(f"DROP TABLE IF EXISTS {table} SYNC") # see SerializationString.serializeBinary @@ -50,21 +50,21 @@ def serialize_binary_for_string(x): byte = length & 0x7F if length > 0x7F: byte |= 0x80 - buf += (bytes([byte])) + buf += bytes([byte]) length >>= 7 if not length: break # write data - buf += x.encode('utf-8') + buf += x.encode("utf-8") return bytes(buf) # see SerializationNumber.serializeBinary def serialize_binary_for_uint32(x): buf = bytearray() - packed_num = struct.pack('I', x) + packed_num = struct.pack("I", x) buf += packed_num - if sys.byteorder != 'little': + if sys.byteorder != "little": buf.reverse() return bytes(buf) @@ -75,7 +75,7 @@ def test_simple_select(started_cluster): # clean all client.flushall() - drop_table('test_simple_select') + drop_table("test_simple_select") data = {} for i in range(100): @@ -95,13 +95,17 @@ def test_simple_select(started_cluster): """ ) - response = TSV.toMat(node.query("SELECT k, v FROM test_simple_select WHERE k='0' FORMAT TSV")) - assert (len(response) == 1) - assert (response[0] == ['0', '0']) + response = TSV.toMat( + node.query("SELECT k, v FROM test_simple_select WHERE k='0' FORMAT TSV") + ) + assert len(response) == 1 + assert response[0] == ["0", "0"] - response = TSV.toMat(node.query("SELECT * FROM test_simple_select ORDER BY k FORMAT TSV")) - assert (len(response) == 100) - assert (response[0] == ['0', '0']) + response = TSV.toMat( + node.query("SELECT * FROM test_simple_select ORDER BY k FORMAT TSV") + ) + assert len(response) == 100 + assert response[0] == ["0", "0"] def test_select_int(started_cluster): @@ -110,7 +114,7 @@ def test_select_int(started_cluster): # clean all client.flushall() - drop_table('test_select_int') + drop_table("test_select_int") data = {} for i in range(100): @@ -130,20 +134,25 @@ def test_select_int(started_cluster): """ ) - response = TSV.toMat(node.query("SELECT k, v FROM test_select_int WHERE k=0 FORMAT TSV")) - assert (len(response) == 1) - assert (response[0] == ['0', '0']) + response = TSV.toMat( + node.query("SELECT k, v FROM test_select_int WHERE k=0 FORMAT TSV") + ) + assert len(response) == 1 + assert response[0] == ["0", "0"] + + response = TSV.toMat( + node.query("SELECT * FROM test_select_int ORDER BY k FORMAT TSV") + ) + assert len(response) == 100 + assert response[0] == ["0", "0"] - response = TSV.toMat(node.query("SELECT * FROM test_select_int ORDER BY k FORMAT TSV")) - assert (len(response) == 100) - assert (response[0] == ['0', '0']) def test_create_table(started_cluster): address = get_address_for_ch() # simple creation - drop_table('test_create_table') + drop_table("test_create_table") node.query( f""" CREATE TABLE test_create_table( @@ -154,7 +163,7 @@ def test_create_table(started_cluster): ) # simple creation with full engine args - drop_table('test_create_table') + drop_table("test_create_table") node.query( f""" CREATE TABLE test_create_table( @@ -164,7 +173,7 @@ def test_create_table(started_cluster): """ ) - drop_table('test_create_table') + drop_table("test_create_table") node.query( f""" CREATE TABLE test_create_table( @@ -175,7 +184,7 @@ def test_create_table(started_cluster): """ ) - drop_table('test_create_table') + drop_table("test_create_table") with pytest.raises(QueryRuntimeException): node.query( f""" @@ -187,7 +196,7 @@ def test_create_table(started_cluster): """ ) - drop_table('test_create_table') + drop_table("test_create_table") with pytest.raises(QueryRuntimeException): node.query( f""" @@ -226,19 +235,27 @@ def test_simple_insert(started_cluster): ) response = node.query("SELECT COUNT(*) FROM test_simple_insert FORMAT Values") - assert (response == '(2)') + assert response == "(2)" - response = TSV.toMat(node.query("SELECT k, m, n FROM test_simple_insert WHERE k=1 FORMAT TSV")) - assert (len(response) == 1) - assert (response[0] == ['1', '2023-06-01 00:00:00', 'lili']) + response = TSV.toMat( + node.query("SELECT k, m, n FROM test_simple_insert WHERE k=1 FORMAT TSV") + ) + assert len(response) == 1 + assert response[0] == ["1", "2023-06-01 00:00:00", "lili"] - response = TSV.toMat(node.query("SELECT k, m, n FROM test_simple_insert WHERE m='2023-06-01 00:00:00' FORMAT TSV")) - assert (len(response) == 1) - assert (response[0] == ['1', '2023-06-01 00:00:00', 'lili']) + response = TSV.toMat( + node.query( + "SELECT k, m, n FROM test_simple_insert WHERE m='2023-06-01 00:00:00' FORMAT TSV" + ) + ) + assert len(response) == 1 + assert response[0] == ["1", "2023-06-01 00:00:00", "lili"] - response = TSV.toMat(node.query("SELECT k, m, n FROM test_simple_insert WHERE n='lili' FORMAT TSV")) - assert (len(response) == 1) - assert (response[0] == ['1', '2023-06-01 00:00:00', 'lili']) + response = TSV.toMat( + node.query("SELECT k, m, n FROM test_simple_insert WHERE n='lili' FORMAT TSV") + ) + assert len(response) == 1 + assert response[0] == ["1", "2023-06-01 00:00:00", "lili"] def test_update(started_cluster): @@ -246,7 +263,7 @@ def test_update(started_cluster): address = get_address_for_ch() # clean all client.flushall() - drop_table('test_update') + drop_table("test_update") node.query( f""" @@ -271,11 +288,13 @@ def test_update(started_cluster): """ ) - print("update response: ", response) + print("update response: ", response) - response = TSV.toMat(node.query("SELECT k, m, n FROM test_update WHERE k=1 FORMAT TSV")) - assert (len(response) == 1) - assert (response[0] == ['1', '2023-06-03 00:00:00', 'lili']) + response = TSV.toMat( + node.query("SELECT k, m, n FROM test_update WHERE k=1 FORMAT TSV") + ) + assert len(response) == 1 + assert response[0] == ["1", "2023-06-03 00:00:00", "lili"] # can not update key with pytest.raises(QueryRuntimeException): @@ -292,7 +311,7 @@ def test_delete(started_cluster): # clean all client.flushall() - drop_table('test_delete') + drop_table("test_delete") node.query( f""" @@ -317,11 +336,11 @@ def test_delete(started_cluster): """ ) - print("delete response: ", response) + print("delete response: ", response) response = TSV.toMat(node.query("SELECT k, m, n FROM test_delete FORMAT TSV")) - assert (len(response) == 1) - assert (response[0] == ['2', '2023-06-02 00:00:00', 'lucy']) + assert len(response) == 1 + assert response[0] == ["2", "2023-06-02 00:00:00", "lucy"] response = node.query( """ @@ -330,7 +349,7 @@ def test_delete(started_cluster): ) response = TSV.toMat(node.query("SELECT k, m, n FROM test_delete FORMAT TSV")) - assert (len(response) == 0) + assert len(response) == 0 def test_truncate(started_cluster): @@ -338,7 +357,7 @@ def test_truncate(started_cluster): address = get_address_for_ch() # clean all client.flushall() - drop_table('test_truncate') + drop_table("test_truncate") node.query( f""" @@ -363,9 +382,9 @@ def test_truncate(started_cluster): """ ) - print("truncate table response: ", response) + print("truncate table response: ", response) response = TSV.toMat(node.query("SELECT COUNT(*) FROM test_truncate FORMAT TSV")) - assert (len(response) == 1) - assert (response[0] == ['0']) + assert len(response) == 1 + assert esponse[0] == ["0"] diff --git a/tests/integration/test_table_function_redis/test.py b/tests/integration/test_table_function_redis/test.py index 111276ec6dc..772e6d28141 100644 --- a/tests/integration/test_table_function_redis/test.py +++ b/tests/integration/test_table_function_redis/test.py @@ -25,13 +25,13 @@ def started_cluster(): def get_redis_connection(db_id=0): client = redis.Redis( - host='localhost', port=cluster.redis_port, password="clickhouse", db=db_id + host="localhost", port=cluster.redis_port, password="clickhouse", db=db_id ) return client def get_address_for_ch(): - return cluster.redis_host + ':6379' + return cluster.redis_host + ":6379" # see SerializationString.serializeBinary @@ -47,21 +47,21 @@ def serialize_binary_for_string(x): byte = length & 0x7F if length > 0x7F: byte |= 0x80 - buf += (bytes([byte])) + buf += bytes([byte]) length >>= 7 if not length: break # write data - buf += x.encode('utf-8') + buf += x.encode("utf-8") return bytes(buf) # see SerializationNumber.serializeBinary def serialize_binary_for_uint32(x): buf = bytearray() - packed_num = struct.pack('I', x) + packed_num = struct.pack("I", x) buf += packed_num - if sys.byteorder != 'little': + if sys.byteorder != "little": buf.reverse() return bytes(buf) @@ -92,8 +92,8 @@ def test_simple_select(started_cluster): FORMAT TSV """)) - assert (len(response) == 1) - assert (response[0] == ['0', '0']) + assert len(response) == 1 + assert response[0] == ["0", "0"] response = TSV.toMat(node.query( f""" @@ -106,8 +106,8 @@ def test_simple_select(started_cluster): FORMAT TSV """)) - assert (len(response) == 100) - assert (response[0] == ['0', '0']) + assert len(response) == 100 + assert response[0] == ["0", "0"] def test_create_table(started_cluster): @@ -153,7 +153,7 @@ def test_data_type(started_cluster): # string client.flushall() - value = serialize_binary_for_string('0') + value = serialize_binary_for_string("0") client.set(value, value) response = TSV.toMat(node.query( @@ -167,8 +167,8 @@ def test_data_type(started_cluster): FORMAT TSV """)) - assert (len(response) == 1) - assert (response[0] == ['0', '0']) + assert len(response) == 1 + assert response[0] == ["0", "0"] # number client.flushall() @@ -186,8 +186,8 @@ def test_data_type(started_cluster): FORMAT TSV """)) - assert (len(response) == 1) - assert (response[0] == ['0', '0']) + assert len(response) == 1 + assert response[0] == ["0", "0"] # datetime client.flushall() @@ -208,5 +208,5 @@ def test_data_type(started_cluster): FORMAT TSV """)) - assert (len(response) == 1) - assert (response[0] == ['2023-06-01 00:00:00', '2023-06-01 00:00:00']) + assert len(response) == 1 + assert response[0] == ["2023-06-01 00:00:00", "2023-06-01 00:00:00"] From 7cc37ab4b877dca8dcb76e328e39c8a62995cd0b Mon Sep 17 00:00:00 2001 From: JackyWoo Date: Wed, 31 May 2023 19:35:17 +0800 Subject: [PATCH 0281/1072] add redis table engine/function docs --- docs/en/engines/table-engines/index.md | 1 + .../table-engines/integrations/index.md | 1 + .../table-engines/integrations/redis.md | 111 ++++++++++++++++++ .../en/sql-reference/table-functions/redis.md | 67 +++++++++++ 4 files changed, 180 insertions(+) create mode 100644 docs/en/engines/table-engines/integrations/redis.md create mode 100644 docs/en/sql-reference/table-functions/redis.md diff --git a/docs/en/engines/table-engines/index.md b/docs/en/engines/table-engines/index.md index d7c582164de..bd704d0e87e 100644 --- a/docs/en/engines/table-engines/index.md +++ b/docs/en/engines/table-engines/index.md @@ -53,6 +53,7 @@ Engines in the family: - [JDBC](../../engines/table-engines/integrations/jdbc.md) - [MySQL](../../engines/table-engines/integrations/mysql.md) - [MongoDB](../../engines/table-engines/integrations/mongodb.md) +- [Redis](../../engines/table-engines/integrations/redis.md) - [HDFS](../../engines/table-engines/integrations/hdfs.md) - [S3](../../engines/table-engines/integrations/s3.md) - [Kafka](../../engines/table-engines/integrations/kafka.md) diff --git a/docs/en/engines/table-engines/integrations/index.md b/docs/en/engines/table-engines/integrations/index.md index b321a644d32..93691a8adad 100644 --- a/docs/en/engines/table-engines/integrations/index.md +++ b/docs/en/engines/table-engines/integrations/index.md @@ -14,6 +14,7 @@ List of supported integrations: - [JDBC](../../../engines/table-engines/integrations/jdbc.md) - [MySQL](../../../engines/table-engines/integrations/mysql.md) - [MongoDB](../../../engines/table-engines/integrations/mongodb.md) +- [Redis](../../../engines/table-engines/integrations/redis.md) - [HDFS](../../../engines/table-engines/integrations/hdfs.md) - [S3](../../../engines/table-engines/integrations/s3.md) - [Kafka](../../../engines/table-engines/integrations/kafka.md) diff --git a/docs/en/engines/table-engines/integrations/redis.md b/docs/en/engines/table-engines/integrations/redis.md new file mode 100644 index 00000000000..8e5a974c459 --- /dev/null +++ b/docs/en/engines/table-engines/integrations/redis.md @@ -0,0 +1,111 @@ +--- +slug: /en/sql-reference/table-functions/redis +sidebar_position: 43 +sidebar_label: Redis +--- + +# Redis + +This engine allows integrating ClickHouse with [Redis](https://redis.io/). + +## Creating a Table {#creating-a-table} + +``` sql +CREATE TABLE [IF NOT EXISTS] [db.]table_name +( + name1 [type1], + name2 [type2], + ... +) ENGINE = Redis(host:port[, db_index[, password[, pool_size]]]) PRIMARY KEY(primary_key_name); +``` + +**Engine Parameters** + +- `host:port` — Redis server address, you can ignore port and default Redis port 6379 will be used. + +- `db_index` — Redis db index range from 0 to 15, default is 0. + +- `password` — User password, default is blank string. + +- `pool_size` — Redis max connection pool size, default is 16. + +- `primary_key_name` - any column name in the column list. + +- `primary` must be specified, it supports only one column in the primary key. The primary key will be serialized in binary as a Redis key. + +- columns other than the primary key will be serialized in binary as Redis value in corresponding order. + +- queries with key equals or in filtering will be optimized to multi keys lookup from Redis. If queries without filtering key full table scan will happen which is a heavy operation. + +## Usage Example {#usage-example} + +Create a table in ClickHouse which allows to read data from Redis: + +``` sql +CREATE TABLE redis_table +( + `k` String, + `m` String, + `n` UInt32 +) +ENGINE = Redis('redis1:6379') PRIMARY KEY(k); +``` + +Insert: + +```sql +INSERT INTO redis_table Values('1', 1, '1', 1.0), ('2', 2, '2', 2.0); +``` + +Query: + +``` sql +SELECT COUNT(*) FROM redis_table; +``` + +``` text +┌─count()─┐ +│ 2 │ +└─────────┘ +``` + +``` sql +SELECT * FROM redis_table WHERE key='1'; +``` + +```text +┌─key─┬─v1─┬─v2─┬─v3─┐ +│ 1 │ 1 │ 1 │ 1 │ +└─────┴────┴────┴────┘ +``` + +``` sql +SELECT * FROM redis_table WHERE v1=2; +``` + +```text +┌─key─┬─v1─┬─v2─┬─v3─┐ +│ 2 │ 2 │ 2 │ 2 │ +└─────┴────┴────┴────┘ +``` + +Update: + +Note that the primary key cannot be updated. + +```sql +ALTER TABLE redis_table UPDATE v1=2 WHERE key='1'; +``` + +Delete: + +```sql +ALTER TABLE redis_table DELETE WHERE key='1'; +``` + +Truncate: + +Redis engine will flush db asynchronously. +```sql +TRUNCATE TABLE redis_table; +``` diff --git a/docs/en/sql-reference/table-functions/redis.md b/docs/en/sql-reference/table-functions/redis.md new file mode 100644 index 00000000000..5b32f118fb8 --- /dev/null +++ b/docs/en/sql-reference/table-functions/redis.md @@ -0,0 +1,67 @@ +--- +slug: /en/sql-reference/table-functions/redis +sidebar_position: 10 +sidebar_label: Redis +--- + +# Redis + +This table function allows integrating ClickHouse with [Redis](https://redis.io/). + +**Syntax** + +```sql +redis(host:port, key, structure[, db_index[, password[, pool_size]]]) +``` + +**Arguments** + +- `host:port` — Redis server address, you can ignore port and default Redis port 6379 will be used. + +- `key` — any column name in the column list. + +- `structure` — The schema for the ClickHouse table returned from this function. + +- `db_index` — Redis db index range from 0 to 15, default is 0. + +- `password` — User password, default is blank string. + +- `pool_size` — Redis max connection pool size, default is 16. + +- `primary` must be specified, it supports only one column in the primary key. The primary key will be serialized in binary as a Redis key. + +- columns other than the primary key will be serialized in binary as Redis value in corresponding order. + +- queries with key equals or in filtering will be optimized to multi keys lookup from Redis. If queries without filtering key full table scan will happen which is a heavy operation. + + +**Returned Value** + +A table object with key as Redis key, other columns packaged together as Redis value. + +## Usage Example {#usage-example} + +Create a table in ClickHouse which allows to read data from Redis: + +``` sql +CREATE TABLE redis_table +( + `k` String, + `m` String, + `n` UInt32 +) +ENGINE = Redis('redis1:6379') PRIMARY KEY(k); +``` + +```sql +SELECT * FROM redis( + 'redis1:6379', + 'key', + 'key String, v1 String, v2 UInt32' +) +``` + +**See Also** + +- [The `Redis` table engine](/docs/en/engines/table-engines/integrations/redis.md) +- [Using redis as a dictionary source](/docs/en/sql-reference/dictionaries/index.md#redis) From 4302ba44d46107a7e72b5f31fa948b52b0c9c40b Mon Sep 17 00:00:00 2001 From: JackyWoo Date: Wed, 31 May 2023 20:32:34 +0800 Subject: [PATCH 0282/1072] fix code style --- tests/integration/test_storage_redis/test.py | 6 +- .../test_table_function_redis/test.py | 121 ++++++++++-------- 2 files changed, 71 insertions(+), 56 deletions(-) diff --git a/tests/integration/test_storage_redis/test.py b/tests/integration/test_storage_redis/test.py index 66d34ebc711..2fd97b9bebd 100644 --- a/tests/integration/test_storage_redis/test.py +++ b/tests/integration/test_storage_redis/test.py @@ -147,7 +147,6 @@ def test_select_int(started_cluster): assert response[0] == ["0", "0"] - def test_create_table(started_cluster): address = get_address_for_ch() @@ -215,7 +214,7 @@ def test_simple_insert(started_cluster): # clean all client.flushall() - drop_table('test_simple_insert') + drop_table("test_simple_insert") node.query( f""" @@ -386,5 +385,4 @@ def test_truncate(started_cluster): response = TSV.toMat(node.query("SELECT COUNT(*) FROM test_truncate FORMAT TSV")) assert len(response) == 1 - assert esponse[0] == ["0"] - + assert response[0] == ["0"] diff --git a/tests/integration/test_table_function_redis/test.py b/tests/integration/test_table_function_redis/test.py index 772e6d28141..8e9dd66d9e5 100644 --- a/tests/integration/test_table_function_redis/test.py +++ b/tests/integration/test_table_function_redis/test.py @@ -81,30 +81,36 @@ def test_simple_select(started_cluster): client.mset(data) client.close() - response = TSV.toMat(node.query( - f""" - SELECT - key, value - FROM - redis('{address}', 'key', 'key String, value String', 0, 'clickhouse', 10) - WHERE - key='0' - FORMAT TSV - """)) + response = TSV.toMat( + node.query( + f""" + SELECT + key, value + FROM + redis('{address}', 'key', 'key String, value String', 0, 'clickhouse', 10) + WHERE + key='0' + FORMAT TSV + """ + ) + ) assert len(response) == 1 assert response[0] == ["0", "0"] - response = TSV.toMat(node.query( - f""" - SELECT - * - FROM - redis('{address}', 'key', 'key String, value String', 0, 'clickhouse', 10) - ORDER BY - key - FORMAT TSV - """)) + response = TSV.toMat( + node.query( + f""" + SELECT + * + FROM + redis('{address}', 'key', 'key String, value String', 0, 'clickhouse', 10) + ORDER BY + key + FORMAT TSV + """ + ) + ) assert len(response) == 100 assert response[0] == ["0", "0"] @@ -124,7 +130,8 @@ def test_create_table(started_cluster): * FROM redis('{address}', 'k', 'k String, v UInt32', 0, 'clickhouse', 10) - """) + """ + ) # illegal data type with pytest.raises(QueryRuntimeException): @@ -134,7 +141,8 @@ def test_create_table(started_cluster): * FROM redis('{address}', 'k', 'k not_exist_type, v String', 0, 'clickhouse', 10) - """) + """ + ) # illegal key with pytest.raises(QueryRuntimeException): @@ -156,16 +164,19 @@ def test_data_type(started_cluster): value = serialize_binary_for_string("0") client.set(value, value) - response = TSV.toMat(node.query( - f""" - SELECT - * - FROM - redis('{address}', 'k', 'k String, v String', 0, 'clickhouse', 10) - WHERE - k='0' - FORMAT TSV - """)) + response = TSV.toMat( + node.query( + f""" + SELECT + * + FROM + redis('{address}', 'k', 'k String, v String', 0, 'clickhouse', 10) + WHERE + k='0' + FORMAT TSV + """ + ) + ) assert len(response) == 1 assert response[0] == ["0", "0"] @@ -175,16 +186,19 @@ def test_data_type(started_cluster): value = serialize_binary_for_uint32(0) client.set(value, value) - response = TSV.toMat(node.query( - f""" - SELECT - * - FROM - redis('{address}', 'k', 'k UInt32, v UInt32', 0, 'clickhouse', 10) - WHERE - k=0 - FORMAT TSV - """)) + response = TSV.toMat( + node.query( + f""" + SELECT + * + FROM + redis('{address}', 'k', 'k UInt32, v UInt32', 0, 'clickhouse', 10) + WHERE + k=0 + FORMAT TSV + """ + ) + ) assert len(response) == 1 assert response[0] == ["0", "0"] @@ -197,16 +211,19 @@ def test_data_type(started_cluster): value = serialize_binary_for_uint32(int(seconds_since_epoch)) client.set(value, value) - response = TSV.toMat(node.query( - f""" - SELECT - * - FROM - redis('{address}', 'k', 'k DateTime, v DateTime', 0, 'clickhouse', 10) - WHERE - k='2023-06-01 00:00:00' - FORMAT TSV - """)) + response = TSV.toMat( + node.query( + f""" + SELECT + * + FROM + redis('{address}', 'k', 'k DateTime, v DateTime', 0, 'clickhouse', 10) + WHERE + k='2023-06-01 00:00:00' + FORMAT TSV + """ + ) + ) assert len(response) == 1 assert response[0] == ["2023-06-01 00:00:00", "2023-06-01 00:00:00"] From 119c2200a95563e6814042618c4f3365bf7386fb Mon Sep 17 00:00:00 2001 From: JackyWoo Date: Wed, 31 May 2023 22:23:09 +0800 Subject: [PATCH 0283/1072] fix merged error --- src/Storages/StorageRedis.cpp | 16 ++++++++++------ .../test_table_function_redis/test.py | 3 ++- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/src/Storages/StorageRedis.cpp b/src/Storages/StorageRedis.cpp index b17528c7eae..973a77a5f98 100644 --- a/src/Storages/StorageRedis.cpp +++ b/src/Storages/StorageRedis.cpp @@ -520,14 +520,16 @@ void StorageRedis::mutate(const MutationCommands & commands, ContextPtr context_ if (commands.front().type == MutationCommand::Type::DELETE) { + MutationsInterpreter::Settings settings(true); + settings.return_all_columns = true; + settings.return_mutated_rows = true; + auto interpreter = std::make_unique( storage_ptr, metadata_snapshot, commands, context_, - /*can_execute_*/ true, - /*return_all_columns_*/ true, - /*return_mutated_rows*/ true); + settings); auto pipeline = QueryPipelineBuilder::getPipeline(interpreter->execute()); PullingPipelineExecutor executor(pipeline); @@ -561,14 +563,16 @@ void StorageRedis::mutate(const MutationCommands & commands, ContextPtr context_ if (commands.front().column_to_update_expression.contains(primary_key)) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Primary key cannot be updated (cannot update column {})", primary_key); + MutationsInterpreter::Settings settings(true); + settings.return_all_columns = true; + settings.return_mutated_rows = true; + auto interpreter = std::make_unique( storage_ptr, metadata_snapshot, commands, context_, - /*can_execute_*/ true, - /*return_all_columns*/ true, - /*return_mutated_rows*/ true); + settings); auto pipeline = QueryPipelineBuilder::getPipeline(interpreter->execute()); PullingPipelineExecutor executor(pipeline); diff --git a/tests/integration/test_table_function_redis/test.py b/tests/integration/test_table_function_redis/test.py index 8e9dd66d9e5..f4bcebe2f90 100644 --- a/tests/integration/test_table_function_redis/test.py +++ b/tests/integration/test_table_function_redis/test.py @@ -152,7 +152,8 @@ def test_create_table(started_cluster): * FROM redis('{address}', 'not_exist_key', 'k not_exist_type, v String', 0, 'clickhouse', 10) - """) + """ + ) def test_data_type(started_cluster): From e6d1b3c35141d7f20e17fd1b823df4c86b7f6c29 Mon Sep 17 00:00:00 2001 From: JackyWoo Date: Wed, 31 May 2023 23:55:41 +0800 Subject: [PATCH 0284/1072] little fix --- src/Dictionaries/RedisDictionarySource.cpp | 2 +- src/Dictionaries/RedisSource.cpp | 2 +- src/Storages/StorageRedis.cpp | 113 ++++++++------------- 3 files changed, 45 insertions(+), 72 deletions(-) diff --git a/src/Dictionaries/RedisDictionarySource.cpp b/src/Dictionaries/RedisDictionarySource.cpp index c52c3425d1b..1736cdff306 100644 --- a/src/Dictionaries/RedisDictionarySource.cpp +++ b/src/Dictionaries/RedisDictionarySource.cpp @@ -41,7 +41,7 @@ namespace DB .host = host, .port = static_cast(port), .db_index = config.getUInt(redis_config_prefix + ".db_index", DEFAULT_REDIS_DB_INDEX), - .password = config.getString(redis_config_prefix + ".password", ""), + .password = config.getString(redis_config_prefix + ".password", DEFAULT_REDIS_PASSWORD), .storage_type = parseStorageType(config.getString(redis_config_prefix + ".storage_type", "")), .pool_size = config.getUInt(redis_config_prefix + ".pool_size", DEFAULT_REDIS_POOL_SIZE), }; diff --git a/src/Dictionaries/RedisSource.cpp b/src/Dictionaries/RedisSource.cpp index 5d8a475cad4..719c0278707 100644 --- a/src/Dictionaries/RedisSource.cpp +++ b/src/Dictionaries/RedisSource.cpp @@ -21,7 +21,7 @@ namespace DB } - RedisSource::RedisSource( + RedisSource::RedisSource( RedisConnectionPtr connection_, const RedisArray & keys_, const RedisStorageType & storage_type_, diff --git a/src/Storages/StorageRedis.cpp b/src/Storages/StorageRedis.cpp index 973a77a5f98..64e5c1ad5f7 100644 --- a/src/Storages/StorageRedis.cpp +++ b/src/Storages/StorageRedis.cpp @@ -1,24 +1,25 @@ -#include -#include -#include -#include - #include +#include +#include +#include +#include #include +#include #include #include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include #include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include namespace DB { @@ -51,11 +52,7 @@ public: { } - RedisDataSource( - StorageRedis & storage_, - const Block & header, - const size_t max_block_size_, - const String & pattern_ = "*") + RedisDataSource(StorageRedis & storage_, const Block & header, const size_t max_block_size_, const String & pattern_ = "*") : ISource(header) , storage(storage_) , primary_key_pos(getPrimaryKeyPos(header, storage.getPrimaryKey())) @@ -85,6 +82,7 @@ public: const auto & key_column_type = sample_block.getByName(storage.getPrimaryKey().at(0)).type; auto raw_keys = serializeKeysToRawString(it, end, key_column_type, max_block_size); + return storage.getBySerializedKeys(raw_keys, nullptr); } @@ -109,11 +107,11 @@ public: MutableColumns columns = sample_block.cloneEmptyColumns(); RedisArray values = storage.multiGet(scan_keys); - for (size_t i = 0; i(i).isNull(); i++) + for (size_t i = 0; i < scan_keys.size() && !values.get(i).isNull(); i++) { fillColumns(scan_keys.get(i).value(), - values.get(i).value(), - primary_key_pos, sample_block, columns + values.get(i).value(), + primary_key_pos, sample_block, columns ); } @@ -143,9 +141,7 @@ private: class RedisSink : public SinkToStorage { public: - RedisSink( - StorageRedis & storage_, - const StorageMetadataPtr & metadata_snapshot_); + RedisSink(StorageRedis & storage_, const StorageMetadataPtr & metadata_snapshot_); void consume(Chunk chunk) override; String getName() const override { return "RedisSink"; } @@ -156,9 +152,7 @@ private: size_t primary_key_pos = 0; }; -RedisSink::RedisSink( - StorageRedis & storage_, - const StorageMetadataPtr & metadata_snapshot_) +RedisSink::RedisSink(StorageRedis & storage_, const StorageMetadataPtr & metadata_snapshot_) : SinkToStorage(metadata_snapshot_->getSampleBlock()) , storage(storage_) , metadata_snapshot(metadata_snapshot_) @@ -194,6 +188,7 @@ void RedisSink::consume(Chunk chunk) data.add(wb_key.str()); data.add(wb_value.str()); } + storage.multiSet(data); } @@ -258,8 +253,8 @@ Pipe StorageRedis::read( size_t begin = num_keys * thread_idx / num_threads; size_t end = num_keys * (thread_idx + 1) / num_threads; - pipes.emplace_back(std::make_shared( - *this, header, keys, keys->begin() + begin, keys->begin() + end, max_block_size)); + pipes.emplace_back( + std::make_shared(*this, header, keys, keys->begin() + begin, keys->begin() + end, max_block_size)); } return Pipe::unitePipes(std::move(pipes)); } @@ -272,7 +267,7 @@ namespace { RedisConfiguration configuration; - if (engine_args.size() < 1) + if (engine_args.empty()) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Bad arguments count when creating Redis table engine"); if (auto named_collection = tryGetNamedCollectionWithOverrides(engine_args, context)) @@ -336,18 +331,11 @@ namespace throw Exception(ErrorCodes::BAD_ARGUMENTS, "StorageRedis must require one column in primary key"); } - return std::make_shared( - args.table_id, - configuration, - args.getContext(), - metadata, - primary_key_names[0]); + return std::make_shared(args.table_id, configuration, args.getContext(), metadata, primary_key_names[0]); } } -Chunk StorageRedis::getBySerializedKeys( - const std::vector & keys, - PaddedPODArray * null_map) const +Chunk StorageRedis::getBySerializedKeys(const std::vector & keys, PaddedPODArray * null_map) const { RedisArray redis_keys; for (const auto & key : keys) @@ -355,9 +343,7 @@ Chunk StorageRedis::getBySerializedKeys( return getBySerializedKeys(redis_keys, null_map); } -Chunk StorageRedis::getBySerializedKeys( - const RedisArray & keys, - PaddedPODArray * null_map) const +Chunk StorageRedis::getBySerializedKeys(const RedisArray & keys, PaddedPODArray * null_map) const { Block sample_block = getInMemoryMetadataPtr()->getSampleBlock(); @@ -379,8 +365,8 @@ Chunk StorageRedis::getBySerializedKeys( if (!values.get(i).isNull()) { fillColumns(keys.get(i).value(), - values.get(i).value(), - primary_key_pos, sample_block, columns + values.get(i).value(), + primary_key_pos, sample_block, columns ); } else /// key not found @@ -433,8 +419,7 @@ void StorageRedis::multiSet(const RedisArray & data) const auto ret = connection->client->execute(cmd_mget); if (ret != "OK") - throw Exception(ErrorCodes::INTERNAL_REDIS_ERROR, "Fail to write to redis table {}, for {}", - table_id.getFullNameNotQuoted(), ret); + throw Exception(ErrorCodes::INTERNAL_REDIS_ERROR, "Fail to write to redis table {}, for {}", table_id.getFullNameNotQuoted(), ret); } RedisInteger StorageRedis::multiDelete(const RedisArray & keys) const @@ -447,16 +432,17 @@ RedisInteger StorageRedis::multiDelete(const RedisArray & keys) const auto ret = connection->client->execute(cmd); if (ret != static_cast(keys.size())) - LOG_DEBUG(log, "Try to delete {} rows but actually deleted {} rows from redis table {}.", - keys.size(), ret, table_id.getFullNameNotQuoted()); + LOG_DEBUG( + log, + "Try to delete {} rows but actually deleted {} rows from redis table {}.", + keys.size(), + ret, + table_id.getFullNameNotQuoted()); return ret; } -Chunk StorageRedis::getByKeys( - const ColumnsWithTypeAndName & keys, - PaddedPODArray & null_map, - const Names &) const +Chunk StorageRedis::getByKeys(const ColumnsWithTypeAndName & keys, PaddedPODArray & null_map, const Names &) const { if (keys.size() != 1) throw Exception(ErrorCodes::LOGICAL_ERROR, "StorageRedis supports only one key, got: {}", keys.size()); @@ -474,10 +460,7 @@ Block StorageRedis::getSampleBlock(const Names &) const return getInMemoryMetadataPtr()->getSampleBlock(); } -SinkToStoragePtr StorageRedis::write( - const ASTPtr & /*query*/, - const StorageMetadataPtr & metadata_snapshot, - ContextPtr /*context*/) +SinkToStoragePtr StorageRedis::write(const ASTPtr & /*query*/, const StorageMetadataPtr & metadata_snapshot, ContextPtr /*context*/) { return std::make_shared(*this, metadata_snapshot); } @@ -524,12 +507,7 @@ void StorageRedis::mutate(const MutationCommands & commands, ContextPtr context_ settings.return_all_columns = true; settings.return_mutated_rows = true; - auto interpreter = std::make_unique( - storage_ptr, - metadata_snapshot, - commands, - context_, - settings); + auto interpreter = std::make_unique(storage_ptr, metadata_snapshot, commands, context_, settings); auto pipeline = QueryPipelineBuilder::getPipeline(interpreter->execute()); PullingPipelineExecutor executor(pipeline); @@ -567,12 +545,7 @@ void StorageRedis::mutate(const MutationCommands & commands, ContextPtr context_ settings.return_all_columns = true; settings.return_mutated_rows = true; - auto interpreter = std::make_unique( - storage_ptr, - metadata_snapshot, - commands, - context_, - settings); + auto interpreter = std::make_unique(storage_ptr, metadata_snapshot, commands, context_, settings); auto pipeline = QueryPipelineBuilder::getPipeline(interpreter->execute()); PullingPipelineExecutor executor(pipeline); From dc6102392785f6368c13ea59a5c0f5273425567c Mon Sep 17 00:00:00 2001 From: JackyWoo Date: Thu, 1 Jun 2023 14:36:03 +0800 Subject: [PATCH 0285/1072] add sync mode to redis storage truncate --- docs/en/engines/table-engines/integrations/redis.md | 5 +++-- src/Storages/StorageRedis.cpp | 10 ++++++++-- .../02117_show_create_table_system.reference | 6 +++--- ...ll_new_table_functions_must_be_documented.reference | 2 +- 4 files changed, 15 insertions(+), 8 deletions(-) diff --git a/docs/en/engines/table-engines/integrations/redis.md b/docs/en/engines/table-engines/integrations/redis.md index 8e5a974c459..6cfc60c836c 100644 --- a/docs/en/engines/table-engines/integrations/redis.md +++ b/docs/en/engines/table-engines/integrations/redis.md @@ -105,7 +105,8 @@ ALTER TABLE redis_table DELETE WHERE key='1'; Truncate: -Redis engine will flush db asynchronously. +Flush Redis db asynchronously. Also `Truncate` support SYNC mode. + ```sql -TRUNCATE TABLE redis_table; +TRUNCATE TABLE redis_table SYNC; ``` diff --git a/src/Storages/StorageRedis.cpp b/src/Storages/StorageRedis.cpp index 64e5c1ad5f7..97f1dbce6da 100644 --- a/src/Storages/StorageRedis.cpp +++ b/src/Storages/StorageRedis.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -465,12 +466,17 @@ SinkToStoragePtr StorageRedis::write(const ASTPtr & /*query*/, const StorageMeta return std::make_shared(*this, metadata_snapshot); } -void StorageRedis::truncate(const ASTPtr &, const StorageMetadataPtr &, ContextPtr, TableExclusiveLockHolder &) +void StorageRedis::truncate(const ASTPtr & query, const StorageMetadataPtr &, ContextPtr, TableExclusiveLockHolder &) { auto connection = getRedisConnection(pool, configuration); + auto * truncate_query = query->as(); + assert(truncate_query != nullptr); + RedisCommand cmd_flush_db("FLUSHDB"); - cmd_flush_db.add("ASYNC"); + if (!truncate_query->sync) + cmd_flush_db.add("ASYNC"); + auto ret = connection->client->execute(cmd_flush_db); if (ret != "OK") diff --git a/tests/queries/0_stateless/02117_show_create_table_system.reference b/tests/queries/0_stateless/02117_show_create_table_system.reference index 724118f7bc1..10149bfc7bf 100644 --- a/tests/queries/0_stateless/02117_show_create_table_system.reference +++ b/tests/queries/0_stateless/02117_show_create_table_system.reference @@ -297,7 +297,7 @@ CREATE TABLE system.grants ( `user_name` Nullable(String), `role_name` Nullable(String), - `access_type` Enum16('SHOW DATABASES' = 0, 'SHOW TABLES' = 1, 'SHOW COLUMNS' = 2, 'SHOW DICTIONARIES' = 3, 'SHOW' = 4, 'SHOW FILESYSTEM CACHES' = 5, 'SELECT' = 6, 'INSERT' = 7, 'ALTER UPDATE' = 8, 'ALTER DELETE' = 9, 'ALTER ADD COLUMN' = 10, 'ALTER MODIFY COLUMN' = 11, 'ALTER DROP COLUMN' = 12, 'ALTER COMMENT COLUMN' = 13, 'ALTER CLEAR COLUMN' = 14, 'ALTER RENAME COLUMN' = 15, 'ALTER MATERIALIZE COLUMN' = 16, 'ALTER COLUMN' = 17, 'ALTER MODIFY COMMENT' = 18, 'ALTER ORDER BY' = 19, 'ALTER SAMPLE BY' = 20, 'ALTER ADD INDEX' = 21, 'ALTER DROP INDEX' = 22, 'ALTER MATERIALIZE INDEX' = 23, 'ALTER CLEAR INDEX' = 24, 'ALTER INDEX' = 25, 'ALTER ADD PROJECTION' = 26, 'ALTER DROP PROJECTION' = 27, 'ALTER MATERIALIZE PROJECTION' = 28, 'ALTER CLEAR PROJECTION' = 29, 'ALTER PROJECTION' = 30, 'ALTER ADD CONSTRAINT' = 31, 'ALTER DROP CONSTRAINT' = 32, 'ALTER CONSTRAINT' = 33, 'ALTER TTL' = 34, 'ALTER MATERIALIZE TTL' = 35, 'ALTER SETTINGS' = 36, 'ALTER MOVE PARTITION' = 37, 'ALTER FETCH PARTITION' = 38, 'ALTER FREEZE PARTITION' = 39, 'ALTER DATABASE SETTINGS' = 40, 'ALTER NAMED COLLECTION' = 41, 'ALTER TABLE' = 42, 'ALTER DATABASE' = 43, 'ALTER VIEW REFRESH' = 44, 'ALTER VIEW MODIFY QUERY' = 45, 'ALTER VIEW' = 46, 'ALTER' = 47, 'CREATE DATABASE' = 48, 'CREATE TABLE' = 49, 'CREATE VIEW' = 50, 'CREATE DICTIONARY' = 51, 'CREATE TEMPORARY TABLE' = 52, 'CREATE ARBITRARY TEMPORARY TABLE' = 53, 'CREATE FUNCTION' = 54, 'CREATE NAMED COLLECTION' = 55, 'CREATE' = 56, 'DROP DATABASE' = 57, 'DROP TABLE' = 58, 'DROP VIEW' = 59, 'DROP DICTIONARY' = 60, 'DROP FUNCTION' = 61, 'DROP NAMED COLLECTION' = 62, 'DROP' = 63, 'UNDROP TABLE' = 64, 'TRUNCATE' = 65, 'OPTIMIZE' = 66, 'BACKUP' = 67, 'KILL QUERY' = 68, 'KILL TRANSACTION' = 69, 'MOVE PARTITION BETWEEN SHARDS' = 70, 'CREATE USER' = 71, 'ALTER USER' = 72, 'DROP USER' = 73, 'CREATE ROLE' = 74, 'ALTER ROLE' = 75, 'DROP ROLE' = 76, 'ROLE ADMIN' = 77, 'CREATE ROW POLICY' = 78, 'ALTER ROW POLICY' = 79, 'DROP ROW POLICY' = 80, 'CREATE QUOTA' = 81, 'ALTER QUOTA' = 82, 'DROP QUOTA' = 83, 'CREATE SETTINGS PROFILE' = 84, 'ALTER SETTINGS PROFILE' = 85, 'DROP SETTINGS PROFILE' = 86, 'SHOW USERS' = 87, 'SHOW ROLES' = 88, 'SHOW ROW POLICIES' = 89, 'SHOW QUOTAS' = 90, 'SHOW SETTINGS PROFILES' = 91, 'SHOW ACCESS' = 92, 'ACCESS MANAGEMENT' = 93, 'SHOW NAMED COLLECTIONS' = 94, 'SHOW NAMED COLLECTIONS SECRETS' = 95, 'NAMED COLLECTION CONTROL' = 96, 'SYSTEM SHUTDOWN' = 97, 'SYSTEM DROP DNS CACHE' = 98, 'SYSTEM DROP MARK CACHE' = 99, 'SYSTEM DROP UNCOMPRESSED CACHE' = 100, 'SYSTEM DROP MMAP CACHE' = 101, 'SYSTEM DROP QUERY CACHE' = 102, 'SYSTEM DROP COMPILED EXPRESSION CACHE' = 103, 'SYSTEM DROP FILESYSTEM CACHE' = 104, 'SYSTEM DROP SCHEMA CACHE' = 105, 'SYSTEM DROP S3 CLIENT CACHE' = 106, 'SYSTEM DROP CACHE' = 107, 'SYSTEM RELOAD CONFIG' = 108, 'SYSTEM RELOAD USERS' = 109, 'SYSTEM RELOAD SYMBOLS' = 110, 'SYSTEM RELOAD DICTIONARY' = 111, 'SYSTEM RELOAD MODEL' = 112, 'SYSTEM RELOAD FUNCTION' = 113, 'SYSTEM RELOAD EMBEDDED DICTIONARIES' = 114, 'SYSTEM RELOAD' = 115, 'SYSTEM RESTART DISK' = 116, 'SYSTEM MERGES' = 117, 'SYSTEM TTL MERGES' = 118, 'SYSTEM FETCHES' = 119, 'SYSTEM MOVES' = 120, 'SYSTEM DISTRIBUTED SENDS' = 121, 'SYSTEM REPLICATED SENDS' = 122, 'SYSTEM SENDS' = 123, 'SYSTEM REPLICATION QUEUES' = 124, 'SYSTEM DROP REPLICA' = 125, 'SYSTEM SYNC REPLICA' = 126, 'SYSTEM RESTART REPLICA' = 127, 'SYSTEM RESTORE REPLICA' = 128, 'SYSTEM WAIT LOADING PARTS' = 129, 'SYSTEM SYNC DATABASE REPLICA' = 130, 'SYSTEM SYNC TRANSACTION LOG' = 131, 'SYSTEM SYNC FILE CACHE' = 132, 'SYSTEM FLUSH DISTRIBUTED' = 133, 'SYSTEM FLUSH LOGS' = 134, 'SYSTEM FLUSH' = 135, 'SYSTEM THREAD FUZZER' = 136, 'SYSTEM UNFREEZE' = 137, 'SYSTEM FAILPOINT' = 138, 'SYSTEM' = 139, 'dictGet' = 140, 'displaySecretsInShowAndSelect' = 141, 'addressToLine' = 142, 'addressToLineWithInlines' = 143, 'addressToSymbol' = 144, 'demangle' = 145, 'INTROSPECTION' = 146, 'FILE' = 147, 'URL' = 148, 'REMOTE' = 149, 'MONGO' = 150, 'Redis' = 151, 'MEILISEARCH' = 152, 'MYSQL' = 153, 'POSTGRES' = 154, 'SQLITE' = 155, 'ODBC' = 156, 'JDBC' = 157, 'HDFS' = 158, 'S3' = 159, 'HIVE' = 160, 'SOURCES' = 161, 'CLUSTER' = 162, 'ALL' = 163, 'NONE' = 164), + `access_type` Enum16('SHOW DATABASES' = 0, 'SHOW TABLES' = 1, 'SHOW COLUMNS' = 2, 'SHOW DICTIONARIES' = 3, 'SHOW' = 4, 'SHOW FILESYSTEM CACHES' = 5, 'SELECT' = 6, 'INSERT' = 7, 'ALTER UPDATE' = 8, 'ALTER DELETE' = 9, 'ALTER ADD COLUMN' = 10, 'ALTER MODIFY COLUMN' = 11, 'ALTER DROP COLUMN' = 12, 'ALTER COMMENT COLUMN' = 13, 'ALTER CLEAR COLUMN' = 14, 'ALTER RENAME COLUMN' = 15, 'ALTER MATERIALIZE COLUMN' = 16, 'ALTER COLUMN' = 17, 'ALTER MODIFY COMMENT' = 18, 'ALTER ORDER BY' = 19, 'ALTER SAMPLE BY' = 20, 'ALTER ADD INDEX' = 21, 'ALTER DROP INDEX' = 22, 'ALTER MATERIALIZE INDEX' = 23, 'ALTER CLEAR INDEX' = 24, 'ALTER INDEX' = 25, 'ALTER ADD PROJECTION' = 26, 'ALTER DROP PROJECTION' = 27, 'ALTER MATERIALIZE PROJECTION' = 28, 'ALTER CLEAR PROJECTION' = 29, 'ALTER PROJECTION' = 30, 'ALTER ADD CONSTRAINT' = 31, 'ALTER DROP CONSTRAINT' = 32, 'ALTER CONSTRAINT' = 33, 'ALTER TTL' = 34, 'ALTER MATERIALIZE TTL' = 35, 'ALTER SETTINGS' = 36, 'ALTER MOVE PARTITION' = 37, 'ALTER FETCH PARTITION' = 38, 'ALTER FREEZE PARTITION' = 39, 'ALTER DATABASE SETTINGS' = 40, 'ALTER NAMED COLLECTION' = 41, 'ALTER TABLE' = 42, 'ALTER DATABASE' = 43, 'ALTER VIEW REFRESH' = 44, 'ALTER VIEW MODIFY QUERY' = 45, 'ALTER VIEW' = 46, 'ALTER' = 47, 'CREATE DATABASE' = 48, 'CREATE TABLE' = 49, 'CREATE VIEW' = 50, 'CREATE DICTIONARY' = 51, 'CREATE TEMPORARY TABLE' = 52, 'CREATE ARBITRARY TEMPORARY TABLE' = 53, 'CREATE FUNCTION' = 54, 'CREATE NAMED COLLECTION' = 55, 'CREATE' = 56, 'DROP DATABASE' = 57, 'DROP TABLE' = 58, 'DROP VIEW' = 59, 'DROP DICTIONARY' = 60, 'DROP FUNCTION' = 61, 'DROP NAMED COLLECTION' = 62, 'DROP' = 63, 'UNDROP TABLE' = 64, 'TRUNCATE' = 65, 'OPTIMIZE' = 66, 'BACKUP' = 67, 'KILL QUERY' = 68, 'KILL TRANSACTION' = 69, 'MOVE PARTITION BETWEEN SHARDS' = 70, 'CREATE USER' = 71, 'ALTER USER' = 72, 'DROP USER' = 73, 'CREATE ROLE' = 74, 'ALTER ROLE' = 75, 'DROP ROLE' = 76, 'ROLE ADMIN' = 77, 'CREATE ROW POLICY' = 78, 'ALTER ROW POLICY' = 79, 'DROP ROW POLICY' = 80, 'CREATE QUOTA' = 81, 'ALTER QUOTA' = 82, 'DROP QUOTA' = 83, 'CREATE SETTINGS PROFILE' = 84, 'ALTER SETTINGS PROFILE' = 85, 'DROP SETTINGS PROFILE' = 86, 'SHOW USERS' = 87, 'SHOW ROLES' = 88, 'SHOW ROW POLICIES' = 89, 'SHOW QUOTAS' = 90, 'SHOW SETTINGS PROFILES' = 91, 'SHOW ACCESS' = 92, 'ACCESS MANAGEMENT' = 93, 'SHOW NAMED COLLECTIONS' = 94, 'SHOW NAMED COLLECTIONS SECRETS' = 95, 'NAMED COLLECTION CONTROL' = 96, 'SYSTEM SHUTDOWN' = 97, 'SYSTEM DROP DNS CACHE' = 98, 'SYSTEM DROP MARK CACHE' = 99, 'SYSTEM DROP UNCOMPRESSED CACHE' = 100, 'SYSTEM DROP MMAP CACHE' = 101, 'SYSTEM DROP QUERY CACHE' = 102, 'SYSTEM DROP COMPILED EXPRESSION CACHE' = 103, 'SYSTEM DROP FILESYSTEM CACHE' = 104, 'SYSTEM DROP SCHEMA CACHE' = 105, 'SYSTEM DROP S3 CLIENT CACHE' = 106, 'SYSTEM DROP CACHE' = 107, 'SYSTEM RELOAD CONFIG' = 108, 'SYSTEM RELOAD USERS' = 109, 'SYSTEM RELOAD SYMBOLS' = 110, 'SYSTEM RELOAD DICTIONARY' = 111, 'SYSTEM RELOAD MODEL' = 112, 'SYSTEM RELOAD FUNCTION' = 113, 'SYSTEM RELOAD EMBEDDED DICTIONARIES' = 114, 'SYSTEM RELOAD' = 115, 'SYSTEM RESTART DISK' = 116, 'SYSTEM MERGES' = 117, 'SYSTEM TTL MERGES' = 118, 'SYSTEM FETCHES' = 119, 'SYSTEM MOVES' = 120, 'SYSTEM DISTRIBUTED SENDS' = 121, 'SYSTEM REPLICATED SENDS' = 122, 'SYSTEM SENDS' = 123, 'SYSTEM REPLICATION QUEUES' = 124, 'SYSTEM DROP REPLICA' = 125, 'SYSTEM SYNC REPLICA' = 126, 'SYSTEM RESTART REPLICA' = 127, 'SYSTEM RESTORE REPLICA' = 128, 'SYSTEM WAIT LOADING PARTS' = 129, 'SYSTEM SYNC DATABASE REPLICA' = 130, 'SYSTEM SYNC TRANSACTION LOG' = 131, 'SYSTEM SYNC FILE CACHE' = 132, 'SYSTEM FLUSH DISTRIBUTED' = 133, 'SYSTEM FLUSH LOGS' = 134, 'SYSTEM FLUSH' = 135, 'SYSTEM THREAD FUZZER' = 136, 'SYSTEM UNFREEZE' = 137, 'SYSTEM FAILPOINT' = 138, 'SYSTEM' = 139, 'dictGet' = 140, 'displaySecretsInShowAndSelect' = 141, 'addressToLine' = 142, 'addressToLineWithInlines' = 143, 'addressToSymbol' = 144, 'demangle' = 145, 'INTROSPECTION' = 146, 'FILE' = 147, 'URL' = 148, 'REMOTE' = 149, 'MONGO' = 150, 'REDIS' = 151, 'MEILISEARCH' = 152, 'MYSQL' = 153, 'POSTGRES' = 154, 'SQLITE' = 155, 'ODBC' = 156, 'JDBC' = 157, 'HDFS' = 158, 'S3' = 159, 'HIVE' = 160, 'SOURCES' = 161, 'CLUSTER' = 162, 'ALL' = 163, 'NONE' = 164), `database` Nullable(String), `table` Nullable(String), `column` Nullable(String), @@ -581,10 +581,10 @@ ENGINE = SystemPartsColumns COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.privileges ( - `privilege` Enum16('SHOW DATABASES' = 0, 'SHOW TABLES' = 1, 'SHOW COLUMNS' = 2, 'SHOW DICTIONARIES' = 3, 'SHOW' = 4, 'SHOW FILESYSTEM CACHES' = 5, 'SELECT' = 6, 'INSERT' = 7, 'ALTER UPDATE' = 8, 'ALTER DELETE' = 9, 'ALTER ADD COLUMN' = 10, 'ALTER MODIFY COLUMN' = 11, 'ALTER DROP COLUMN' = 12, 'ALTER COMMENT COLUMN' = 13, 'ALTER CLEAR COLUMN' = 14, 'ALTER RENAME COLUMN' = 15, 'ALTER MATERIALIZE COLUMN' = 16, 'ALTER COLUMN' = 17, 'ALTER MODIFY COMMENT' = 18, 'ALTER ORDER BY' = 19, 'ALTER SAMPLE BY' = 20, 'ALTER ADD INDEX' = 21, 'ALTER DROP INDEX' = 22, 'ALTER MATERIALIZE INDEX' = 23, 'ALTER CLEAR INDEX' = 24, 'ALTER INDEX' = 25, 'ALTER ADD PROJECTION' = 26, 'ALTER DROP PROJECTION' = 27, 'ALTER MATERIALIZE PROJECTION' = 28, 'ALTER CLEAR PROJECTION' = 29, 'ALTER PROJECTION' = 30, 'ALTER ADD CONSTRAINT' = 31, 'ALTER DROP CONSTRAINT' = 32, 'ALTER CONSTRAINT' = 33, 'ALTER TTL' = 34, 'ALTER MATERIALIZE TTL' = 35, 'ALTER SETTINGS' = 36, 'ALTER MOVE PARTITION' = 37, 'ALTER FETCH PARTITION' = 38, 'ALTER FREEZE PARTITION' = 39, 'ALTER DATABASE SETTINGS' = 40, 'ALTER NAMED COLLECTION' = 41, 'ALTER TABLE' = 42, 'ALTER DATABASE' = 43, 'ALTER VIEW REFRESH' = 44, 'ALTER VIEW MODIFY QUERY' = 45, 'ALTER VIEW' = 46, 'ALTER' = 47, 'CREATE DATABASE' = 48, 'CREATE TABLE' = 49, 'CREATE VIEW' = 50, 'CREATE DICTIONARY' = 51, 'CREATE TEMPORARY TABLE' = 52, 'CREATE ARBITRARY TEMPORARY TABLE' = 53, 'CREATE FUNCTION' = 54, 'CREATE NAMED COLLECTION' = 55, 'CREATE' = 56, 'DROP DATABASE' = 57, 'DROP TABLE' = 58, 'DROP VIEW' = 59, 'DROP DICTIONARY' = 60, 'DROP FUNCTION' = 61, 'DROP NAMED COLLECTION' = 62, 'DROP' = 63, 'UNDROP TABLE' = 64, 'TRUNCATE' = 65, 'OPTIMIZE' = 66, 'BACKUP' = 67, 'KILL QUERY' = 68, 'KILL TRANSACTION' = 69, 'MOVE PARTITION BETWEEN SHARDS' = 70, 'CREATE USER' = 71, 'ALTER USER' = 72, 'DROP USER' = 73, 'CREATE ROLE' = 74, 'ALTER ROLE' = 75, 'DROP ROLE' = 76, 'ROLE ADMIN' = 77, 'CREATE ROW POLICY' = 78, 'ALTER ROW POLICY' = 79, 'DROP ROW POLICY' = 80, 'CREATE QUOTA' = 81, 'ALTER QUOTA' = 82, 'DROP QUOTA' = 83, 'CREATE SETTINGS PROFILE' = 84, 'ALTER SETTINGS PROFILE' = 85, 'DROP SETTINGS PROFILE' = 86, 'SHOW USERS' = 87, 'SHOW ROLES' = 88, 'SHOW ROW POLICIES' = 89, 'SHOW QUOTAS' = 90, 'SHOW SETTINGS PROFILES' = 91, 'SHOW ACCESS' = 92, 'ACCESS MANAGEMENT' = 93, 'SHOW NAMED COLLECTIONS' = 94, 'SHOW NAMED COLLECTIONS SECRETS' = 95, 'NAMED COLLECTION CONTROL' = 96, 'SYSTEM SHUTDOWN' = 97, 'SYSTEM DROP DNS CACHE' = 98, 'SYSTEM DROP MARK CACHE' = 99, 'SYSTEM DROP UNCOMPRESSED CACHE' = 100, 'SYSTEM DROP MMAP CACHE' = 101, 'SYSTEM DROP QUERY CACHE' = 102, 'SYSTEM DROP COMPILED EXPRESSION CACHE' = 103, 'SYSTEM DROP FILESYSTEM CACHE' = 104, 'SYSTEM DROP SCHEMA CACHE' = 105, 'SYSTEM DROP S3 CLIENT CACHE' = 106, 'SYSTEM DROP CACHE' = 107, 'SYSTEM RELOAD CONFIG' = 108, 'SYSTEM RELOAD USERS' = 109, 'SYSTEM RELOAD SYMBOLS' = 110, 'SYSTEM RELOAD DICTIONARY' = 111, 'SYSTEM RELOAD MODEL' = 112, 'SYSTEM RELOAD FUNCTION' = 113, 'SYSTEM RELOAD EMBEDDED DICTIONARIES' = 114, 'SYSTEM RELOAD' = 115, 'SYSTEM RESTART DISK' = 116, 'SYSTEM MERGES' = 117, 'SYSTEM TTL MERGES' = 118, 'SYSTEM FETCHES' = 119, 'SYSTEM MOVES' = 120, 'SYSTEM DISTRIBUTED SENDS' = 121, 'SYSTEM REPLICATED SENDS' = 122, 'SYSTEM SENDS' = 123, 'SYSTEM REPLICATION QUEUES' = 124, 'SYSTEM DROP REPLICA' = 125, 'SYSTEM SYNC REPLICA' = 126, 'SYSTEM RESTART REPLICA' = 127, 'SYSTEM RESTORE REPLICA' = 128, 'SYSTEM WAIT LOADING PARTS' = 129, 'SYSTEM SYNC DATABASE REPLICA' = 130, 'SYSTEM SYNC TRANSACTION LOG' = 131, 'SYSTEM SYNC FILE CACHE' = 132, 'SYSTEM FLUSH DISTRIBUTED' = 133, 'SYSTEM FLUSH LOGS' = 134, 'SYSTEM FLUSH' = 135, 'SYSTEM THREAD FUZZER' = 136, 'SYSTEM UNFREEZE' = 137, 'SYSTEM FAILPOINT' = 138, 'SYSTEM' = 139, 'dictGet' = 140, 'displaySecretsInShowAndSelect' = 141, 'addressToLine' = 142, 'addressToLineWithInlines' = 143, 'addressToSymbol' = 144, 'demangle' = 145, 'INTROSPECTION' = 146, 'FILE' = 147, 'URL' = 148, 'REMOTE' = 149, 'MONGO' = 150, 'MEILISEARCH' = 151, 'MYSQL' = 152, 'POSTGRES' = 153, 'SQLITE' = 154, 'ODBC' = 155, 'JDBC' = 156, 'HDFS' = 157, 'S3' = 158, 'HIVE' = 159, 'SOURCES' = 160, 'CLUSTER' = 161, 'ALL' = 162, 'NONE' = 163), + `privilege` Enum16('SHOW DATABASES' = 0, 'SHOW TABLES' = 1, 'SHOW COLUMNS' = 2, 'SHOW DICTIONARIES' = 3, 'SHOW' = 4, 'SHOW FILESYSTEM CACHES' = 5, 'SELECT' = 6, 'INSERT' = 7, 'ALTER UPDATE' = 8, 'ALTER DELETE' = 9, 'ALTER ADD COLUMN' = 10, 'ALTER MODIFY COLUMN' = 11, 'ALTER DROP COLUMN' = 12, 'ALTER COMMENT COLUMN' = 13, 'ALTER CLEAR COLUMN' = 14, 'ALTER RENAME COLUMN' = 15, 'ALTER MATERIALIZE COLUMN' = 16, 'ALTER COLUMN' = 17, 'ALTER MODIFY COMMENT' = 18, 'ALTER ORDER BY' = 19, 'ALTER SAMPLE BY' = 20, 'ALTER ADD INDEX' = 21, 'ALTER DROP INDEX' = 22, 'ALTER MATERIALIZE INDEX' = 23, 'ALTER CLEAR INDEX' = 24, 'ALTER INDEX' = 25, 'ALTER ADD PROJECTION' = 26, 'ALTER DROP PROJECTION' = 27, 'ALTER MATERIALIZE PROJECTION' = 28, 'ALTER CLEAR PROJECTION' = 29, 'ALTER PROJECTION' = 30, 'ALTER ADD CONSTRAINT' = 31, 'ALTER DROP CONSTRAINT' = 32, 'ALTER CONSTRAINT' = 33, 'ALTER TTL' = 34, 'ALTER MATERIALIZE TTL' = 35, 'ALTER SETTINGS' = 36, 'ALTER MOVE PARTITION' = 37, 'ALTER FETCH PARTITION' = 38, 'ALTER FREEZE PARTITION' = 39, 'ALTER DATABASE SETTINGS' = 40, 'ALTER NAMED COLLECTION' = 41, 'ALTER TABLE' = 42, 'ALTER DATABASE' = 43, 'ALTER VIEW REFRESH' = 44, 'ALTER VIEW MODIFY QUERY' = 45, 'ALTER VIEW' = 46, 'ALTER' = 47, 'CREATE DATABASE' = 48, 'CREATE TABLE' = 49, 'CREATE VIEW' = 50, 'CREATE DICTIONARY' = 51, 'CREATE TEMPORARY TABLE' = 52, 'CREATE ARBITRARY TEMPORARY TABLE' = 53, 'CREATE FUNCTION' = 54, 'CREATE NAMED COLLECTION' = 55, 'CREATE' = 56, 'DROP DATABASE' = 57, 'DROP TABLE' = 58, 'DROP VIEW' = 59, 'DROP DICTIONARY' = 60, 'DROP FUNCTION' = 61, 'DROP NAMED COLLECTION' = 62, 'DROP' = 63, 'UNDROP TABLE' = 64, 'TRUNCATE' = 65, 'OPTIMIZE' = 66, 'BACKUP' = 67, 'KILL QUERY' = 68, 'KILL TRANSACTION' = 69, 'MOVE PARTITION BETWEEN SHARDS' = 70, 'CREATE USER' = 71, 'ALTER USER' = 72, 'DROP USER' = 73, 'CREATE ROLE' = 74, 'ALTER ROLE' = 75, 'DROP ROLE' = 76, 'ROLE ADMIN' = 77, 'CREATE ROW POLICY' = 78, 'ALTER ROW POLICY' = 79, 'DROP ROW POLICY' = 80, 'CREATE QUOTA' = 81, 'ALTER QUOTA' = 82, 'DROP QUOTA' = 83, 'CREATE SETTINGS PROFILE' = 84, 'ALTER SETTINGS PROFILE' = 85, 'DROP SETTINGS PROFILE' = 86, 'SHOW USERS' = 87, 'SHOW ROLES' = 88, 'SHOW ROW POLICIES' = 89, 'SHOW QUOTAS' = 90, 'SHOW SETTINGS PROFILES' = 91, 'SHOW ACCESS' = 92, 'ACCESS MANAGEMENT' = 93, 'SHOW NAMED COLLECTIONS' = 94, 'SHOW NAMED COLLECTIONS SECRETS' = 95, 'NAMED COLLECTION CONTROL' = 96, 'SYSTEM SHUTDOWN' = 97, 'SYSTEM DROP DNS CACHE' = 98, 'SYSTEM DROP MARK CACHE' = 99, 'SYSTEM DROP UNCOMPRESSED CACHE' = 100, 'SYSTEM DROP MMAP CACHE' = 101, 'SYSTEM DROP QUERY CACHE' = 102, 'SYSTEM DROP COMPILED EXPRESSION CACHE' = 103, 'SYSTEM DROP FILESYSTEM CACHE' = 104, 'SYSTEM DROP SCHEMA CACHE' = 105, 'SYSTEM DROP S3 CLIENT CACHE' = 106, 'SYSTEM DROP CACHE' = 107, 'SYSTEM RELOAD CONFIG' = 108, 'SYSTEM RELOAD USERS' = 109, 'SYSTEM RELOAD SYMBOLS' = 110, 'SYSTEM RELOAD DICTIONARY' = 111, 'SYSTEM RELOAD MODEL' = 112, 'SYSTEM RELOAD FUNCTION' = 113, 'SYSTEM RELOAD EMBEDDED DICTIONARIES' = 114, 'SYSTEM RELOAD' = 115, 'SYSTEM RESTART DISK' = 116, 'SYSTEM MERGES' = 117, 'SYSTEM TTL MERGES' = 118, 'SYSTEM FETCHES' = 119, 'SYSTEM MOVES' = 120, 'SYSTEM DISTRIBUTED SENDS' = 121, 'SYSTEM REPLICATED SENDS' = 122, 'SYSTEM SENDS' = 123, 'SYSTEM REPLICATION QUEUES' = 124, 'SYSTEM DROP REPLICA' = 125, 'SYSTEM SYNC REPLICA' = 126, 'SYSTEM RESTART REPLICA' = 127, 'SYSTEM RESTORE REPLICA' = 128, 'SYSTEM WAIT LOADING PARTS' = 129, 'SYSTEM SYNC DATABASE REPLICA' = 130, 'SYSTEM SYNC TRANSACTION LOG' = 131, 'SYSTEM SYNC FILE CACHE' = 132, 'SYSTEM FLUSH DISTRIBUTED' = 133, 'SYSTEM FLUSH LOGS' = 134, 'SYSTEM FLUSH' = 135, 'SYSTEM THREAD FUZZER' = 136, 'SYSTEM UNFREEZE' = 137, 'SYSTEM FAILPOINT' = 138, 'SYSTEM' = 139, 'dictGet' = 140, 'displaySecretsInShowAndSelect' = 141, 'addressToLine' = 142, 'addressToLineWithInlines' = 143, 'addressToSymbol' = 144, 'demangle' = 145, 'INTROSPECTION' = 146, 'FILE' = 147, 'URL' = 148, 'REMOTE' = 149, 'MONGO' = 150, 'REDIS' = 151, 'MEILISEARCH' = 152, 'MYSQL' = 153, 'POSTGRES' = 154, 'SQLITE' = 155, 'ODBC' = 156, 'JDBC' = 157, 'HDFS' = 158, 'S3' = 159, 'HIVE' = 160, 'SOURCES' = 161, 'CLUSTER' = 162, 'ALL' = 163, 'NONE' = 164), `aliases` Array(String), `level` Nullable(Enum8('GLOBAL' = 0, 'DATABASE' = 1, 'TABLE' = 2, 'DICTIONARY' = 3, 'VIEW' = 4, 'COLUMN' = 5, 'NAMED_COLLECTION' = 6)), - `parent_group` Nullable(Enum16('SHOW DATABASES' = 0, 'SHOW TABLES' = 1, 'SHOW COLUMNS' = 2, 'SHOW DICTIONARIES' = 3, 'SHOW' = 4, 'SHOW FILESYSTEM CACHES' = 5, 'SELECT' = 6, 'INSERT' = 7, 'ALTER UPDATE' = 8, 'ALTER DELETE' = 9, 'ALTER ADD COLUMN' = 10, 'ALTER MODIFY COLUMN' = 11, 'ALTER DROP COLUMN' = 12, 'ALTER COMMENT COLUMN' = 13, 'ALTER CLEAR COLUMN' = 14, 'ALTER RENAME COLUMN' = 15, 'ALTER MATERIALIZE COLUMN' = 16, 'ALTER COLUMN' = 17, 'ALTER MODIFY COMMENT' = 18, 'ALTER ORDER BY' = 19, 'ALTER SAMPLE BY' = 20, 'ALTER ADD INDEX' = 21, 'ALTER DROP INDEX' = 22, 'ALTER MATERIALIZE INDEX' = 23, 'ALTER CLEAR INDEX' = 24, 'ALTER INDEX' = 25, 'ALTER ADD PROJECTION' = 26, 'ALTER DROP PROJECTION' = 27, 'ALTER MATERIALIZE PROJECTION' = 28, 'ALTER CLEAR PROJECTION' = 29, 'ALTER PROJECTION' = 30, 'ALTER ADD CONSTRAINT' = 31, 'ALTER DROP CONSTRAINT' = 32, 'ALTER CONSTRAINT' = 33, 'ALTER TTL' = 34, 'ALTER MATERIALIZE TTL' = 35, 'ALTER SETTINGS' = 36, 'ALTER MOVE PARTITION' = 37, 'ALTER FETCH PARTITION' = 38, 'ALTER FREEZE PARTITION' = 39, 'ALTER DATABASE SETTINGS' = 40, 'ALTER NAMED COLLECTION' = 41, 'ALTER TABLE' = 42, 'ALTER DATABASE' = 43, 'ALTER VIEW REFRESH' = 44, 'ALTER VIEW MODIFY QUERY' = 45, 'ALTER VIEW' = 46, 'ALTER' = 47, 'CREATE DATABASE' = 48, 'CREATE TABLE' = 49, 'CREATE VIEW' = 50, 'CREATE DICTIONARY' = 51, 'CREATE TEMPORARY TABLE' = 52, 'CREATE ARBITRARY TEMPORARY TABLE' = 53, 'CREATE FUNCTION' = 54, 'CREATE NAMED COLLECTION' = 55, 'CREATE' = 56, 'DROP DATABASE' = 57, 'DROP TABLE' = 58, 'DROP VIEW' = 59, 'DROP DICTIONARY' = 60, 'DROP FUNCTION' = 61, 'DROP NAMED COLLECTION' = 62, 'DROP' = 63, 'UNDROP TABLE' = 64, 'TRUNCATE' = 65, 'OPTIMIZE' = 66, 'BACKUP' = 67, 'KILL QUERY' = 68, 'KILL TRANSACTION' = 69, 'MOVE PARTITION BETWEEN SHARDS' = 70, 'CREATE USER' = 71, 'ALTER USER' = 72, 'DROP USER' = 73, 'CREATE ROLE' = 74, 'ALTER ROLE' = 75, 'DROP ROLE' = 76, 'ROLE ADMIN' = 77, 'CREATE ROW POLICY' = 78, 'ALTER ROW POLICY' = 79, 'DROP ROW POLICY' = 80, 'CREATE QUOTA' = 81, 'ALTER QUOTA' = 82, 'DROP QUOTA' = 83, 'CREATE SETTINGS PROFILE' = 84, 'ALTER SETTINGS PROFILE' = 85, 'DROP SETTINGS PROFILE' = 86, 'SHOW USERS' = 87, 'SHOW ROLES' = 88, 'SHOW ROW POLICIES' = 89, 'SHOW QUOTAS' = 90, 'SHOW SETTINGS PROFILES' = 91, 'SHOW ACCESS' = 92, 'ACCESS MANAGEMENT' = 93, 'SHOW NAMED COLLECTIONS' = 94, 'SHOW NAMED COLLECTIONS SECRETS' = 95, 'NAMED COLLECTION CONTROL' = 96, 'SYSTEM SHUTDOWN' = 97, 'SYSTEM DROP DNS CACHE' = 98, 'SYSTEM DROP MARK CACHE' = 99, 'SYSTEM DROP UNCOMPRESSED CACHE' = 100, 'SYSTEM DROP MMAP CACHE' = 101, 'SYSTEM DROP QUERY CACHE' = 102, 'SYSTEM DROP COMPILED EXPRESSION CACHE' = 103, 'SYSTEM DROP FILESYSTEM CACHE' = 104, 'SYSTEM DROP SCHEMA CACHE' = 105, 'SYSTEM DROP S3 CLIENT CACHE' = 106, 'SYSTEM DROP CACHE' = 107, 'SYSTEM RELOAD CONFIG' = 108, 'SYSTEM RELOAD USERS' = 109, 'SYSTEM RELOAD SYMBOLS' = 110, 'SYSTEM RELOAD DICTIONARY' = 111, 'SYSTEM RELOAD MODEL' = 112, 'SYSTEM RELOAD FUNCTION' = 113, 'SYSTEM RELOAD EMBEDDED DICTIONARIES' = 114, 'SYSTEM RELOAD' = 115, 'SYSTEM RESTART DISK' = 116, 'SYSTEM MERGES' = 117, 'SYSTEM TTL MERGES' = 118, 'SYSTEM FETCHES' = 119, 'SYSTEM MOVES' = 120, 'SYSTEM DISTRIBUTED SENDS' = 121, 'SYSTEM REPLICATED SENDS' = 122, 'SYSTEM SENDS' = 123, 'SYSTEM REPLICATION QUEUES' = 124, 'SYSTEM DROP REPLICA' = 125, 'SYSTEM SYNC REPLICA' = 126, 'SYSTEM RESTART REPLICA' = 127, 'SYSTEM RESTORE REPLICA' = 128, 'SYSTEM WAIT LOADING PARTS' = 129, 'SYSTEM SYNC DATABASE REPLICA' = 130, 'SYSTEM SYNC TRANSACTION LOG' = 131, 'SYSTEM SYNC FILE CACHE' = 132, 'SYSTEM FLUSH DISTRIBUTED' = 133, 'SYSTEM FLUSH LOGS' = 134, 'SYSTEM FLUSH' = 135, 'SYSTEM THREAD FUZZER' = 136, 'SYSTEM UNFREEZE' = 137, 'SYSTEM FAILPOINT' = 138, 'SYSTEM' = 139, 'dictGet' = 140, 'displaySecretsInShowAndSelect' = 141, 'addressToLine' = 142, 'addressToLineWithInlines' = 143, 'addressToSymbol' = 144, 'demangle' = 145, 'INTROSPECTION' = 146, 'FILE' = 147, 'URL' = 148, 'REMOTE' = 149, 'MONGO' = 150, 'MEILISEARCH' = 151, 'MYSQL' = 152, 'POSTGRES' = 153, 'SQLITE' = 154, 'ODBC' = 155, 'JDBC' = 156, 'HDFS' = 157, 'S3' = 158, 'HIVE' = 159, 'SOURCES' = 160, 'CLUSTER' = 161, 'ALL' = 162, 'NONE' = 163)) + `parent_group` Nullable(Enum16('SHOW DATABASES' = 0, 'SHOW TABLES' = 1, 'SHOW COLUMNS' = 2, 'SHOW DICTIONARIES' = 3, 'SHOW' = 4, 'SHOW FILESYSTEM CACHES' = 5, 'SELECT' = 6, 'INSERT' = 7, 'ALTER UPDATE' = 8, 'ALTER DELETE' = 9, 'ALTER ADD COLUMN' = 10, 'ALTER MODIFY COLUMN' = 11, 'ALTER DROP COLUMN' = 12, 'ALTER COMMENT COLUMN' = 13, 'ALTER CLEAR COLUMN' = 14, 'ALTER RENAME COLUMN' = 15, 'ALTER MATERIALIZE COLUMN' = 16, 'ALTER COLUMN' = 17, 'ALTER MODIFY COMMENT' = 18, 'ALTER ORDER BY' = 19, 'ALTER SAMPLE BY' = 20, 'ALTER ADD INDEX' = 21, 'ALTER DROP INDEX' = 22, 'ALTER MATERIALIZE INDEX' = 23, 'ALTER CLEAR INDEX' = 24, 'ALTER INDEX' = 25, 'ALTER ADD PROJECTION' = 26, 'ALTER DROP PROJECTION' = 27, 'ALTER MATERIALIZE PROJECTION' = 28, 'ALTER CLEAR PROJECTION' = 29, 'ALTER PROJECTION' = 30, 'ALTER ADD CONSTRAINT' = 31, 'ALTER DROP CONSTRAINT' = 32, 'ALTER CONSTRAINT' = 33, 'ALTER TTL' = 34, 'ALTER MATERIALIZE TTL' = 35, 'ALTER SETTINGS' = 36, 'ALTER MOVE PARTITION' = 37, 'ALTER FETCH PARTITION' = 38, 'ALTER FREEZE PARTITION' = 39, 'ALTER DATABASE SETTINGS' = 40, 'ALTER NAMED COLLECTION' = 41, 'ALTER TABLE' = 42, 'ALTER DATABASE' = 43, 'ALTER VIEW REFRESH' = 44, 'ALTER VIEW MODIFY QUERY' = 45, 'ALTER VIEW' = 46, 'ALTER' = 47, 'CREATE DATABASE' = 48, 'CREATE TABLE' = 49, 'CREATE VIEW' = 50, 'CREATE DICTIONARY' = 51, 'CREATE TEMPORARY TABLE' = 52, 'CREATE ARBITRARY TEMPORARY TABLE' = 53, 'CREATE FUNCTION' = 54, 'CREATE NAMED COLLECTION' = 55, 'CREATE' = 56, 'DROP DATABASE' = 57, 'DROP TABLE' = 58, 'DROP VIEW' = 59, 'DROP DICTIONARY' = 60, 'DROP FUNCTION' = 61, 'DROP NAMED COLLECTION' = 62, 'DROP' = 63, 'UNDROP TABLE' = 64, 'TRUNCATE' = 65, 'OPTIMIZE' = 66, 'BACKUP' = 67, 'KILL QUERY' = 68, 'KILL TRANSACTION' = 69, 'MOVE PARTITION BETWEEN SHARDS' = 70, 'CREATE USER' = 71, 'ALTER USER' = 72, 'DROP USER' = 73, 'CREATE ROLE' = 74, 'ALTER ROLE' = 75, 'DROP ROLE' = 76, 'ROLE ADMIN' = 77, 'CREATE ROW POLICY' = 78, 'ALTER ROW POLICY' = 79, 'DROP ROW POLICY' = 80, 'CREATE QUOTA' = 81, 'ALTER QUOTA' = 82, 'DROP QUOTA' = 83, 'CREATE SETTINGS PROFILE' = 84, 'ALTER SETTINGS PROFILE' = 85, 'DROP SETTINGS PROFILE' = 86, 'SHOW USERS' = 87, 'SHOW ROLES' = 88, 'SHOW ROW POLICIES' = 89, 'SHOW QUOTAS' = 90, 'SHOW SETTINGS PROFILES' = 91, 'SHOW ACCESS' = 92, 'ACCESS MANAGEMENT' = 93, 'SHOW NAMED COLLECTIONS' = 94, 'SHOW NAMED COLLECTIONS SECRETS' = 95, 'NAMED COLLECTION CONTROL' = 96, 'SYSTEM SHUTDOWN' = 97, 'SYSTEM DROP DNS CACHE' = 98, 'SYSTEM DROP MARK CACHE' = 99, 'SYSTEM DROP UNCOMPRESSED CACHE' = 100, 'SYSTEM DROP MMAP CACHE' = 101, 'SYSTEM DROP QUERY CACHE' = 102, 'SYSTEM DROP COMPILED EXPRESSION CACHE' = 103, 'SYSTEM DROP FILESYSTEM CACHE' = 104, 'SYSTEM DROP SCHEMA CACHE' = 105, 'SYSTEM DROP S3 CLIENT CACHE' = 106, 'SYSTEM DROP CACHE' = 107, 'SYSTEM RELOAD CONFIG' = 108, 'SYSTEM RELOAD USERS' = 109, 'SYSTEM RELOAD SYMBOLS' = 110, 'SYSTEM RELOAD DICTIONARY' = 111, 'SYSTEM RELOAD MODEL' = 112, 'SYSTEM RELOAD FUNCTION' = 113, 'SYSTEM RELOAD EMBEDDED DICTIONARIES' = 114, 'SYSTEM RELOAD' = 115, 'SYSTEM RESTART DISK' = 116, 'SYSTEM MERGES' = 117, 'SYSTEM TTL MERGES' = 118, 'SYSTEM FETCHES' = 119, 'SYSTEM MOVES' = 120, 'SYSTEM DISTRIBUTED SENDS' = 121, 'SYSTEM REPLICATED SENDS' = 122, 'SYSTEM SENDS' = 123, 'SYSTEM REPLICATION QUEUES' = 124, 'SYSTEM DROP REPLICA' = 125, 'SYSTEM SYNC REPLICA' = 126, 'SYSTEM RESTART REPLICA' = 127, 'SYSTEM RESTORE REPLICA' = 128, 'SYSTEM WAIT LOADING PARTS' = 129, 'SYSTEM SYNC DATABASE REPLICA' = 130, 'SYSTEM SYNC TRANSACTION LOG' = 131, 'SYSTEM SYNC FILE CACHE' = 132, 'SYSTEM FLUSH DISTRIBUTED' = 133, 'SYSTEM FLUSH LOGS' = 134, 'SYSTEM FLUSH' = 135, 'SYSTEM THREAD FUZZER' = 136, 'SYSTEM UNFREEZE' = 137, 'SYSTEM FAILPOINT' = 138, 'SYSTEM' = 139, 'dictGet' = 140, 'displaySecretsInShowAndSelect' = 141, 'addressToLine' = 142, 'addressToLineWithInlines' = 143, 'addressToSymbol' = 144, 'demangle' = 145, 'INTROSPECTION' = 146, 'FILE' = 147, 'URL' = 148, 'REMOTE' = 149, 'MONGO' = 150, 'REDIS' = 151, 'MEILISEARCH' = 152, 'MYSQL' = 153, 'POSTGRES' = 154, 'SQLITE' = 155, 'ODBC' = 156, 'JDBC' = 157, 'HDFS' = 158, 'S3' = 159, 'HIVE' = 160, 'SOURCES' = 161, 'CLUSTER' = 162, 'ALL' = 163, 'NONE' = 164)) ) ENGINE = SystemPrivileges COMMENT 'SYSTEM TABLE is built on the fly.' diff --git a/tests/queries/0_stateless/02414_all_new_table_functions_must_be_documented.reference b/tests/queries/0_stateless/02414_all_new_table_functions_must_be_documented.reference index 4f16e57d606..bc83e626207 100644 --- a/tests/queries/0_stateless/02414_all_new_table_functions_must_be_documented.reference +++ b/tests/queries/0_stateless/02414_all_new_table_functions_must_be_documented.reference @@ -9,11 +9,11 @@ jdbc meilisearch merge mongodb -redis null numbers numbers_mt odbc +redis remote remoteSecure url From ee363920f804107c6b8d96b746364cb252c6ae8e Mon Sep 17 00:00:00 2001 From: JackyWoo Date: Thu, 1 Jun 2023 15:34:21 +0800 Subject: [PATCH 0286/1072] fix fast test --- tests/queries/0_stateless/01271_show_privileges.reference | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/queries/0_stateless/01271_show_privileges.reference b/tests/queries/0_stateless/01271_show_privileges.reference index 5ada21e31f4..2c98b8cc190 100644 --- a/tests/queries/0_stateless/01271_show_privileges.reference +++ b/tests/queries/0_stateless/01271_show_privileges.reference @@ -148,6 +148,7 @@ INTROSPECTION ['INTROSPECTION FUNCTIONS'] \N ALL FILE [] GLOBAL SOURCES URL [] GLOBAL SOURCES REMOTE [] GLOBAL SOURCES +MONGO [] GLOBAL SOURCES REDIS [] GLOBAL SOURCES MEILISEARCH [] GLOBAL SOURCES MYSQL [] GLOBAL SOURCES From ef6fde8264557a737fdc47a0486805f07b489abd Mon Sep 17 00:00:00 2001 From: JackyWoo Date: Thu, 1 Jun 2023 19:28:13 +0800 Subject: [PATCH 0287/1072] fix build error for dwrwin --- src/Storages/RedisCommon.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Storages/RedisCommon.h b/src/Storages/RedisCommon.h index cf39be20ba9..4cc358e6536 100644 --- a/src/Storages/RedisCommon.h +++ b/src/Storages/RedisCommon.h @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -48,7 +49,7 @@ using RedisArray = Poco::Redis::Array; using RedisArrayPtr = std::shared_ptr; using RedisBulkString = Poco::Redis::BulkString; using RedisSimpleString = String; -using RedisInteger = Int64; +using RedisInteger = Poco::Int64; using RedisClientPtr = std::unique_ptr; using RedisPool = BorrowedObjectPool; From b01db870d8335ea0bdc9f4462b42c61c389774f7 Mon Sep 17 00:00:00 2001 From: JackyWoo Date: Fri, 2 Jun 2023 10:04:16 +0800 Subject: [PATCH 0288/1072] normalize redis table function db name --- src/TableFunctions/TableFunctionRedis.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/TableFunctions/TableFunctionRedis.cpp b/src/TableFunctions/TableFunctionRedis.cpp index bf147c08776..ec659ae61e0 100644 --- a/src/TableFunctions/TableFunctionRedis.cpp +++ b/src/TableFunctions/TableFunctionRedis.cpp @@ -32,8 +32,9 @@ StoragePtr TableFunctionRedis::executeImpl( StorageInMemoryMetadata metadata; metadata.setColumns(columns); + String db_name = "redis" + getDatabaseName() + "_db_" + toString(configuration.db_index); auto storage = std::make_shared( - StorageID(toString(configuration.db_index), table_name), configuration, context, metadata, primary_key); + StorageID(db_name, table_name), configuration, context, metadata, primary_key); storage->startup(); return storage; } From 8a10baec7f73d1e40fbd018daed0f1786d95442a Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Fri, 2 Jun 2023 06:25:00 +0000 Subject: [PATCH 0289/1072] Add dateTime range check --- src/Functions/FunctionsConversion.h | 38 +++++++++---------- .../01556_accurate_cast_or_null.reference | 1 + .../01556_accurate_cast_or_null.sql | 5 ++- .../0_stateless/01601_accurate_cast.sql | 4 +- 4 files changed, 24 insertions(+), 24 deletions(-) diff --git a/src/Functions/FunctionsConversion.h b/src/Functions/FunctionsConversion.h index d77090afe71..d5b5f6ae28a 100644 --- a/src/Functions/FunctionsConversion.h +++ b/src/Functions/FunctionsConversion.h @@ -365,6 +365,12 @@ template struct ConvertImpl +static bool CheckDateRange(const FromType & value) +{ + return value >= 0 && value <= DATE_LUT_MAX_DAY_NUM; +} + template struct ToDateTransform32Or64 { @@ -372,7 +378,7 @@ struct ToDateTransform32Or64 static NO_SANITIZE_UNDEFINED bool IsConvertible(const FromType & from, const DateLUTImpl &) { - return from >= 0; + return CheckDateRange(from); } static NO_SANITIZE_UNDEFINED ToType execute(const FromType & from, const DateLUTImpl & time_zone) @@ -391,7 +397,7 @@ struct ToDateTransform32Or64Signed static NO_SANITIZE_UNDEFINED bool IsConvertible(const FromType & from, const DateLUTImpl &) { - return from >= 0; + return CheckDateRange(from); } static NO_SANITIZE_UNDEFINED ToType execute(const FromType & from, const DateLUTImpl & time_zone) @@ -413,7 +419,7 @@ struct ToDateTransform8Or16Signed static NO_SANITIZE_UNDEFINED bool IsConvertible(const FromType & from, const DateLUTImpl &) { - return from >= 0; + return CheckDateRange(from); } static NO_SANITIZE_UNDEFINED ToType execute(const FromType & from, const DateLUTImpl &) @@ -434,11 +440,6 @@ struct ToDate32Transform32Or64 { static constexpr auto name = "toDate32"; - static NO_SANITIZE_UNDEFINED bool IsConvertible(const FromType & from, const DateLUTImpl &) - { - return from >= 0; - } - static NO_SANITIZE_UNDEFINED ToType execute(const FromType & from, const DateLUTImpl & time_zone) { return (from < DATE_LUT_MAX_EXTEND_DAY_NUM) @@ -452,11 +453,6 @@ struct ToDate32Transform32Or64Signed { static constexpr auto name = "toDate32"; - static NO_SANITIZE_UNDEFINED bool IsConvertible(const FromType & from, const DateLUTImpl &) - { - return from >= 0; - } - static NO_SANITIZE_UNDEFINED ToType execute(const FromType & from, const DateLUTImpl & time_zone) { static const Int32 daynum_min_offset = -static_cast(DateLUT::instance().getDayNumOffsetEpoch()); @@ -473,11 +469,6 @@ struct ToDate32Transform8Or16Signed { static constexpr auto name = "toDate32"; - static NO_SANITIZE_UNDEFINED bool IsConvertible(const FromType & from, const DateLUTImpl &) - { - return from >= 0; - } - static NO_SANITIZE_UNDEFINED ToType execute(const FromType & from, const DateLUTImpl &) { return from; @@ -527,6 +518,11 @@ template struct ConvertImpl struct ConvertImpl : DateTimeTransformImpl> {}; +template +static bool CheckDateTimeRange(const FromType & value) +{ + return value >= 0 && value <= 0xFFFFFFFF; +} template struct ToDateTimeTransform64 @@ -535,7 +531,7 @@ struct ToDateTimeTransform64 static NO_SANITIZE_UNDEFINED bool IsConvertible(const FromType & from, const DateLUTImpl &) { - return from >= 0; + return CheckDateTimeRange(from); } static NO_SANITIZE_UNDEFINED ToType execute(const FromType & from, const DateLUTImpl &) @@ -551,7 +547,7 @@ struct ToDateTimeTransformSigned static NO_SANITIZE_UNDEFINED bool IsConvertible(const FromType & from, const DateLUTImpl &) { - return from >= 0; + return CheckDateTimeRange(from); } static NO_SANITIZE_UNDEFINED ToType execute(const FromType & from, const DateLUTImpl &) @@ -569,7 +565,7 @@ struct ToDateTimeTransform64Signed static NO_SANITIZE_UNDEFINED bool IsConvertible(const FromType & from, const DateLUTImpl &) { - return from >= 0; + return CheckDateTimeRange(from); } static NO_SANITIZE_UNDEFINED ToType execute(const FromType & from, const DateLUTImpl &) diff --git a/tests/queries/0_stateless/01556_accurate_cast_or_null.reference b/tests/queries/0_stateless/01556_accurate_cast_or_null.reference index cbdf72e9910..21faa830636 100644 --- a/tests/queries/0_stateless/01556_accurate_cast_or_null.reference +++ b/tests/queries/0_stateless/01556_accurate_cast_or_null.reference @@ -32,6 +32,7 @@ \N \N \N +\N 2023-05-30 14:38:20 1970-01-01 00:00:19 \N diff --git a/tests/queries/0_stateless/01556_accurate_cast_or_null.sql b/tests/queries/0_stateless/01556_accurate_cast_or_null.sql index a9038a1d230..3f57358576e 100644 --- a/tests/queries/0_stateless/01556_accurate_cast_or_null.sql +++ b/tests/queries/0_stateless/01556_accurate_cast_or_null.sql @@ -37,9 +37,10 @@ SELECT accurateCastOrNull(nan, 'UInt256'); SELECT accurateCastOrNull(number + 127, 'Int8') AS x FROM numbers (2) ORDER BY x; SELECT accurateCastOrNull(-1, 'DateTime'); +SELECT accurateCastOrNull(5000000000, 'DateTime'); SELECT accurateCastOrNull('1xxx', 'DateTime'); -SELECT accurateCastOrNull('2023-05-30 14:38:20', 'DateTime'); -SELECT accurateCastOrNull(19, 'DateTime'); +select toString(accurateCastOrNull('2023-05-30 14:38:20', 'DateTime'), timezone()); +SELECT toString(accurateCastOrNull(19, 'DateTime'), 'UTC'); SELECT accurateCastOrNull(-1, 'Date'); SELECT accurateCastOrNull('1xxx', 'Date'); diff --git a/tests/queries/0_stateless/01601_accurate_cast.sql b/tests/queries/0_stateless/01601_accurate_cast.sql index 7611b1d96b9..f7f4d588ccc 100644 --- a/tests/queries/0_stateless/01601_accurate_cast.sql +++ b/tests/queries/0_stateless/01601_accurate_cast.sql @@ -24,11 +24,13 @@ SELECT accurateCast('123', 'FixedString(2)'); -- { serverError 131 } SELECT accurateCast('12', 'FixedString(2)'); SELECT accurateCast(-1, 'DateTime'); -- { serverError 70 } +SELECT accurateCast(5000000000, 'DateTime'); -- { serverError 70 } SELECT accurateCast('1xxx', 'DateTime'); -- { serverError 41 } SELECT accurateCast('2023-05-30 14:38:20', 'DateTime'); -SELECT accurateCast(19, 'DateTime'); +SELECT toString(accurateCast(19, 'DateTime'), 'UTC'); SELECT accurateCast(-1, 'Date'); -- { serverError 70 } +SELECT accurateCast(999999, 'Date'); -- { serverError 70 } SELECT accurateCast('1xxx', 'Date'); -- { serverError 38 } SELECT accurateCast('2023-05-30', 'Date'); SELECT accurateCast(19, 'Date'); From c7088a8180a245311885899e702e668719159123 Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Fri, 2 Jun 2023 07:26:55 +0000 Subject: [PATCH 0290/1072] Trying to fix build --- src/Functions/FunctionsConversion.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Functions/FunctionsConversion.h b/src/Functions/FunctionsConversion.h index d5b5f6ae28a..6aa5843ff65 100644 --- a/src/Functions/FunctionsConversion.h +++ b/src/Functions/FunctionsConversion.h @@ -521,7 +521,7 @@ template struct ConvertImpl static bool CheckDateTimeRange(const FromType & value) { - return value >= 0 && value <= 0xFFFFFFFF; + return value >= 0 && value <= 0xFFFFFFFFL; } template From 16be379fa55c6dc8172004460799be93b7a52b88 Mon Sep 17 00:00:00 2001 From: Sergei Trifonov Date: Fri, 2 Jun 2023 11:18:46 +0200 Subject: [PATCH 0291/1072] Update src/Common/AsynchronousMetrics.cpp --- src/Common/AsynchronousMetrics.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Common/AsynchronousMetrics.cpp b/src/Common/AsynchronousMetrics.cpp index a4cb18249b6..7064559800a 100644 --- a/src/Common/AsynchronousMetrics.cpp +++ b/src/Common/AsynchronousMetrics.cpp @@ -950,7 +950,7 @@ void AsynchronousMetrics::update(TimePoint update_time) auto space = line.find_first_of(" "); - if (line.rfind("max", 0) == std::string::npos) + if (line.rfind("max", space) == std::string::npos) { auto field1 = line.substr(0, space); quota = std::stoull(field1); From 57a2bfd0ff59f3ad4c5f6966c08ae7ddb6f0dda2 Mon Sep 17 00:00:00 2001 From: Sergei Trifonov Date: Fri, 2 Jun 2023 11:19:11 +0200 Subject: [PATCH 0292/1072] Update src/Common/AsynchronousMetrics.cpp --- src/Common/AsynchronousMetrics.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/Common/AsynchronousMetrics.cpp b/src/Common/AsynchronousMetrics.cpp index 7064559800a..cf8d451385b 100644 --- a/src/Common/AsynchronousMetrics.cpp +++ b/src/Common/AsynchronousMetrics.cpp @@ -956,8 +956,11 @@ void AsynchronousMetrics::update(TimePoint update_time) quota = std::stoull(field1); } - auto field2 = line.substr(space + 1); - period = std::stoull(field2); + if (space != std::string::npos) + { + auto field2 = line.substr(space + 1); + period = std::stoull(field2); + } new_values["CGroupCpuCfsPeriod"] = { period, "The CFS period of CPU cgroup."}; new_values["CGroupCpuCfsQuota"] = { quota, "The CFS quota of CPU cgroup. If stated zero, the quota is max."}; From 2f08b6738f307c7e04886a879632e1183b40b725 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Thu, 1 Jun 2023 18:34:00 +0200 Subject: [PATCH 0293/1072] Support parallel replicas with the analyzer --- src/Storages/StorageReplicatedMergeTree.cpp | 15 ++++-- ...02771_parallel_replicas_analyzer.reference | 12 +++++ .../02771_parallel_replicas_analyzer.sql | 52 +++++++++++++++++++ 3 files changed, 75 insertions(+), 4 deletions(-) create mode 100644 tests/queries/0_stateless/02771_parallel_replicas_analyzer.reference create mode 100644 tests/queries/0_stateless/02771_parallel_replicas_analyzer.sql diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 56896f88423..893e976d432 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -77,16 +77,17 @@ #include #include -#include #include +#include #include #include #include #include +#include #include +#include #include #include -#include #include @@ -4707,8 +4708,14 @@ void StorageReplicatedMergeTree::read( auto cluster = local_context->getCluster(local_context->getSettingsRef().cluster_for_parallel_replicas); - Block header = - InterpreterSelectQuery(modified_query_ast, local_context, SelectQueryOptions(processed_stage).analyze()).getSampleBlock(); + Block header; + + if (local_context->getSettingsRef().allow_experimental_analyzer) + header = InterpreterSelectQueryAnalyzer::getSampleBlock( + modified_query_ast, local_context, SelectQueryOptions(processed_stage).analyze()); + else + header + = InterpreterSelectQuery(modified_query_ast, local_context, SelectQueryOptions(processed_stage).analyze()).getSampleBlock(); ClusterProxy::SelectStreamFactory select_stream_factory = ClusterProxy::SelectStreamFactory( diff --git a/tests/queries/0_stateless/02771_parallel_replicas_analyzer.reference b/tests/queries/0_stateless/02771_parallel_replicas_analyzer.reference new file mode 100644 index 00000000000..4e93c530f7b --- /dev/null +++ b/tests/queries/0_stateless/02771_parallel_replicas_analyzer.reference @@ -0,0 +1,12 @@ +-8888150036649430454 +-2788931093724180887 +-75175454385331084 +368066018677693974 +821735343441964030 +2804162938822577320 +4357435422797280898 +5935810273536892891 +7885388429666205427 +8124171311239967992 +1 1 -- Simple query with analyzer and pure parallel replicas\nSELECT number\nFROM join_inner_table__fuzz_146_replicated\n SETTINGS\n allow_experimental_analyzer = 1,\n max_parallel_replicas = 2,\n cluster_for_parallel_replicas = \'test_cluster_one_shard_three_replicas_localhost\',\n allow_experimental_parallel_reading_from_replicas = 1,\n use_hedged_requests = 0; +0 2 SELECT `default`.`join_inner_table__fuzz_146_replicated`.`number` AS `number` FROM `default`.`join_inner_table__fuzz_146_replicated` diff --git a/tests/queries/0_stateless/02771_parallel_replicas_analyzer.sql b/tests/queries/0_stateless/02771_parallel_replicas_analyzer.sql new file mode 100644 index 00000000000..35089c0cedb --- /dev/null +++ b/tests/queries/0_stateless/02771_parallel_replicas_analyzer.sql @@ -0,0 +1,52 @@ +-- Tags: zookeeper + +CREATE TABLE join_inner_table__fuzz_146_replicated +( + `id` UUID, + `key` String, + `number` Int64, + `value1` String, + `value2` String, + `time` Nullable(Int64) +) +ENGINE = ReplicatedMergeTree('/clickhouse/tables/{database}/join_inner_table__fuzz_146_replicated', '{replica}') +ORDER BY (id, number, key) +SETTINGS index_granularity = 8192; + +INSERT INTO join_inner_table__fuzz_146_replicated + SELECT CAST('833c9e22-c245-4eb5-8745-117a9a1f26b1', 'UUID') AS id, CAST(rowNumberInAllBlocks(), 'String') AS key, * + FROM generateRandom('number Int64, value1 String, value2 String, time Int64', 1, 10, 2) LIMIT 10; + +-- Simple query with analyzer and pure parallel replicas +SELECT number +FROM join_inner_table__fuzz_146_replicated + SETTINGS + allow_experimental_analyzer = 1, + max_parallel_replicas = 2, + cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', + allow_experimental_parallel_reading_from_replicas = 1, + use_hedged_requests = 0; + +SYSTEM FLUSH LOGS; +-- There should be 2 different queries +-- The initial query +-- The query sent to each replica (which should appear 2 times as we are setting max_parallel_replicas to 2) +SELECT + is_initial_query, + count() as c, query, +FROM system.query_log +WHERE + event_date >= yesterday() + AND type = 'QueryFinish' + AND initial_query_id = + ( + SELECT query_id + FROM system.query_log + WHERE + current_database = currentDatabase() + AND event_date >= yesterday() + AND type = 'QueryFinish' + AND query LIKE '-- Simple query with analyzer and pure parallel replicas%' + ) +GROUP BY is_initial_query, query +ORDER BY is_initial_query DESC, c, query; From f6da66cbb87d0c7c54c9b885cb5efeb0c330f801 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 1 Jun 2023 18:27:34 +0000 Subject: [PATCH 0294/1072] Fix some typos --- docs/en/sql-reference/functions/files.md | 8 +- .../functions/functions-for-nulls.md | 117 ++-- .../functions/logical-functions.md | 24 +- .../functions/other-functions.md | 549 +++++++----------- .../functions/random-functions.md | 181 ++++-- 5 files changed, 442 insertions(+), 437 deletions(-) diff --git a/docs/en/sql-reference/functions/files.md b/docs/en/sql-reference/functions/files.md index 5cd2d8e0a74..73d72aa50e5 100644 --- a/docs/en/sql-reference/functions/files.md +++ b/docs/en/sql-reference/functions/files.md @@ -6,7 +6,7 @@ sidebar_label: Files ## file -Reads file as string and loads the data into the specified column. The actual file content is not interpreted. +Reads a file as string and loads the data into the specified column. The file content is not interpreted. Also see table function [file](../table-functions/file.md). @@ -18,15 +18,13 @@ file(path[, default]) **Arguments** -- `path` — The path of the file relative to [user_files_path](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-user_files_path). Supports the following wildcards: `*`, `?`, `{abc,def}` and `{N..M}` where `N`, `M` are numbers and `'abc', 'def'` are strings. -- `default` — The value that will be returned in the case the file does not exist or cannot be accessed. Supported data types: [String](../../sql-reference/data-types/string.md) and [NULL](../../sql-reference/syntax.md#null-literal). +- `path` — The path of the file relative to [user_files_path](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-user_files_path). Supports wildcards `*`, `?`, `{abc,def}` and `{N..M}` where `N`, `M` are numbers and `'abc', 'def'` are strings. +- `default` — The value returned if the file does not exist or cannot be accessed. Supported data types: [String](../../sql-reference/data-types/string.md) and [NULL](../../sql-reference/syntax.md#null-literal). **Example** Inserting data from files a.txt and b.txt into a table as strings: -Query: - ``` sql INSERT INTO table SELECT file('a.txt'), file('b.txt'); ``` diff --git a/docs/en/sql-reference/functions/functions-for-nulls.md b/docs/en/sql-reference/functions/functions-for-nulls.md index 0b7df54b776..f4ca27e9b16 100644 --- a/docs/en/sql-reference/functions/functions-for-nulls.md +++ b/docs/en/sql-reference/functions/functions-for-nulls.md @@ -8,7 +8,7 @@ sidebar_label: Nullable ## isNull -Checks whether the argument is [NULL](../../sql-reference/syntax.md#null-literal). +Returns whether the argument is [NULL](../../sql-reference/syntax.md#null-literal). ``` sql isNull(x) @@ -18,7 +18,7 @@ Alias: `ISNULL`. **Arguments** -- `x` — A value with a non-compound data type. +- `x` — A value of non-compound data type. **Returned value** @@ -27,7 +27,7 @@ Alias: `ISNULL`. **Example** -Input table +Table: ``` text ┌─x─┬────y─┐ @@ -36,12 +36,14 @@ Input table └───┴──────┘ ``` -Query +Query: ``` sql SELECT x FROM t_null WHERE isNull(y); ``` +Result: + ``` text ┌─x─┐ │ 1 │ @@ -50,7 +52,7 @@ SELECT x FROM t_null WHERE isNull(y); ## isNotNull -Checks whether the argument is [NULL](../../sql-reference/syntax.md#null-literal). +Returns whether the argument is not [NULL](../../sql-reference/syntax.md#null-literal). ``` sql isNotNull(x) @@ -58,16 +60,16 @@ isNotNull(x) **Arguments:** -- `x` — A value with a non-compound data type. +- `x` — A value of non-compound data type. **Returned value** -- `0` if `x` is `NULL`. - `1` if `x` is not `NULL`. +- `0` if `x` is `NULL`. **Example** -Input table +Table: ``` text ┌─x─┬────y─┐ @@ -76,12 +78,14 @@ Input table └───┴──────┘ ``` -Query +Query: ``` sql SELECT x FROM t_null WHERE isNotNull(y); ``` +Result: + ``` text ┌─x─┐ │ 2 │ @@ -90,7 +94,7 @@ SELECT x FROM t_null WHERE isNotNull(y); ## coalesce -Checks from left to right whether `NULL` arguments were passed and returns the first non-`NULL` argument. +Returns the leftmost non-`NULL` argument. ``` sql coalesce(x,...) @@ -98,11 +102,11 @@ coalesce(x,...) **Arguments:** -- Any number of parameters of a non-compound type. All parameters must be compatible by data type. +- Any number of parameters of non-compound type. All parameters must be of mutually compatible data types. **Returned values** -- The first non-`NULL` argument. +- The first non-`NULL` argument - `NULL`, if all arguments are `NULL`. **Example** @@ -110,10 +114,10 @@ coalesce(x,...) Consider a list of contacts that may specify multiple ways to contact a customer. ``` text -┌─name─────┬─mail─┬─phone─────┬──icq─┐ -│ client 1 │ ᴺᵁᴸᴸ │ 123-45-67 │ 123 │ -│ client 2 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ -└──────────┴──────┴───────────┴──────┘ +┌─name─────┬─mail─┬─phone─────┬──telegram─┐ +│ client 1 │ ᴺᵁᴸᴸ │ 123-45-67 │ 123 │ +│ client 2 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ +└──────────┴──────┴───────────┴───────────┘ ``` The `mail` and `phone` fields are of type String, but the `icq` field is `UInt32`, so it needs to be converted to `String`. @@ -121,22 +125,22 @@ The `mail` and `phone` fields are of type String, but the `icq` field is `UInt32 Get the first available contact method for the customer from the contact list: ``` sql -SELECT name, coalesce(mail, phone, CAST(icq,'Nullable(String)')) FROM aBook; +SELECT name, coalesce(mail, phone, CAST(telegram,'Nullable(String)')) FROM aBook; ``` ``` text -┌─name─────┬─coalesce(mail, phone, CAST(icq, 'Nullable(String)'))─┐ -│ client 1 │ 123-45-67 │ -│ client 2 │ ᴺᵁᴸᴸ │ -└──────────┴──────────────────────────────────────────────────────┘ +┌─name─────┬─coalesce(mail, phone, CAST(telegram, 'Nullable(String)'))─┐ +│ client 1 │ 123-45-67 │ +│ client 2 │ ᴺᵁᴸᴸ │ +└──────────┴───────────────────────────────────────────────────────────┘ ``` ## ifNull -Returns an alternative value if the main argument is `NULL`. +Returns an alternative value if the argument is `NULL`. ``` sql -ifNull(x,alt) +ifNull(x, alt) ``` **Arguments:** @@ -146,25 +150,33 @@ ifNull(x,alt) **Returned values** -- The value `x`, if `x` is not `NULL`. -- The value `alt`, if `x` is `NULL`. +- `x` if `x` is not `NULL`. +- `alt` if `x` is `NULL`. **Example** +Query: + ``` sql SELECT ifNull('a', 'b'); ``` +Result: + ``` text ┌─ifNull('a', 'b')─┐ │ a │ └──────────────────┘ ``` +Query: + ``` sql SELECT ifNull(NULL, 'b'); ``` +Result: + ``` text ┌─ifNull(NULL, 'b')─┐ │ b │ @@ -173,7 +185,7 @@ SELECT ifNull(NULL, 'b'); ## nullIf -Returns `NULL` if the arguments are equal. +Returns `NULL` if both arguments are equal. ``` sql nullIf(x, y) @@ -181,29 +193,37 @@ nullIf(x, y) **Arguments:** -`x`, `y` — Values for comparison. They must be compatible types, or ClickHouse will generate an exception. +`x`, `y` — Values to compare. Must be of compatible types. **Returned values** -- `NULL`, if the arguments are equal. -- The `x` value, if the arguments are not equal. +- `NULL` if the arguments are equal. +- `x` if the arguments are not equal. **Example** +Query: + ``` sql SELECT nullIf(1, 1); ``` +Result: + ``` text ┌─nullIf(1, 1)─┐ │ ᴺᵁᴸᴸ │ └──────────────┘ ``` +Query: + ``` sql SELECT nullIf(1, 2); ``` +Result: + ``` text ┌─nullIf(1, 2)─┐ │ 1 │ @@ -212,7 +232,7 @@ SELECT nullIf(1, 2); ## assumeNotNull -Results in an equivalent non-`Nullable` value for a [Nullable](../../sql-reference/data-types/nullable.md) type. In case the original value is `NULL` the result is undetermined. See also `ifNull` and `coalesce` functions. +Returns the corresponding non-`Nullable` value for a value of [Nullable](../../sql-reference/data-types/nullable.md) type. If the original value is `NULL`, an arbitrary result can be returned. See also functions `ifNull` and `coalesce`. ``` sql assumeNotNull(x) @@ -224,36 +244,29 @@ assumeNotNull(x) **Returned values** -- The original value from the non-`Nullable` type, if it is not `NULL`. -- Implementation specific result if the original value was `NULL`. +- The input value as non-`Nullable` type, if it is not `NULL`. +- An arbirary value, if the input value is `NULL`. **Example** -Consider the `t_null` table. - -``` sql -SHOW CREATE TABLE t_null; -``` +Table: ``` text -┌─statement─────────────────────────────────────────────────────────────────┐ -│ CREATE TABLE default.t_null ( x Int8, y Nullable(Int8)) ENGINE = TinyLog │ -└───────────────────────────────────────────────────────────────────────────┘ -``` -``` text ┌─x─┬────y─┐ │ 1 │ ᴺᵁᴸᴸ │ │ 2 │ 3 │ └───┴──────┘ ``` -Apply the `assumeNotNull` function to the `y` column. +Query: ``` sql -SELECT assumeNotNull(y) FROM t_null; +SELECT assumeNotNull(y) FROM table; ``` +Result: + ``` text ┌─assumeNotNull(y)─┐ │ 0 │ @@ -261,10 +274,14 @@ SELECT assumeNotNull(y) FROM t_null; └──────────────────┘ ``` +Query: + ``` sql SELECT toTypeName(assumeNotNull(y)) FROM t_null; ``` +Result: + ``` text ┌─toTypeName(assumeNotNull(y))─┐ │ Int8 │ @@ -282,28 +299,36 @@ toNullable(x) **Arguments:** -- `x` — The value of any non-compound type. +- `x` — A value of non-compound type. **Returned value** -- The input value with a `Nullable` type. +- The input value but of `Nullable` type. **Example** +Query: + ``` sql SELECT toTypeName(10); ``` +Result: + ``` text ┌─toTypeName(10)─┐ │ UInt8 │ └────────────────┘ ``` +Query: + ``` sql SELECT toTypeName(toNullable(10)); ``` +Result: + ``` text ┌─toTypeName(toNullable(10))─┐ │ Nullable(UInt8) │ diff --git a/docs/en/sql-reference/functions/logical-functions.md b/docs/en/sql-reference/functions/logical-functions.md index f5a1a6aac12..2e1a8f28227 100644 --- a/docs/en/sql-reference/functions/logical-functions.md +++ b/docs/en/sql-reference/functions/logical-functions.md @@ -12,7 +12,9 @@ Zero as an argument is considered `false`, non-zero values are considered `true` ## and -Calculates the logical conjunction between two or more values. +Calculates the logical conjunction of two or more values. + +Setting [short_circuit_function_evaluation](../../operations/settings/settings.md#short-circuit-function-evaluation) controls whether short-circuit evaluation is used. If enabled, `val_i` is evaluated only if `(val_1 AND val_2 AND ... AND val_{i-1})` is `true`. For example, with short-circuit evaluation, no division-by-zero exception is thrown when executing the query `SELECT and(number = 2, intDiv(1, number)) FROM numbers(5)`. **Syntax** @@ -20,9 +22,7 @@ Calculates the logical conjunction between two or more values. and(val1, val2...) ``` -Setting [short_circuit_function_evaluation](../../operations/settings/settings.md#short-circuit-function-evaluation) controls whether short-circuit evaluation is used. If enabled, `val_i` is evaluated only if `(val_1 AND val_2 AND ... AND val_{i-1})` is `true`. For example, with short-circuit evaluation, no division-by-zero exception is thrown when executing the query `SELECT and(number = 2, intDiv(1, number)) FROM numbers(5)`. - -Alias: The [AND Operator](../../sql-reference/operators/index.md#logical-and-operator). +Alias: The [AND operator](../../sql-reference/operators/index.md#logical-and-operator). **Arguments** @@ -30,7 +30,7 @@ Alias: The [AND Operator](../../sql-reference/operators/index.md#logical-and-ope **Returned value** -- `0`, if there at least one argument evaluates to `false`, +- `0`, if at least one argument evaluates to `false`, - `NULL`, if no argumetn evaluates to `false` and at least one argument is `NULL`, - `1`, otherwise. @@ -66,7 +66,9 @@ Result: ## or -Calculates the logical disjunction between two or more values. +Calculates the logical disjunction of two or more values. + +Setting [short_circuit_function_evaluation](../../operations/settings/settings.md#short-circuit-function-evaluation) controls whether short-circuit evaluation is used. If enabled, `val_i` is evaluated only if `((NOT val_1) AND (NOT val_2) AND ... AND (NOT val_{i-1}))` is `true`. For example, with short-circuit evaluation, no division-by-zero exception is thrown when executing the query `SELECT or(number = 0, intDiv(1, number) != 0) FROM numbers(5)`. **Syntax** @@ -74,9 +76,7 @@ Calculates the logical disjunction between two or more values. or(val1, val2...) ``` -Setting [short_circuit_function_evaluation](../../operations/settings/settings.md#short-circuit-function-evaluation) controls whether short-circuit evaluation is used. If enabled, `val_i` is evaluated only if `((NOT val_1) AND (NOT val_2) AND ... AND (NOT val_{i-1}))` is `true`. For example, with short-circuit evaluation, no division-by-zero exception is thrown when executing the query `SELECT or(number = 0, intDiv(1, number) != 0) FROM numbers(5)`. - -Alias: The [OR Operator](../../sql-reference/operators/index.md#logical-or-operator). +Alias: The [OR operator](../../sql-reference/operators/index.md#logical-or-operator). **Arguments** @@ -120,7 +120,7 @@ Result: ## not -Calculates logical negation of a value. +Calculates the logical negation of a value. **Syntax** @@ -128,7 +128,7 @@ Calculates logical negation of a value. not(val); ``` -Alias: The [Negation Operator](../../sql-reference/operators/index.md#logical-negation-operator). +Alias: The [Negation operator](../../sql-reference/operators/index.md#logical-negation-operator). **Arguments** @@ -158,7 +158,7 @@ Result: ## xor -Calculates the logical exclusive disjunction between two or more values. For more than two values the function first xor-s the first two values, then xor-s the result with the third value etc. +Calculates the logical exclusive disjunction of two or more values. For more than two input values, the function first xor-s the first two values, then xor-s the result with the third value etc. **Syntax** diff --git a/docs/en/sql-reference/functions/other-functions.md b/docs/en/sql-reference/functions/other-functions.md index efe1a77c285..037eb9e63c5 100644 --- a/docs/en/sql-reference/functions/other-functions.md +++ b/docs/en/sql-reference/functions/other-functions.md @@ -8,12 +8,12 @@ sidebar_label: Other ## hostName() -Returns a string with the name of the host that this function was performed on. For distributed processing, this is the name of the remote server host, if the function is performed on a remote server. -If it is executed in the context of a distributed table, then it generates a normal column with values relevant to each shard. Otherwise it produces a constant value. +Returns the name of the host on which this function was executed. If the function executes on a remote server (distributed processing), the remote server name is returned. +If the function executes in the context of a distributed table, it generates a normal column with values relevant to each shard. Otherwise it produces a constant value. ## getMacro -Gets a named value from the [macros](../../operations/server-configuration-parameters/settings.md#macros) section of the server configuration. +Returns a named value from the [macros](../../operations/server-configuration-parameters/settings.md#macros) section of the server configuration. **Syntax** @@ -23,7 +23,7 @@ getMacro(name); **Arguments** -- `name` — Name to retrieve from the `macros` section. [String](../../sql-reference/data-types/string.md#string). +- `name` — Macro name to retrieve from the `` section. [String](../../sql-reference/data-types/string.md#string). **Returned value** @@ -33,7 +33,7 @@ Type: [String](../../sql-reference/data-types/string.md). **Example** -The example `macros` section in the server configuration file: +Example `` section in the server configuration file: ``` xml @@ -55,7 +55,7 @@ Result: └──────────────────┘ ``` -An alternative way to get the same value: +The same value can be retrieved as follows: ``` sql SELECT * FROM system.macros @@ -70,7 +70,7 @@ WHERE macro = 'test'; ## FQDN -Returns the fully qualified domain name. +Returns the fully qualified domain name of the ClickHouse server. **Syntax** @@ -88,8 +88,6 @@ Type: `String`. **Example** -Query: - ``` sql SELECT FQDN(); ``` @@ -104,52 +102,61 @@ Result: ## basename -Extracts the trailing part of a string after the last slash or backslash. This function if often used to extract the filename from a path. +Extracts the tail of a string following its last slash or backslash. This function if often used to extract the filename from a path. ``` sql -basename( expr ) +basename(expr) ``` **Arguments** -- `expr` — Expression resulting in a [String](../../sql-reference/data-types/string.md) type value. All the backslashes must be escaped in the resulting value. +- `expr` — A value of type [String](../../sql-reference/data-types/string.md). Backslashes must be escaped. **Returned Value** A string that contains: -- The trailing part of a string after the last slash or backslash. - - If the input string contains a path ending with slash or backslash, for example, `/` or `c:\`, the function returns an empty string. - +- The tail of the input string after its last slash or backslash. If the input string ends with a slash or backslash (e.g. `/` or `c:\`), the function returns an empty string. - The original string if there are no slashes or backslashes. **Example** +Query: + ``` sql SELECT 'some/long/path/to/file' AS a, basename(a) ``` +Result: + ``` text ┌─a──────────────────────┬─basename('some\\long\\path\\to\\file')─┐ │ some\long\path\to\file │ file │ └────────────────────────┴────────────────────────────────────────┘ ``` +Query: + ``` sql SELECT 'some\\long\\path\\to\\file' AS a, basename(a) ``` +Result: + ``` text ┌─a──────────────────────┬─basename('some\\long\\path\\to\\file')─┐ │ some\long\path\to\file │ file │ └────────────────────────┴────────────────────────────────────────┘ ``` +Query: + ``` sql SELECT 'some-file-name' AS a, basename(a) ``` +Result: + ``` text ┌─a──────────────┬─basename('some-file-name')─┐ │ some-file-name │ some-file-name │ @@ -159,7 +166,7 @@ SELECT 'some-file-name' AS a, basename(a) ## visibleWidth(x) Calculates the approximate width when outputting values to the console in text format (tab-separated). -This function is used by the system for implementing Pretty formats. +This function is used by the system to implement Pretty formats. `NULL` is represented as a string corresponding to `NULL` in `Pretty` formats. @@ -175,18 +182,18 @@ SELECT visibleWidth(NULL) ## toTypeName(x) -Returns a string containing the type name of the passed argument. +Returns the type name of the passed argument. -If `NULL` is passed to the function as input, then it returns the `Nullable(Nothing)` type, which corresponds to an internal `NULL` representation in ClickHouse. +If `NULL` is passed, then the function returns type `Nullable(Nothing)`, which corresponds to ClickHouse's internal `NULL` representation. ## blockSize() -Gets the size of the block. -In ClickHouse, queries are always run on blocks (sets of column parts). This function allows getting the size of the block that you called it for. +In ClickHouse, queries are processed in blocks (chunks). +This function returns the size (row count) of the block the function is called on. ## byteSize -Returns estimation of uncompressed byte size of its arguments in memory. +Returns an estimation of uncompressed byte size of its arguments in memory. **Syntax** @@ -206,7 +213,7 @@ Type: [UInt64](../../sql-reference/data-types/int-uint.md). **Examples** -For [String](../../sql-reference/data-types/string.md) arguments the funtion returns the string length + 9 (terminating zero + length). +For [String](../../sql-reference/data-types/string.md) arguments, the funtion returns the string length + 9 (terminating zero + length). Query: @@ -265,7 +272,7 @@ byteSize(Float32): 4 byteSize(Float64): 8 ``` -If the function takes multiple arguments, it returns their combined byte size. +If the function has multiple arguments, the function accumulates their byte sizes. Query: @@ -283,30 +290,30 @@ Result: ## materialize(x) -Turns a constant into a full column containing just one value. -In ClickHouse, full columns and constants are represented differently in memory. Functions work differently for constant arguments and normal arguments (different code is executed), although the result is almost always the same. This function is for debugging this behavior. +Turns a constant into a full column containing a single value. +Full columns and constants are represented differently in memory. Functions usually execute different code for normal and constant arguments, although the result should typically be the same. This function can be used to debug this behavior. ## ignore(…) -Accepts any arguments, including `NULL`. Always returns 0. -However, the argument is still evaluated. This can be used for benchmarks. +Accepts any arguments, including `NULL` and does nothing. Always returns 0. +The argument is internally still evaluated. Useful e.g. for benchmarks. ## sleep(seconds) -Sleeps ‘seconds’ seconds on each data block. You can specify an integer or a floating-point number. +Sleeps ‘seconds’ seconds for each data block. The sleep time can be specified as integer or as floating-point number. ## sleepEachRow(seconds) -Sleeps ‘seconds’ seconds on each row. You can specify an integer or a floating-point number. +Sleeps ‘seconds’ seconds for each row. The sleep time can be specified as integer or as floating-point number. ## currentDatabase() Returns the name of the current database. -You can use this function in table engine parameters in a CREATE TABLE query where you need to specify the database. +Useful in table engine parameters of `CREATE TABLE` queries where you need to specify the database. ## currentUser() -Returns the login of current user. Login of user, that initiated query, will be returned in case distributed query. +Returns the name of the current user. In case of a distributed query, the name of the user who initiated the query is returned. ``` sql SELECT currentUser(); @@ -316,15 +323,13 @@ Alias: `user()`, `USER()`. **Returned values** -- Login of current user. -- Login of user that initiated query in case of distributed query. +- The name of the current user. +- In distributed queries, the login of the user who initiated the query. Type: `String`. **Example** -Query: - ``` sql SELECT currentUser(); ``` @@ -339,11 +344,11 @@ Result: ## isConstant -Checks whether the argument is a constant expression. +Returns whether the argument is a constant expression. -A constant expression means an expression whose resulting value is known at the query analysis (i.e. before execution). For example, expressions over [literals](../../sql-reference/syntax.md#literals) are constant expressions. +A constant expression is an expression whose result is known during query analysis, i.e. before execution. For example, expressions over [literals](../../sql-reference/syntax.md#literals) are constant expressions. -The function is intended for development, debugging and demonstration. +This function is mostly intended for development, debugging and demonstration. **Syntax** @@ -357,8 +362,8 @@ isConstant(x) **Returned values** -- `1` — `x` is constant. -- `0` — `x` is non-constant. +- `1` if `x` is constant. +- `0` if `x` is non-constant. Type: [UInt8](../../sql-reference/data-types/int-uint.md). @@ -408,23 +413,25 @@ Result: ## isFinite(x) -Accepts Float32 and Float64 and returns UInt8 equal to 1 if the argument is not infinite and not a NaN, otherwise 0. +Returns 1 if the Float32 or Float64 argument not infinite and not a NaN, otherwise this function returns 0. ## isInfinite(x) -Accepts Float32 and Float64 and returns UInt8 equal to 1 if the argument is infinite, otherwise 0. Note that 0 is returned for a NaN. +Returns 1 if the Float32 or Float64 argument is infinite, otherwise this function returns 0. Note that 0 is returned for a NaN. ## ifNotFinite -Checks whether floating point value is finite. +Checks whether a floating point value is finite. **Syntax** - ifNotFinite(x,y) +``` sql +ifNotFinite(x,y) +``` **Arguments** -- `x` — Value to be checked for infinity. Type: [Float\*](../../sql-reference/data-types/float.md). +- `x` — Value to check for infinity. Type: [Float\*](../../sql-reference/data-types/float.md). - `y` — Fallback value. Type: [Float\*](../../sql-reference/data-types/float.md). **Returned value** @@ -444,23 +451,23 @@ Result: │ inf │ 42 │ └─────────┴───────────────────────────────┘ -You can get similar result by using [ternary operator](../../sql-reference/functions/conditional-functions.md#ternary-operator): `isFinite(x) ? x : y`. +You can get similar result by using the [ternary operator](../../sql-reference/functions/conditional-functions.md#ternary-operator): `isFinite(x) ? x : y`. ## isNaN(x) -Accepts Float32 and Float64 and returns UInt8 equal to 1 if the argument is a NaN, otherwise 0. +Returns 1 if the Float32 and Float64 argument is NaN, otherwise this function 0. ## hasColumnInTable(\[‘hostname’\[, ‘username’\[, ‘password’\]\],\] ‘database’, ‘table’, ‘column’) -Accepts constant strings: database name, table name, and column name. Returns a UInt8 constant expression equal to 1 if there is a column, otherwise 0. If the hostname parameter is set, the test will run on a remote server. -The function throws an exception if the table does not exist. +Given the database name, the table name, and the column name as constant strings, returns 1 if the given column exists, otherwise 0. If parameter `hostname` is given, the check is performed on a remote server. +If the table does not exist, an exception is thrown. For elements in a nested data structure, the function checks for the existence of a column. For the nested data structure itself, the function returns 0. ## bar -Allows building a unicode-art diagram. +Builds a bar chart. -`bar(x, min, max, width)` draws a band with a width proportional to `(x - min)` and equal to `width` characters when `x = max`. +`bar(x, min, max, width)` draws a band with width proportional to `(x - min)` and equal to `width` characters when `x = max`. **Arguments** @@ -520,23 +527,23 @@ There are two variations of this function: `x` – What to transform. -`array_from` – Constant array of values for converting. +`array_from` – Constant array of values to convert. `array_to` – Constant array of values to convert the values in ‘from’ to. `default` – Which value to use if ‘x’ is not equal to any of the values in ‘from’. -`array_from` and `array_to` – Arrays of the same size. +`array_from` and `array_to` must have equally many elements. -Types: +Signature: + +For `x` equal to one of the elements in `array_from`, the function returns the corresponding element in `array_to`, i.e. the one at the same array index. Otherwise, it returns `default`. If multiple matching elements exist `array_from`, an arbitrary corresponding element from `array_to` is returned. `transform(T, Array(T), Array(U), U) -> U` `T` and `U` can be numeric, string, or Date or DateTime types. -Where the same letter is indicated (T or U), for numeric types these might not be matching types, but types that have a common type. -For example, the first argument can have the Int64 type, while the second has the Array(UInt16) type. - -If the ‘x’ value is equal to one of the elements in the ‘array_from’ array, it returns the existing element (that is numbered the same) from the ‘array_to’ array. Otherwise, it returns ‘default’. If there are multiple matching elements in ‘array_from’, it returns one of the matches. +The same letter (T or U) means that types must be mutually compatible and not necessarily equal. +For example, the first argument could have type `Int64`, while the second argument could have type `Array(UInt16)`. Example: @@ -560,12 +567,7 @@ ORDER BY c DESC ### transform(x, array_from, array_to) -Differs from the first variation in that the ‘default’ argument is omitted. -If the ‘x’ value is equal to one of the elements in the ‘array_from’ array, it returns the matching element (that is numbered the same) from the ‘array_to’ array. Otherwise, it returns ‘x’. - -Types: - -`transform(T, Array(T), Array(T)) -> T` +Similar to the other variation but has no ‘default’ argument. In case no match can be found, `x` is returned. Example: @@ -595,7 +597,7 @@ LIMIT 10 ## formatReadableDecimalSize(x) -Accepts the size (number of bytes). Returns a rounded size with a suffix (KB, MB, etc.) as a string. +Given a size (number of bytes), this function returns a readable, rounded size with suffix (KB, MB, etc.) as string. Example: @@ -616,7 +618,7 @@ SELECT ## formatReadableSize(x) -Accepts the size (number of bytes). Returns a rounded size with a suffix (KiB, MiB, etc.) as a string. +Given a size (number of bytes), this function returns a readable, rounded size with suffix (KiB, MiB, etc.) as string. Example: @@ -637,9 +639,7 @@ SELECT ## formatReadableQuantity(x) -Accepts the number. Returns a rounded number with a suffix (thousand, million, billion, etc.) as a string. - -It is useful for reading big numbers by human. +Given a number, this function returns a rounded number with suffix (thousand, million, billion, etc.) as string. Example: @@ -660,7 +660,7 @@ SELECT ## formatReadableTimeDelta -Accepts the time delta in seconds. Returns a time delta with (year, month, day, hour, minute, second) as a string. +Given a time interval (delta) in seconds, this function returns a time delta with year/month/day/hour/minute/second as string. **Syntax** @@ -670,8 +670,8 @@ formatReadableTimeDelta(column[, maximum_unit]) **Arguments** -- `column` — A column with numeric time delta. -- `maximum_unit` — Optional. Maximum unit to show. Acceptable values seconds, minutes, hours, days, months, years. +- `column` — A column with a numeric time delta. +- `maximum_unit` — Optional. Maximum unit to show. Acceptable values `seconds`, `minutes`, `hours`, `days`, `months`, `years`. Example: @@ -746,33 +746,32 @@ SELECT parseTimeDelta('1yr2mo') ## least(a, b) -Returns the smallest value from a and b. +Returns the smaller value of a and b. ## greatest(a, b) -Returns the largest value of a and b. +Returns the larger value of a and b. ## uptime() Returns the server’s uptime in seconds. -If it is executed in the context of a distributed table, then it generates a normal column with values relevant to each shard. Otherwise it produces a constant value. +If executed in the context of a distributed table, this function generates a normal column with values relevant to each shard. Otherwise it produces a constant value. ## version() -Returns the version of the server as a string. -If it is executed in the context of a distributed table, then it generates a normal column with values relevant to each shard. Otherwise it produces a constant value. +Returns the server version as a string. +If executed in the context of a distributed table, this function generates a normal column with values relevant to each shard. Otherwise it produces a constant value. ## buildId() Returns the build ID generated by a compiler for the running ClickHouse server binary. -If it is executed in the context of a distributed table, then it generates a normal column with values relevant to each shard. Otherwise it produces a constant value. +If executed in the context of a distributed table, this function generates a normal column with values relevant to each shard. Otherwise it produces a constant value. - -## blockNumber +## blockNumber() Returns the sequence number of the data block where the row is located. -## rowNumberInBlock +## rowNumberInBlock() Returns the ordinal number of the row in the data block. Different data blocks are always recalculated. @@ -782,7 +781,7 @@ Returns the ordinal number of the row in the data block. This function only cons ## neighbor -The window function that provides access to a row at a specified offset which comes before or after the current row of a given column. +The window function that provides access to a row at a specified offset before or after the current row of a given column. **Syntax** @@ -792,23 +791,23 @@ neighbor(column, offset[, default_value]) The result of the function depends on the affected data blocks and the order of data in the block. -:::tip -It can reach the neighbor rows only inside the currently processed data block. +:::note +Only returns neighbor inside the currently processed data block. ::: -The rows order used during the calculation of `neighbor` can differ from the order of rows returned to the user. -To prevent that you can make a subquery with [ORDER BY](../../sql-reference/statements/select/order-by.md) and call the function from outside the subquery. +The order of rows during calculation of `neighbor()` can differ from the order of rows returned to the user. +To prevent that you can create a subquery with [ORDER BY](../../sql-reference/statements/select/order-by.md) and call the function from outside the subquery. **Arguments** - `column` — A column name or scalar expression. -- `offset` — The number of rows forwards or backwards from the current row of `column`. [Int64](../../sql-reference/data-types/int-uint.md). -- `default_value` — Optional. The value to be returned if offset goes beyond the scope of the block. Type of data blocks affected. +- `offset` — The number of rows to look before or ahead of the current row in `column`. [Int64](../../sql-reference/data-types/int-uint.md). +- `default_value` — Optional. The returned value if offset is beyond the block boundaries. Type of data blocks affected. **Returned values** -- Value for `column` in `offset` distance from current row if `offset` value is not outside block bounds. -- Default value for `column` if `offset` value is outside block bounds. If `default_value` is given, then it will be used. +- Value of `column` with `offset` distance from current row, if `offset` is not outside the block boundaries. +- The default value of `column` or `default_value` (if given), if `offset` is outside the block boundaries. Type: type of data blocks affected or default value type. @@ -899,17 +898,17 @@ Result: ## runningDifference(x) -Calculates the difference between successive row values ​​in the data block. -Returns 0 for the first row and the difference from the previous row for each subsequent row. +Calculates the difference between two consecutive row values in the data block. +Returns 0 for the first row, and for subsequent rows the difference to the previous row. -:::tip -It can reach the previous row only inside the currently processed data block. +:::note +Only returns differences inside the currently processed data block. ::: The result of the function depends on the affected data blocks and the order of data in the block. -The rows order used during the calculation of `runningDifference` can differ from the order of rows returned to the user. -To prevent that you can make a subquery with [ORDER BY](../../sql-reference/statements/select/order-by.md) and call the function from outside the subquery. +The order of rows during calculation of `runningDifference()` can differ from the order of rows returned to the user. +To prevent that you can create a subquery with [ORDER BY](../../sql-reference/statements/select/order-by.md) and call the function from outside the subquery. Example: @@ -940,7 +939,7 @@ FROM └─────────┴─────────────────────┴───────┘ ``` -Please note - block size affects the result. With each new block, the `runningDifference` state is reset. +Please note that the block size affects the result. The internal state of `runningDifference` state is reset for each new block. ``` sql SELECT @@ -977,7 +976,7 @@ WHERE diff != 1 ## runningDifferenceStartingWithFirstValue -Same as for [runningDifference](./other-functions.md#other_functions-runningdifference), the difference is the value of the first row, returned the value of the first row, and each subsequent row returns the difference from the previous row. +Same as [runningDifference](./other-functions.md#other_functions-runningdifference), but returns the value of the first row as the value on the first row. ## runningConcurrency @@ -1039,7 +1038,7 @@ Result: ## MACNumToString(num) -Accepts a UInt64 number. Interprets it as a MAC address in big endian. Returns a string containing the corresponding MAC address in the format AA:BB:CC:DD:EE:FF (colon-separated numbers in hexadecimal form). +Interprets a UInt64 number as a MAC address in big endian format. Returns the corresponding MAC address in format AA:BB:CC:DD:EE:FF (colon-separated numbers in hexadecimal form) as string. ## MACStringToNum(s) @@ -1047,11 +1046,12 @@ The inverse function of MACNumToString. If the MAC address has an invalid format ## MACStringToOUI(s) -Accepts a MAC address in the format AA:BB:CC:DD:EE:FF (colon-separated numbers in hexadecimal form). Returns the first three octets as a UInt64 number. If the MAC address has an invalid format, it returns 0. +Given a MAC address in format AA:BB:CC:DD:EE:FF (colon-separated numbers in hexadecimal form), returns the first three octets as a UInt64 number. If the MAC address has an invalid format, it returns 0. ## getSizeOfEnumType Returns the number of fields in [Enum](../../sql-reference/data-types/enum.md). +An exception is thrown if the type is not `Enum`. ``` sql getSizeOfEnumType(value) @@ -1064,7 +1064,6 @@ getSizeOfEnumType(value) **Returned values** - The number of fields with `Enum` input values. -- An exception is thrown if the type is not `Enum`. **Example** @@ -1080,7 +1079,7 @@ SELECT getSizeOfEnumType( CAST('a' AS Enum8('a' = 1, 'b' = 2) ) ) AS x ## blockSerializedSize -Returns size on disk (without taking into account compression). +Returns the size on disk without considering compression. ``` sql blockSerializedSize(value[, value[, ...]]) @@ -1092,7 +1091,7 @@ blockSerializedSize(value[, value[, ...]]) **Returned values** -- The number of bytes that will be written to disk for block of values (without compression). +- The number of bytes that will be written to disk for block of values without compression. **Example** @@ -1112,7 +1111,7 @@ Result: ## toColumnTypeName -Returns the name of the class that represents the data type of the column in RAM. +Returns the internal name of the data type that represents the value. ``` sql toColumnTypeName(value) @@ -1124,31 +1123,39 @@ toColumnTypeName(value) **Returned values** -- A string with the name of the class that is used for representing the `value` data type in RAM. +- The internal data type name used to represent `value`. -**Example of the difference between`toTypeName ' and ' toColumnTypeName`** +**Example** + +Difference between `toTypeName ' and ' toColumnTypeName`: ``` sql SELECT toTypeName(CAST('2018-01-01 01:02:03' AS DateTime)) ``` +Result: + ``` text ┌─toTypeName(CAST('2018-01-01 01:02:03', 'DateTime'))─┐ │ DateTime │ └─────────────────────────────────────────────────────┘ ``` +Query: + ``` sql SELECT toColumnTypeName(CAST('2018-01-01 01:02:03' AS DateTime)) ``` +Result: + ``` text ┌─toColumnTypeName(CAST('2018-01-01 01:02:03', 'DateTime'))─┐ │ Const(UInt32) │ └───────────────────────────────────────────────────────────┘ ``` -The example shows that the `DateTime` data type is stored in memory as `Const(UInt32)`. +The example shows that the `DateTime` data type is internally stored as `Const(UInt32)`. ## dumpColumnStructure @@ -1164,7 +1171,7 @@ dumpColumnStructure(value) **Returned values** -- A string describing the structure that is used for representing the `value` data type in RAM. +- A description of the column structure used for representing `value`. **Example** @@ -1180,7 +1187,7 @@ SELECT dumpColumnStructure(CAST('2018-01-01 01:02:03', 'DateTime')) ## defaultValueOfArgumentType -Outputs the default value for the data type. +Returns the default value for the given data type. Does not include default values for custom columns set by the user. @@ -1200,20 +1207,28 @@ defaultValueOfArgumentType(expression) **Example** +Query: + ``` sql SELECT defaultValueOfArgumentType( CAST(1 AS Int8) ) ``` +Result: + ``` text ┌─defaultValueOfArgumentType(CAST(1, 'Int8'))─┐ │ 0 │ └─────────────────────────────────────────────┘ ``` +Query: + ``` sql SELECT defaultValueOfArgumentType( CAST(1 AS Nullable(Int8) ) ) ``` +Result: + ``` text ┌─defaultValueOfArgumentType(CAST(1, 'Nullable(Int8)'))─┐ │ ᴺᵁᴸᴸ │ @@ -1222,7 +1237,7 @@ SELECT defaultValueOfArgumentType( CAST(1 AS Nullable(Int8) ) ) ## defaultValueOfTypeName -Outputs the default value for given type name. +Returns the default value for the given type name. Does not include default values for custom columns set by the user. @@ -1242,20 +1257,28 @@ defaultValueOfTypeName(type) **Example** +Query: + ``` sql SELECT defaultValueOfTypeName('Int8') ``` +Result: + ``` text ┌─defaultValueOfTypeName('Int8')─┐ │ 0 │ └────────────────────────────────┘ ``` +Query: + ``` sql SELECT defaultValueOfTypeName('Nullable(Int8)') ``` +Result: + ``` text ┌─defaultValueOfTypeName('Nullable(Int8)')─┐ │ ᴺᵁᴸᴸ │ @@ -1263,9 +1286,10 @@ SELECT defaultValueOfTypeName('Nullable(Int8)') ``` ## indexHint -The function is intended for debugging and introspection purposes. The function ignores it's argument and always returns 1. Arguments are not even evaluated. -But for the purpose of index analysis, the argument of this function is analyzed as if it was present directly without being wrapped inside `indexHint` function. This allows to select data in index ranges by the corresponding condition but without further filtering by this condition. The index in ClickHouse is sparse and using `indexHint` will yield more data than specifying the same condition directly. +This function is intended for debugging and introspection. It ignores its argument and always returns 1. The arguments are not evaluated. + +But during index analysis, the argument of this function is assumed to be not wrapped in `indexHint`. This allows to select data in index ranges by the corresponding condition but without further filtering by this condition. The index in ClickHouse is sparse and using `indexHint` will yield more data than specifying the same condition directly. **Syntax** @@ -1275,13 +1299,13 @@ SELECT * FROM table WHERE indexHint() **Returned value** -1. Type: [Uint8](https://clickhouse.com/docs/en/data_types/int_uint/#diapazony-uint). +Type: [Uint8](https://clickhouse.com/docs/en/data_types/int_uint/#diapazony-uint). **Example** Here is the example of test data from the table [ontime](../../getting-started/example-datasets/ontime.md). -Input table: +Table: ```sql SELECT count() FROM ontime @@ -1295,9 +1319,7 @@ SELECT count() FROM ontime The table has indexes on the fields `(FlightDate, (Year, FlightDate))`. -Create a query, where the index is not used. - -Query: +Create a query which does not use the index: ```sql SELECT FlightDate AS k, count() FROM ontime GROUP BY k ORDER BY k @@ -1318,15 +1340,13 @@ Result: └────────────┴─────────┘ ``` -To apply the index, select a specific date. - -Query: +To apply the index, select a specific date: ```sql SELECT FlightDate AS k, count() FROM ontime WHERE k = '2017-09-15' GROUP BY k ORDER BY k ``` -By using the index, ClickHouse processed a significantly smaller number of rows (`Processed 32.74 thousand rows`). +ClickHouse now uses the index to process a significantly smaller number of rows (`Processed 32.74 thousand rows`). Result: @@ -1336,7 +1356,7 @@ Result: └────────────┴─────────┘ ``` -Now wrap the expression `k = '2017-09-15'` into `indexHint` function. +Now wrap the expression `k = '2017-09-15'` in function `indexHint`: Query: @@ -1350,9 +1370,9 @@ GROUP BY k ORDER BY k ASC ``` -ClickHouse used the index in the same way as the previous time (`Processed 32.74 thousand rows`). +ClickHouse used the index the same way as previously (`Processed 32.74 thousand rows`). The expression `k = '2017-09-15'` was not used when generating the result. -In examle the `indexHint` function allows to see adjacent dates. +In example, the `indexHint` function allows to see adjacent dates. Result: @@ -1369,7 +1389,7 @@ Result: Creates an array with a single value. -Used for internal implementation of [arrayJoin](../../sql-reference/functions/array-join.md#functions_arrayjoin). +Used for the internal implementation of [arrayJoin](../../sql-reference/functions/array-join.md#functions_arrayjoin). ``` sql SELECT replicate(x, arr); @@ -1377,12 +1397,12 @@ SELECT replicate(x, arr); **Arguments:** -- `arr` — Original array. ClickHouse creates a new array of the same length as the original and fills it with the value `x`. -- `x` — The value that the resulting array will be filled with. +- `arr` — An array. +- `x` — The value to fill the result array with. **Returned value** -An array filled with the value `x`. +An array of the lame length as `arr` filled with value `x`. Type: `Array`. @@ -1404,7 +1424,7 @@ Result: ## filesystemAvailable -Returns amount of remaining space on the filesystem where the files of the databases located. It is always smaller than total free space ([filesystemFree](#filesystemfree)) because some space is reserved for OS. +Returns the amount of free space in the filesystem hosting the database persistence. The returned value is always smaller than total free space ([filesystemFree](#filesystemfree)) because some space is reserved for the operating system. **Syntax** @@ -1423,20 +1443,20 @@ Type: [UInt64](../../sql-reference/data-types/int-uint.md). Query: ``` sql -SELECT formatReadableSize(filesystemAvailable()) AS "Available space", toTypeName(filesystemAvailable()) AS "Type"; +SELECT formatReadableSize(filesystemAvailable()) AS "Available space"; ``` Result: ``` text -┌─Available space─┬─Type───┐ -│ 30.75 GiB │ UInt64 │ -└─────────────────┴────────┘ +┌─Available space─┐ +│ 30.75 GiB │ +└─────────────────┘ ``` ## filesystemFree -Returns total amount of the free space on the filesystem where the files of the databases located. See also `filesystemAvailable` +Returns the total amount of the free space on the filesystem hosting the database persistence. See also `filesystemAvailable` **Syntax** @@ -1446,7 +1466,7 @@ filesystemFree() **Returned value** -- Amount of free space in bytes. +- The amount of free space in bytes. Type: [UInt64](../../sql-reference/data-types/int-uint.md). @@ -1455,20 +1475,20 @@ Type: [UInt64](../../sql-reference/data-types/int-uint.md). Query: ``` sql -SELECT formatReadableSize(filesystemFree()) AS "Free space", toTypeName(filesystemFree()) AS "Type"; +SELECT formatReadableSize(filesystemFree()) AS "Free space"; ``` Result: ``` text -┌─Free space─┬─Type───┐ -│ 32.39 GiB │ UInt64 │ -└────────────┴────────┘ +┌─Free space─┐ +│ 32.39 GiB │ +└────────────┘ ``` ## filesystemCapacity -Returns the capacity of the filesystem in bytes. For evaluation, the [path](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-path) to the data directory must be configured. +Returns the capacity of the filesystem in bytes. Needs the [path](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-path) to the data directory to be configured. **Syntax** @@ -1478,7 +1498,7 @@ filesystemCapacity() **Returned value** -- Capacity information of the filesystem in bytes. +- Capacity of the filesystem in bytes. Type: [UInt64](../../sql-reference/data-types/int-uint.md). @@ -1487,20 +1507,20 @@ Type: [UInt64](../../sql-reference/data-types/int-uint.md). Query: ``` sql -SELECT formatReadableSize(filesystemCapacity()) AS "Capacity", toTypeName(filesystemCapacity()) AS "Type" +SELECT formatReadableSize(filesystemCapacity()) AS "Capacity"; ``` Result: ``` text -┌─Capacity──┬─Type───┐ -│ 39.32 GiB │ UInt64 │ -└───────────┴────────┘ +┌─Capacity──┐ +│ 39.32 GiB │ +└───────────┘ ``` ## initializeAggregation -Calculates result of aggregate function based on single value. It is intended to use this function to initialize aggregate functions with combinator [-State](../../sql-reference/aggregate-functions/combinators.md#agg-functions-combinator-state). You can create states of aggregate functions and insert them to columns of type [AggregateFunction](../../sql-reference/data-types/aggregatefunction.md#data-type-aggregatefunction) or use initialized aggregates as default values. +Calculates the result of an aggregate function based on a single value. This function can be used to initialize aggregate functions with combinator [-State](../../sql-reference/aggregate-functions/combinators.md#agg-functions-combinator-state). You can create states of aggregate functions and insert them to columns of type [AggregateFunction](../../sql-reference/data-types/aggregatefunction.md#data-type-aggregatefunction) or use initialized aggregates as default values. **Syntax** @@ -1539,6 +1559,7 @@ Query: ```sql SELECT finalizeAggregation(state), toTypeName(state) FROM (SELECT initializeAggregation('sumState', number % 3) AS state FROM numbers(5)); ``` + Result: ```text @@ -1568,11 +1589,12 @@ INSERT INTO metrics VALUES (0, initializeAggregation('sumState', toUInt64(42))) ``` **See Also** + - [arrayReduce](../../sql-reference/functions/array-functions.md#arrayreduce) ## finalizeAggregation -Takes state of aggregate function. Returns result of aggregation (or finalized state when using[-State](../../sql-reference/aggregate-functions/combinators.md#agg-functions-combinator-state) combinator). +Given a state of aggregate function, this function returns the result of aggregation (or finalized state when using a [-State](../../sql-reference/aggregate-functions/combinators.md#agg-functions-combinator-state) combinator). **Syntax** @@ -1667,15 +1689,16 @@ Result: ``` **See Also** + - [arrayReduce](../../sql-reference/functions/array-functions.md#arrayreduce) - [initializeAggregation](#initializeaggregation) ## runningAccumulate -Accumulates states of an aggregate function for each row of a data block. +Accumulates the states of an aggregate function for each row of a data block. -:::tip -The state is reset for each new data block. +:::note +The state is reset for each new block of data. ::: **Syntax** @@ -1726,10 +1749,10 @@ The subquery generates `sumState` for every number from `0` to `9`. `sumState` r The whole query does the following: -1. For the first row, `runningAccumulate` takes `sumState(0)` and returns `0`. -2. For the second row, the function merges `sumState(0)` and `sumState(1)` resulting in `sumState(0 + 1)`, and returns `1` as a result. -3. For the third row, the function merges `sumState(0 + 1)` and `sumState(2)` resulting in `sumState(0 + 1 + 2)`, and returns `3` as a result. -4. The actions are repeated until the block ends. +1. For the first row, `runningAccumulate` takes `sumState(0)` and returns `0`. +2. For the second row, the function merges `sumState(0)` and `sumState(1)` resulting in `sumState(0 + 1)`, and returns `1` as a result. +3. For the third row, the function merges `sumState(0 + 1)` and `sumState(2)` resulting in `sumState(0 + 1 + 2)`, and returns `3` as a result. +4. The actions are repeated until the block ends. The following example shows the `groupping` parameter usage: @@ -1780,7 +1803,7 @@ As you can see, `runningAccumulate` merges states for each group of rows separat The function lets you extract data from the table the same way as from a [dictionary](../../sql-reference/dictionaries/index.md). -Gets data from [Join](../../engines/table-engines/special/join.md#creating-a-table) tables using the specified join key. +Gets the data from [Join](../../engines/table-engines/special/join.md#creating-a-table) tables using the specified join key. Only supports tables created with the `ENGINE = Join(ANY, LEFT, )` statement. @@ -1792,13 +1815,13 @@ joinGet(join_storage_table_name, `value_column`, join_keys) **Arguments** -- `join_storage_table_name` — an [identifier](../../sql-reference/syntax.md#syntax-identifiers) indicates where search is performed. The identifier is searched in the default database (see parameter `default_database` in the config file). To override the default database, use the `USE db_name` or specify the database and the table through the separator `db_name.db_table`, see the example. +- `join_storage_table_name` — an [identifier](../../sql-reference/syntax.md#syntax-identifiers) indicating where the search is performed. The identifier is searched in the default database (see setting `default_database` in the config file). To override the default database, use `USE db_name` or specify the database and the table through the separator `db_name.db_table` as in the example. - `value_column` — name of the column of the table that contains required data. - `join_keys` — list of keys. **Returned value** -Returns list of values corresponded to list of keys. +Returns a list of values corresponded to list of keys. If certain does not exist in source table then `0` or `null` will be returned based on [join_use_nulls](../../operations/settings/settings.md#join_use_nulls) setting. @@ -1825,7 +1848,7 @@ INSERT INTO db_test.id_val VALUES (1,11)(2,12)(4,13) Query: ``` sql -SELECT joinGet(db_test.id_val,'val',toUInt32(number)) from numbers(4) SETTINGS join_use_nulls = 1 +SELECT joinGet(db_test.id_val, 'val', toUInt32(number)) from numbers(4) SETTINGS join_use_nulls = 1 ``` Result: @@ -1845,7 +1868,7 @@ Result: This function is not available in ClickHouse Cloud. ::: -Evaluate external catboost model. [CatBoost](https://catboost.ai) is an open-source gradient boosting library developed by Yandex for machine learing. +Evaluate an external catboost model. [CatBoost](https://catboost.ai) is an open-source gradient boosting library developed by Yandex for machine learing. Accepts a path to a catboost model and model arguments (features). Returns Float64. ``` sql @@ -1886,16 +1909,24 @@ See [Training and applying models](https://catboost.ai/docs/features/training.ht ## throwIf(x\[, message\[, error_code\]\]) -Throw an exception if the argument is non zero. -`message` - is an optional parameter: a constant string providing a custom error message -`error_code` - is an optional parameter: a constant integer providing a custom error code +Throw an exception if argument `x` is true. + +**Arguments** + +- `x` - the condition to check. +- `message` - a constant string providing a custom error message. Optional. +- `error_code` - A constant integer providing a custom error code. Optional. To use the `error_code` argument, configuration parameter `allow_custom_error_code_in_throwif` must be enabled. +**Example** + ``` sql SELECT throwIf(number = 3, 'Too many') FROM numbers(10); ``` +Result: + ``` text ↙ Progress: 0.00 rows, 0.00 B (0.00 rows/s., 0.00 B/s.) Received exception from server (version 19.14.1): Code: 395. DB::Exception: Received from localhost:9000. DB::Exception: Too many. @@ -1903,7 +1934,7 @@ Code: 395. DB::Exception: Received from localhost:9000. DB::Exception: Too many. ## identity -Returns the same value that was used as its argument. Used for debugging and testing, allows to cancel using index, and get the query performance of a full scan. When query is analyzed for possible use of index, the analyzer does not look inside `identity` functions. Also constant folding is not applied too. +Returns its argument. Intended for debugging and testing. Allows to cancel using index, and get the query performance of a full scan. When the query is analyzed for possible use of an index, the analyzer ignores everything in `identity` functions. Also disables constant folding. **Syntax** @@ -1916,7 +1947,7 @@ identity(x) Query: ``` sql -SELECT identity(42) +SELECT identity(42); ``` Result: @@ -1927,164 +1958,6 @@ Result: └──────────────┘ ``` -## randomPrintableASCII - -Generates a string with a random set of [ASCII](https://en.wikipedia.org/wiki/ASCII#Printable_characters) printable characters. - -**Syntax** - -``` sql -randomPrintableASCII(length) -``` - -**Arguments** - -- `length` — Resulting string length. Positive integer. - - If you pass `length < 0`, behavior of the function is undefined. - -**Returned value** - -- String with a random set of [ASCII](https://en.wikipedia.org/wiki/ASCII#Printable_characters) printable characters. - -Type: [String](../../sql-reference/data-types/string.md) - -**Example** - -``` sql -SELECT number, randomPrintableASCII(30) as str, length(str) FROM system.numbers LIMIT 3 -``` - -``` text -┌─number─┬─str────────────────────────────┬─length(randomPrintableASCII(30))─┐ -│ 0 │ SuiCOSTvC0csfABSw=UcSzp2.`rv8x │ 30 │ -│ 1 │ 1Ag NlJ &RCN:*>HVPG;PE-nO"SUFD │ 30 │ -│ 2 │ /"+<"wUTh:=LjJ Vm!c&hI*m#XTfzz │ 30 │ -└────────┴────────────────────────────────┴──────────────────────────────────┘ -``` - -## randomString - -Generates a binary string of the specified length filled with random bytes (including zero bytes). - -**Syntax** - -``` sql -randomString(length) -``` - -**Arguments** - -- `length` — String length. Positive integer. - -**Returned value** - -- String filled with random bytes. - -Type: [String](../../sql-reference/data-types/string.md). - -**Example** - -Query: - -``` sql -SELECT randomString(30) AS str, length(str) AS len FROM numbers(2) FORMAT Vertical; -``` - -Result: - -``` text -Row 1: -────── -str: 3 G : pT ?w тi k aV f6 -len: 30 - -Row 2: -────── -str: 9 ,] ^ ) ]?? 8 -len: 30 -``` - -**See Also** - -- [generateRandom](../../sql-reference/table-functions/generate.md#generaterandom) -- [randomPrintableASCII](../../sql-reference/functions/other-functions.md#randomascii) - - -## randomFixedString - -Generates a binary string of the specified length filled with random bytes (including zero bytes). - -**Syntax** - -``` sql -randomFixedString(length); -``` - -**Arguments** - -- `length` — String length in bytes. [UInt64](../../sql-reference/data-types/int-uint.md). - -**Returned value(s)** - -- String filled with random bytes. - -Type: [FixedString](../../sql-reference/data-types/fixedstring.md). - -**Example** - -Query: - -```sql -SELECT randomFixedString(13) as rnd, toTypeName(rnd) -``` - -Result: - -```text -┌─rnd──────┬─toTypeName(randomFixedString(13))─┐ -│ j▒h㋖HɨZ'▒ │ FixedString(13) │ -└──────────┴───────────────────────────────────┘ - -``` - -## randomStringUTF8 - -Generates a random string of a specified length. Result string contains valid UTF-8 code points. The value of code points may be outside of the range of assigned Unicode. - -**Syntax** - -``` sql -randomStringUTF8(length); -``` - -**Arguments** - -- `length` — Required length of the resulting string in code points. [UInt64](../../sql-reference/data-types/int-uint.md). - -**Returned value(s)** - -- UTF-8 random string. - -Type: [String](../../sql-reference/data-types/string.md). - -**Example** - -Query: - -```sql -SELECT randomStringUTF8(13) -``` - -Result: - -```text -┌─randomStringUTF8(13)─┐ -│ 𘤗𙉝д兠庇󡅴󱱎󦐪􂕌𔊹𓰛 │ -└──────────────────────┘ - -``` - ## getSetting Returns the current value of a [custom setting](../../operations/settings/index.md#custom_settings). @@ -2101,7 +1974,7 @@ getSetting('custom_setting'); **Returned value** -- The setting current value. +- The setting's current value. **Example** @@ -2110,7 +1983,7 @@ SET custom_a = 123; SELECT getSetting('custom_a'); ``` -**Result** +Result: ``` 123 @@ -2122,7 +1995,7 @@ SELECT getSetting('custom_a'); ## isDecimalOverflow -Checks whether the [Decimal](../../sql-reference/data-types/decimal.md) value is out of its (or specified) precision. +Checks whether the [Decimal](../../sql-reference/data-types/decimal.md) value is outside its precision or outside the specified precision. **Syntax** @@ -2133,11 +2006,11 @@ isDecimalOverflow(d, [p]) **Arguments** - `d` — value. [Decimal](../../sql-reference/data-types/decimal.md). -- `p` — precision. Optional. If omitted, the initial precision of the first argument is used. Using of this paratemer could be helpful for data extraction to another DBMS or file. [UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges). +- `p` — precision. Optional. If omitted, the initial precision of the first argument is used. This paratemer can be helpful to migrate data from/to another database or file. [UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges). **Returned values** -- `1` — Decimal value has more digits then it's precision allow, +- `1` — Decimal value has more digits then allowed by its precision, - `0` — Decimal value satisfies the specified precision. **Example** @@ -2159,7 +2032,7 @@ Result: ## countDigits -Returns number of decimal digits you need to represent the value. +Returns number of decimal digits need to represent a value. **Syntax** @@ -2199,9 +2072,7 @@ Result: ## errorCodeToName -**Returned value** - -- Variable name for the error code. +Returns the textual name of an error code. Type: [LowCardinality(String)](../../sql-reference/data-types/lowcardinality.md). @@ -2220,7 +2091,7 @@ UNSUPPORTED_METHOD ## tcpPort Returns [native interface](../../interfaces/tcp.md) TCP port number listened by this server. -If it is executed in the context of a distributed table, then it generates a normal column, otherwise it produces a constant value. +If executed in the context of a distributed table, this function generates a normal column with values relevant to each shard. Otherwise it produces a constant value. **Syntax** @@ -2310,7 +2181,7 @@ Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-refere ## currentRoles -Returns the names of the roles which are current for the current user. The current roles can be changed by the [SET ROLE](../../sql-reference/statements/set-role.md#set-role-statement) statement. If the `SET ROLE` statement was not used, the function `currentRoles` returns the same as `defaultRoles`. +Returns the roles assigned to the current user. The roles can be changed by the [SET ROLE](../../sql-reference/statements/set-role.md#set-role-statement) statement. If no `SET ROLE` statement was not, the function `currentRoles` returns the same as `defaultRoles`. **Syntax** @@ -2320,7 +2191,7 @@ currentRoles() **Returned value** -- List of the current roles for the current user. +- A list of the current roles for the current user. Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). @@ -2342,7 +2213,7 @@ Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-refere ## defaultRoles -Returns the names of the roles which are enabled by default for the current user when he logins. Initially these are all roles granted to the current user (see [GRANT](../../sql-reference/statements/grant.md#grant-select)), but that can be changed with the [SET DEFAULT ROLE](../../sql-reference/statements/set-role.md#set-default-role-statement) statement. +Returns the roles which are enabled by default for the current user when he logs in. Initially these are all roles granted to the current user (see [GRANT](../../sql-reference/statements/grant.md#grant-select)), but that can be changed with the [SET DEFAULT ROLE](../../sql-reference/statements/set-role.md#set-default-role-statement) statement. **Syntax** @@ -2358,7 +2229,7 @@ Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-refere ## getServerPort -Returns the number of the server port. When the port is not used by the server, throws an exception. +Returns the server port number. When the port is not used by the server, throws an exception. **Syntax** @@ -2407,7 +2278,7 @@ Result: Returns the ID of the current query. Other parameters of a query can be extracted from the [system.query_log](../../operations/system-tables/query_log.md) table via `query_id`. -In contrast to [initialQueryID](#initial-query-id) function, `queryID` can return different results on different shards (see example). +In contrast to [initialQueryID](#initial-query-id) function, `queryID` can return different results on different shards (see the example). **Syntax** @@ -2477,7 +2348,7 @@ Result: ## shardNum -Returns the index of a shard which processes a part of data for a distributed query. Indices are started from `1`. +Returns the index of a shard which processes a part of data in a distributed query. Indices are started from `1`. If a query is not distributed then constant value `0` is returned. **Syntax** diff --git a/docs/en/sql-reference/functions/random-functions.md b/docs/en/sql-reference/functions/random-functions.md index e90d537fb74..63d5174b494 100644 --- a/docs/en/sql-reference/functions/random-functions.md +++ b/docs/en/sql-reference/functions/random-functions.md @@ -31,7 +31,7 @@ Uses a linear congruential generator. ## randCanonical -Returns a Float64 value, evenly distributed in [0, 1). +Returns a random Float64 value, evenly distributed in interval [0, 1). ## randConstant @@ -54,11 +54,9 @@ Result: └────────────┴────────────┴──────────────┴────────────────┴─────────────────┴──────────────────────┘ ``` -# Functions for Generating Random Numbers based on a Distribution - ## randUniform -Returns a Float64 drawn uniformly from the interval between `min` and `max` ([continuous uniform distribution](https://en.wikipedia.org/wiki/Continuous_uniform_distribution)). +Returns a random Float64 drawn uniformly from interval [`min`, `max`) ([continuous uniform distribution](https://en.wikipedia.org/wiki/Continuous_uniform_distribution)). **Syntax** @@ -68,8 +66,8 @@ randUniform(min, max) **Arguments** -- `min` - `Float64` - min value of the range, -- `max` - `Float64` - max value of the range. +- `min` - `Float64` - left boundary of the range, +- `max` - `Float64` - reight boundary of the range. **Returned value** @@ -97,7 +95,7 @@ Result: ## randNormal -Returns a Float64 drawn from a [normal distribution](https://en.wikipedia.org/wiki/Normal_distribution). +Returns a random Float64 drawn from a [normal distribution](https://en.wikipedia.org/wiki/Normal_distribution). **Syntax** @@ -108,7 +106,7 @@ randNormal(mean, variance) **Arguments** - `mean` - `Float64` - mean value of distribution, -- `variance` - `Float64` - [variance](https://en.wikipedia.org/wiki/Variance). +- `variance` - `Float64` - [variance](https://en.wikipedia.org/wiki/Variance) of the distribution. **Returned value** @@ -136,7 +134,7 @@ Result: ## randLogNormal -Returns a Float64 drawn from a [log-normal distribution](https://en.wikipedia.org/wiki/Log-normal_distribution). +Returns a random Float64 drawn from a [log-normal distribution](https://en.wikipedia.org/wiki/Log-normal_distribution). **Syntax** @@ -147,7 +145,7 @@ randLogNormal(mean, variance) **Arguments** - `mean` - `Float64` - mean value of distribution, -- `variance` - `Float64` - [variance](https://en.wikipedia.org/wiki/Variance). +- `variance` - `Float64` - [variance](https://en.wikipedia.org/wiki/Variance) of the distribution. **Returned value** @@ -175,7 +173,7 @@ Result: ## randBinomial -Returns a UInt64 drawn from a [binomial distribution](https://en.wikipedia.org/wiki/Binomial_distribution). +Returns a random UInt64 drawn from a [binomial distribution](https://en.wikipedia.org/wiki/Binomial_distribution). **Syntax** @@ -186,7 +184,7 @@ randBinomial(experiments, probability) **Arguments** - `experiments` - `UInt64` - number of experiments, -- `probability` - `Float64` - probability of success in each experiment (values in `0...1` range only). +- `probability` - `Float64` - probability of success in each experiment, a value between 0 and 1. **Returned value** @@ -214,7 +212,7 @@ Result: ## randNegativeBinomial -Returns a UInt64 drawn from a [negative binomial distribution](https://en.wikipedia.org/wiki/Negative_binomial_distribution). +Returns a random UInt64 drawn from a [negative binomial distribution](https://en.wikipedia.org/wiki/Negative_binomial_distribution). **Syntax** @@ -225,7 +223,7 @@ randNegativeBinomial(experiments, probability) **Arguments** - `experiments` - `UInt64` - number of experiments, -- `probability` - `Float64` - probability of failure in each experiment (values in `0...1` range only). +- `probability` - `Float64` - probability of failure in each experiment, a value between 0 and 1. **Returned value** @@ -253,7 +251,7 @@ Result: ## randPoisson -Returns a UInt64 drawn from a [Poisson distribution](https://en.wikipedia.org/wiki/Poisson_distribution). +Returns a random UInt64 drawn from a [Poisson distribution](https://en.wikipedia.org/wiki/Poisson_distribution). **Syntax** @@ -291,7 +289,7 @@ Result: ## randBernoulli -Returns a UInt64 drawn from a [Bernoulli distribution](https://en.wikipedia.org/wiki/Bernoulli_distribution). +Returns a random UInt64 drawn from a [Bernoulli distribution](https://en.wikipedia.org/wiki/Bernoulli_distribution). **Syntax** @@ -301,7 +299,7 @@ randBernoulli(probability) **Arguments** -- `probability` - `Float64` - probability of success (values in `0...1` range only). +- `probability` - `Float64` - probability of success, a value between 0 and 1. **Returned value** @@ -329,7 +327,7 @@ Result: ## randExponential -Returns a Float64 drawn from a [exponential distribution](https://en.wikipedia.org/wiki/Exponential_distribution). +Returns a random Float64 drawn from a [exponential distribution](https://en.wikipedia.org/wiki/Exponential_distribution). **Syntax** @@ -367,7 +365,7 @@ Result: ## randChiSquared -Returns a Float64 drawn from a [Chi-square distribution](https://en.wikipedia.org/wiki/Chi-squared_distribution) - a distribution of a sum of the squares of k independent standard normal random variables. +Returns a random Float64 drawn from a [Chi-square distribution](https://en.wikipedia.org/wiki/Chi-squared_distribution) - a distribution of a sum of the squares of k independent standard normal random variables. **Syntax** @@ -405,7 +403,7 @@ Result: ## randStudentT -Returns a Float64 drawn from a [Student's t-distribution](https://en.wikipedia.org/wiki/Student%27s_t-distribution). +Returns a random Float64 drawn from a [Student's t-distribution](https://en.wikipedia.org/wiki/Student%27s_t-distribution). **Syntax** @@ -443,7 +441,7 @@ Result: ## randFisherF -Returns a Float64 drawn from a [F-distribution](https://en.wikipedia.org/wiki/F-distribution). +Returns a random Float64 drawn from a [F-distribution](https://en.wikipedia.org/wiki/F-distribution). **Syntax** @@ -480,47 +478,160 @@ Result: └─────────────────────┘ ``` -# Functions for Generating Random Strings - ## randomString -Returns a random String of specified `length`. Not all characters may be printable. +Generates a string of the specified length filled with random bytes (including zero bytes). Not all characters may be printable. **Syntax** -```sql +``` sql randomString(length) ``` +**Arguments** + +- `length` — String length in bytes. Positive integer. + +**Returned value** + +- String filled with random bytes. + +Type: [String](../../sql-reference/data-types/string.md). + +**Example** + +Query: + +``` sql +SELECT randomString(30) AS str, length(str) AS len FROM numbers(2) FORMAT Vertical; +``` + +Result: + +``` text +Row 1: +────── +str: 3 G : pT ?w тi k aV f6 +len: 30 + +Row 2: +────── +str: 9 ,] ^ ) ]?? 8 +len: 30 +``` + ## randomFixedString -Like `randomString` but returns a FixedString. - -## randomPrintableASCII - -Returns a random String of specified `length`. All characters are printable. +Generates a binary string of the specified length filled with random bytes (including zero bytes). Not all characters may be printable. **Syntax** +``` sql +randomFixedString(length); +``` + +**Arguments** + +- `length` — String length in bytes. [UInt64](../../sql-reference/data-types/int-uint.md). + +**Returned value(s)** + +- String filled with random bytes. + +Type: [FixedString](../../sql-reference/data-types/fixedstring.md). + +**Example** + +Query: + ```sql +SELECT randomFixedString(13) as rnd, toTypeName(rnd) +``` + +Result: + +```text +┌─rnd──────┬─toTypeName(randomFixedString(13))─┐ +│ j▒h㋖HɨZ'▒ │ FixedString(13) │ +└──────────┴───────────────────────────────────┘ +``` + +## randomPrintableASCII + +Generates a string with a random set of [ASCII](https://en.wikipedia.org/wiki/ASCII#Printable_characters) characters. All characters are printable. +If you pass `length < 0`, the behavior of the function is undefined. + +**Syntax** + +``` sql randomPrintableASCII(length) ``` +**Arguments** + +- `length` — String length in bytes. Positive integer. + +**Returned value** + +- String with a random set of [ASCII](https://en.wikipedia.org/wiki/ASCII#Printable_characters) printable characters. + +Type: [String](../../sql-reference/data-types/string.md) + +**Example** + +``` sql +SELECT number, randomPrintableASCII(30) as str, length(str) FROM system.numbers LIMIT 3 +``` + +``` text +┌─number─┬─str────────────────────────────┬─length(randomPrintableASCII(30))─┐ +│ 0 │ SuiCOSTvC0csfABSw=UcSzp2.`rv8x │ 30 │ +│ 1 │ 1Ag NlJ &RCN:*>HVPG;PE-nO"SUFD │ 30 │ +│ 2 │ /"+<"wUTh:=LjJ Vm!c&hI*m#XTfzz │ 30 │ +└────────┴────────────────────────────────┴──────────────────────────────────┘ +``` + ## randomStringUTF8 -Returns a random String containing `length` many UTF8 codepoints. Not all characters may be printable +Generates a random string of a specified length. Result string contains valid UTF-8 code points. The value of code points may be outside of the range of assigned Unicode. **Syntax** +``` sql +randomStringUTF8(length); +``` + +**Arguments** + +- `length` — Length of the string in code points. [UInt64](../../sql-reference/data-types/int-uint.md). + +**Returned value(s)** + +- UTF-8 random string. + +Type: [String](../../sql-reference/data-types/string.md). + +**Example** + +Query: + ```sql -randomStringUTF8(length) +SELECT randomStringUTF8(13) +``` + +Result: + +```text +┌─randomStringUTF8(13)─┐ +│ 𘤗𙉝д兠庇󡅴󱱎󦐪􂕌𔊹𓰛 │ +└──────────────────────┘ ``` ## fuzzBits **Syntax** -Inverts the bits of String or FixedString `s`, each with probability `prob`. +Flips the bits of String or FixedString `s`, each with probability `prob`. **Syntax** @@ -529,8 +640,8 @@ fuzzBits(s, prob) ``` **Arguments** -- `s` - `String` or `FixedString` -- `prob` - constant `Float32/64` +- `s` - `String` or `FixedString`, +- `prob` - constant `Float32/64` between 0.0 and 1.0. **Returned value** From 0181ea63993bc6dd9c924c092bc7294a1bf71e2f Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Fri, 2 Jun 2023 12:55:46 +0300 Subject: [PATCH 0295/1072] Revert "make filter push down through cross join" --- .../Optimizations/filterPushDown.cpp | 6 +++--- .../01763_filter_push_down_bugs.reference | 19 ------------------- .../01763_filter_push_down_bugs.sql | 19 ------------------- 3 files changed, 3 insertions(+), 41 deletions(-) diff --git a/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp b/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp index db29038999b..37bc894339f 100644 --- a/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp +++ b/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp @@ -272,7 +272,7 @@ size_t tryPushDownFilter(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes { /// If totals step has HAVING expression, skip it for now. /// TODO: - /// We can merge HAVING expression with current filter. + /// We can merge HAVING expression with current filer. /// Also, we can push down part of HAVING which depend only on aggregation keys. if (totals_having->getActions()) return 0; @@ -323,9 +323,9 @@ size_t tryPushDownFilter(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes { const auto & table_join = join ? join->getJoin()->getTableJoin() : filled_join->getJoin()->getTableJoin(); - /// Only inner, cross and left(/right) join are supported. Other types may generate default values for left table keys. + /// Only inner and left(/right) join are supported. Other types may generate default values for left table keys. /// So, if we push down a condition like `key != 0`, not all rows may be filtered. - if (table_join.kind() != JoinKind::Inner && table_join.kind() != JoinKind::Cross && table_join.kind() != kind) + if (table_join.kind() != JoinKind::Inner && table_join.kind() != kind) return 0; bool is_left = kind == JoinKind::Left; diff --git a/tests/queries/0_stateless/01763_filter_push_down_bugs.reference b/tests/queries/0_stateless/01763_filter_push_down_bugs.reference index 7df35e2948d..5aa2e645509 100644 --- a/tests/queries/0_stateless/01763_filter_push_down_bugs.reference +++ b/tests/queries/0_stateless/01763_filter_push_down_bugs.reference @@ -6,22 +6,3 @@ String1_0 String2_0 String3_0 String4_0 1 String1_0 String2_0 String3_0 String4_0 1 1 [0,1,2] 1 -Expression ((Projection + Before ORDER BY)) - Filter (WHERE) - Join (JOIN FillRightFirst) - Filter (( + Before JOIN)) - ReadFromMergeTree (default.t1) - Indexes: - PrimaryKey - Keys: - id - Condition: (id in [101, 101]) - Parts: 1/1 - Granules: 1/1 - Expression ((Joined actions + (Rename joined columns + (Projection + Before ORDER BY)))) - ReadFromMergeTree (default.t2) - Indexes: - PrimaryKey - Condition: true - Parts: 1/1 - Granules: 1/1 diff --git a/tests/queries/0_stateless/01763_filter_push_down_bugs.sql b/tests/queries/0_stateless/01763_filter_push_down_bugs.sql index 2ee249b5ce7..1058bf75144 100644 --- a/tests/queries/0_stateless/01763_filter_push_down_bugs.sql +++ b/tests/queries/0_stateless/01763_filter_push_down_bugs.sql @@ -38,25 +38,6 @@ DROP TABLE IF EXISTS Test; select x, y from (select [0, 1, 2] as y, 1 as a, 2 as b) array join y as x where a = 1 and b = 2 and (x = 1 or x != 1) and x = 1; -DROP TABLE IF EXISTS t; create table t(a UInt8) engine=MergeTree order by a; insert into t select * from numbers(2); select a from t t1 join t t2 on t1.a = t2.a where t1.a; -DROP TABLE IF EXISTS t; - -DROP TABLE IF EXISTS t1; -DROP TABLE IF EXISTS t2; -CREATE TABLE t1 (id Int64, create_time DateTime) ENGINE = MergeTree ORDER BY id; -CREATE TABLE t2 (delete_time DateTime) ENGINE = MergeTree ORDER BY delete_time; - -insert into t1 values (101, '2023-05-28 00:00:00'), (102, '2023-05-28 00:00:00'); -insert into t2 values ('2023-05-31 00:00:00'); - -EXPLAIN indexes=1 SELECT id, delete_time FROM t1 - CROSS JOIN ( - SELECT delete_time - FROM t2 -) AS d WHERE create_time < delete_time AND id = 101; - -DROP TABLE IF EXISTS t1; -DROP TABLE IF EXISTS t2; From 0ddd53088d48134cdbd55bcda2e6bc3bfad423de Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Wed, 31 May 2023 23:17:41 +0200 Subject: [PATCH 0296/1072] Add a new runner type for ci metrics and autoscaling --- tests/ci/autoscale_runners_lambda/app.py | 4 ++++ tests/ci/lambda_shared_package/lambda_shared/__init__.py | 1 + 2 files changed, 5 insertions(+) diff --git a/tests/ci/autoscale_runners_lambda/app.py b/tests/ci/autoscale_runners_lambda/app.py index ab09afb3aa8..825708cabbc 100644 --- a/tests/ci/autoscale_runners_lambda/app.py +++ b/tests/ci/autoscale_runners_lambda/app.py @@ -65,6 +65,10 @@ def get_scales(runner_type: str) -> Tuple[int, int]: # 10. I am trying 7 now. # UPDATE THE COMMENT ON CHANGES scale_up = 7 + elif runner_type == "limited-tester": + # The limited runners should inflate and deflate faster + scale_down = 1 + scale_up = 2 return scale_down, scale_up diff --git a/tests/ci/lambda_shared_package/lambda_shared/__init__.py b/tests/ci/lambda_shared_package/lambda_shared/__init__.py index 534d7773ddd..c56994cc86a 100644 --- a/tests/ci/lambda_shared_package/lambda_shared/__init__.py +++ b/tests/ci/lambda_shared_package/lambda_shared/__init__.py @@ -15,6 +15,7 @@ RUNNER_TYPE_LABELS = [ "func-tester", "func-tester-aarch64", "fuzzer-unit-tester", + "limited-tester", "stress-tester", "style-checker", "style-checker-aarch64", From b7c5fdab77c41361af7b5130256a2afdf2bc1488 Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Wed, 31 May 2023 23:56:39 +0200 Subject: [PATCH 0297/1072] Move lambda package building to public.ecr.aws/lambda/python for compatibility --- tests/ci/lambda_shared_package/pyproject.toml | 5 +++-- tests/ci/team_keys_lambda/build_and_deploy_archive.sh | 6 +++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/ci/lambda_shared_package/pyproject.toml b/tests/ci/lambda_shared_package/pyproject.toml index bbf74cc0649..dff36b89fbb 100644 --- a/tests/ci/lambda_shared_package/pyproject.toml +++ b/tests/ci/lambda_shared_package/pyproject.toml @@ -6,13 +6,14 @@ build-backend = "setuptools.build_meta" name = "lambda_shared" version = "0.0.1" dependencies = [ - "requests < 2.30", + "requests", + "urllib3 < 2" ] [project.optional-dependencies] token = [ "PyJWT", - "cryptography<38", + "cryptography", ] dev = [ "boto3", diff --git a/tests/ci/team_keys_lambda/build_and_deploy_archive.sh b/tests/ci/team_keys_lambda/build_and_deploy_archive.sh index 89a2d514965..02d5638cf18 100644 --- a/tests/ci/team_keys_lambda/build_and_deploy_archive.sh +++ b/tests/ci/team_keys_lambda/build_and_deploy_archive.sh @@ -12,7 +12,7 @@ DRY_RUN=${DRY_RUN:-} PY_VERSION=${PY_VERSION:-3.10} PY_EXEC="python${PY_VERSION}" # Image to build the lambda zip package -DOCKER_IMAGE="python:${PY_VERSION}-slim" +DOCKER_IMAGE="public.ecr.aws/lambda/python:${PY_VERSION}" # Rename the_lambda_name directory to the-lambda-name lambda in AWS LAMBDA_NAME=${DIR_NAME//_/-} # The name of directory with lambda code @@ -23,9 +23,9 @@ cp app.py "$PACKAGE" if [ -f requirements.txt ]; then VENV=lambda-venv rm -rf "$VENV" lambda-package.zip - docker run --rm --user="${UID}" -e HOME=/tmp \ + docker run --rm --user="${UID}" -e HOME=/tmp --entrypoint=/bin/bash \ --volume="${WORKDIR}/..:/ci" --workdir="/ci/${DIR_NAME}" "${DOCKER_IMAGE}" \ - /bin/bash -exc " + -exc " '$PY_EXEC' -m venv '$VENV' && source '$VENV/bin/activate' && pip install -r requirements.txt From e657e2ba10c425caa550de6e9b3814dca8fc3f32 Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Thu, 1 Jun 2023 17:12:19 +0200 Subject: [PATCH 0298/1072] Additional logging in autoscale_runners_lambda --- tests/ci/autoscale_runners_lambda/app.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tests/ci/autoscale_runners_lambda/app.py b/tests/ci/autoscale_runners_lambda/app.py index 825708cabbc..bebfb594b59 100644 --- a/tests/ci/autoscale_runners_lambda/app.py +++ b/tests/ci/autoscale_runners_lambda/app.py @@ -117,7 +117,17 @@ def set_capacity( # Finally, should the capacity be even changed stop = stop or asg["DesiredCapacity"] == desired_capacity if stop: + logging.info( + "Do not increase ASG %s capacity, current capacity=%s, " + "maximum capacity=%s, running jobs=%s, queue size=%s", + asg["AutoScalingGroupName"], + desired_capacity, + asg["MaxSize"], + running, + queued, + ) return + logging.info( "The ASG %s capacity will be increased to %s, current capacity=%s, " "maximum capacity=%s, running jobs=%s, queue size=%s", @@ -142,6 +152,15 @@ def set_capacity( desired_capacity = min(desired_capacity, asg["MaxSize"]) stop = stop or asg["DesiredCapacity"] == desired_capacity if stop: + logging.info( + "Do not decrease ASG %s capacity, current capacity=%s, " + "minimum capacity=%s, running jobs=%s, queue size=%s", + asg["AutoScalingGroupName"], + desired_capacity, + asg["MinSize"], + running, + queued, + ) return logging.info( From b775b5cfd6eb924bc119ea80b08fde9a4bb42a1b Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Thu, 1 Jun 2023 17:44:57 +0200 Subject: [PATCH 0299/1072] Move all CI runners metrics into one namespace --- tests/ci/ci_runners_metrics_lambda/app.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/ci/ci_runners_metrics_lambda/app.py b/tests/ci/ci_runners_metrics_lambda/app.py index dc128dea739..d8b9489b1a7 100644 --- a/tests/ci/ci_runners_metrics_lambda/app.py +++ b/tests/ci/ci_runners_metrics_lambda/app.py @@ -171,18 +171,21 @@ def group_runners_by_tag( def push_metrics_to_cloudwatch( - listed_runners: RunnerDescriptions, namespace: str + listed_runners: RunnerDescriptions, group_name: str ) -> None: client = boto3.client("cloudwatch") + namespace = "RunnersMetrics" metrics_data = [] busy_runners = sum( 1 for runner in listed_runners if runner.busy and not runner.offline ) + dimensions = [{"Name": "group", "Value": group_name}] metrics_data.append( { "MetricName": "BusyRunners", "Value": busy_runners, "Unit": "Count", + "Dimensions": dimensions, } ) total_active_runners = sum(1 for runner in listed_runners if not runner.offline) @@ -191,6 +194,7 @@ def push_metrics_to_cloudwatch( "MetricName": "ActiveRunners", "Value": total_active_runners, "Unit": "Count", + "Dimensions": dimensions, } ) total_runners = len(listed_runners) @@ -199,6 +203,7 @@ def push_metrics_to_cloudwatch( "MetricName": "TotalRunners", "Value": total_runners, "Unit": "Count", + "Dimensions": dimensions, } ) if total_active_runners == 0: @@ -211,6 +216,7 @@ def push_metrics_to_cloudwatch( "MetricName": "BusyRunnersRatio", "Value": busy_ratio, "Unit": "Percent", + "Dimensions": dimensions, } ) @@ -242,7 +248,7 @@ def main( for group, group_runners in grouped_runners.items(): if push_to_cloudwatch: print(f"Pushing metrics for group '{group}'") - push_metrics_to_cloudwatch(group_runners, "RunnersMetrics/" + group) + push_metrics_to_cloudwatch(group_runners, group) else: print(group, f"({len(group_runners)})") for runner in group_runners: From 3c13eaa1592b5cf46d6aca8b3764aefc546eeff7 Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Thu, 1 Jun 2023 22:03:42 +0200 Subject: [PATCH 0300/1072] Simplify get_dead_runners_in_ec2 a little bit, update runners --- tests/ci/ci_runners_metrics_lambda/app.py | 62 +++++++++++------------ 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/tests/ci/ci_runners_metrics_lambda/app.py b/tests/ci/ci_runners_metrics_lambda/app.py index d8b9489b1a7..9c38659269b 100644 --- a/tests/ci/ci_runners_metrics_lambda/app.py +++ b/tests/ci/ci_runners_metrics_lambda/app.py @@ -99,44 +99,44 @@ def get_dead_runners_in_ec2(runners: RunnerDescriptions) -> RunnerDescriptions: def get_lost_ec2_instances(runners: RunnerDescriptions) -> List[dict]: client = boto3.client("ec2") reservations = client.describe_instances( - Filters=[{"Name": "tag-key", "Values": ["github:runner-type"]}] + Filters=[{"Name": "tag-key", "Values": ["github:runner-type"]}], )["Reservations"] - lost_instances = [] - offline_runners = [ - runner.name for runner in runners if runner.offline and not runner.busy + # flatten the reservation into instances + instances = [ + instance + for reservation in reservations + for instance in reservation["Instances"] ] - # Here we refresh the runners to get the most recent state + lost_instances = [] + offline_runner_names = { + runner.name for runner in runners if runner.offline and not runner.busy + } + runner_names = {runner.name for runner in runners} now = datetime.now().timestamp() - for reservation in reservations: - for instance in reservation["Instances"]: - # Do not consider instances started 20 minutes ago as problematic - if now - instance["LaunchTime"].timestamp() < 1200: - continue + for instance in instances: + # Do not consider instances started 20 minutes ago as problematic + if now - instance["LaunchTime"].timestamp() < 1200: + continue - runner_type = [ - tag["Value"] - for tag in instance["Tags"] - if tag["Key"] == "github:runner-type" - ][0] - # If there's no necessary labels in runner type it's fine - if not ( - UNIVERSAL_LABEL in runner_type or runner_type in RUNNER_TYPE_LABELS - ): - continue + runner_type = [ + tag["Value"] + for tag in instance["Tags"] + if tag["Key"] == "github:runner-type" + ][0] + # If there's no necessary labels in runner type it's fine + if not (UNIVERSAL_LABEL in runner_type or runner_type in RUNNER_TYPE_LABELS): + continue - if instance["InstanceId"] in offline_runners: - lost_instances.append(instance) - continue + if instance["InstanceId"] in offline_runner_names: + lost_instances.append(instance) + continue - if instance["State"]["Name"] == "running" and ( - not [ - runner - for runner in runners - if runner.name == instance["InstanceId"] - ] - ): - lost_instances.append(instance) + if ( + instance["State"]["Name"] == "running" + and not instance["InstanceId"] in runner_names + ): + lost_instances.append(instance) return lost_instances From 9bd0a53e7c3ce6b1ad86bc09b134319a8563ff47 Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Thu, 1 Jun 2023 22:28:02 +0200 Subject: [PATCH 0301/1072] Get only online instances in get_lost_ec2_instances --- tests/ci/ci_runners_metrics_lambda/app.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/ci/ci_runners_metrics_lambda/app.py b/tests/ci/ci_runners_metrics_lambda/app.py index 9c38659269b..71a644fe072 100644 --- a/tests/ci/ci_runners_metrics_lambda/app.py +++ b/tests/ci/ci_runners_metrics_lambda/app.py @@ -99,7 +99,10 @@ def get_dead_runners_in_ec2(runners: RunnerDescriptions) -> RunnerDescriptions: def get_lost_ec2_instances(runners: RunnerDescriptions) -> List[dict]: client = boto3.client("ec2") reservations = client.describe_instances( - Filters=[{"Name": "tag-key", "Values": ["github:runner-type"]}], + Filters=[ + {"Name": "tag-key", "Values": ["github:runner-type"]}, + {"Name": "instance-state-name", "Values": ["pending", "running"]}, + ], )["Reservations"] # flatten the reservation into instances instances = [ From 8cadd89ebedc2fee73c3081992d35bbf8ad3280c Mon Sep 17 00:00:00 2001 From: Alexander Gololobov <440544+davenger@users.noreply.github.com> Date: Fri, 2 Jun 2023 12:34:22 +0200 Subject: [PATCH 0302/1072] Update src/Common/TaskStatsInfoGetter.cpp --- src/Common/TaskStatsInfoGetter.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Common/TaskStatsInfoGetter.cpp b/src/Common/TaskStatsInfoGetter.cpp index d21229609dd..867a50c8cce 100644 --- a/src/Common/TaskStatsInfoGetter.cpp +++ b/src/Common/TaskStatsInfoGetter.cpp @@ -208,7 +208,7 @@ bool checkPermissionsImpl() try { ::taskstats stats{}; - TaskStatsInfoGetter().getStat(stats, static_cast(getThreadId())); + TaskStatsInfoGetter().getStat(stats, static_cast(getThreadId())); } catch (const Exception & e) { From 79c14c89ee39bea6017e41e375f9b6a59e3e7899 Mon Sep 17 00:00:00 2001 From: kssenii Date: Fri, 2 Jun 2023 12:53:25 +0200 Subject: [PATCH 0303/1072] Add some assertions --- src/Interpreters/Cache/FileCache.cpp | 6 +++--- src/Interpreters/Cache/FileSegment.cpp | 18 +++++++++++++----- src/Interpreters/Cache/Metadata.cpp | 21 ++++++++++++--------- src/Interpreters/Cache/Metadata.h | 4 ++-- 4 files changed, 30 insertions(+), 19 deletions(-) diff --git a/src/Interpreters/Cache/FileCache.cpp b/src/Interpreters/Cache/FileCache.cpp index 1908a4ce895..163e1b71ab9 100644 --- a/src/Interpreters/Cache/FileCache.cpp +++ b/src/Interpreters/Cache/FileCache.cpp @@ -48,12 +48,12 @@ const String & FileCache::getBasePath() const String FileCache::getPathInLocalCache(const Key & key, size_t offset, FileSegmentKind segment_kind) const { - return metadata.getPathInLocalCache(key, offset, segment_kind); + return metadata.getPathForFileSegment(key, offset, segment_kind); } String FileCache::getPathInLocalCache(const Key & key) const { - return metadata.getPathInLocalCache(key); + return metadata.getPathForKey(key); } void FileCache::assertInitialized() const @@ -1019,7 +1019,7 @@ std::vector FileCache::tryGetCachePaths(const Key & key) for (const auto & [offset, file_segment_metadata] : *locked_key->getKeyMetadata()) { if (file_segment_metadata->file_segment->state() == FileSegment::State::DOWNLOADED) - cache_paths.push_back(metadata.getPathInLocalCache(key, offset, file_segment_metadata->file_segment->getKind())); + cache_paths.push_back(metadata.getPathForFileSegment(key, offset, file_segment_metadata->file_segment->getKind())); } return cache_paths; } diff --git a/src/Interpreters/Cache/FileSegment.cpp b/src/Interpreters/Cache/FileSegment.cpp index 60228573666..fb0ba0eba14 100644 --- a/src/Interpreters/Cache/FileSegment.cpp +++ b/src/Interpreters/Cache/FileSegment.cpp @@ -313,6 +313,13 @@ void FileSegment::write(const char * from, size_t size, size_t offset) if (!size) throw Exception(ErrorCodes::LOGICAL_ERROR, "Writing zero size is not allowed"); + auto file_segment_path = getPathInLocalCache(); + if (offset == range().left && fs::exists(file_segment_path)) + { + fs::remove(file_segment_path); + chassert(false); + } + { auto lock = segment_guard.lock(); @@ -365,7 +372,7 @@ void FileSegment::write(const char * from, size_t size, size_t offset) downloaded_size += size; - chassert(std::filesystem::file_size(getPathInLocalCache()) == downloaded_size); + chassert(std::filesystem::file_size(file_segment_path) == downloaded_size); } catch (ErrnoException & e) { @@ -375,7 +382,7 @@ void FileSegment::write(const char * from, size_t size, size_t offset) int code = e.getErrno(); if (code == /* No space left on device */28 || code == /* Quota exceeded */122) { - const auto file_size = fs::file_size(getPathInLocalCache()); + const auto file_size = fs::file_size(file_segment_path); chassert(downloaded_size <= file_size); chassert(reserved_size >= file_size); if (downloaded_size != file_size) @@ -520,8 +527,8 @@ void FileSegment::setDownloadedUnlocked(const FileSegmentGuard::Lock &) remote_file_reader.reset(); } - chassert(getDownloadedSize(false) > 0); - chassert(fs::file_size(getPathInLocalCache()) > 0); + chassert(downloaded_size > 0); + chassert(fs::file_size(getPathInLocalCache()) == downloaded_size); } void FileSegment::setDownloadFailedUnlocked(const FileSegmentGuard::Lock & lock) @@ -845,7 +852,8 @@ void FileSegment::detach(const FileSegmentGuard::Lock & lock, const LockedKey &) if (download_state == State::DETACHED) return; - resetDownloaderUnlocked(lock); + if (!downloader_id.empty()) + resetDownloaderUnlocked(lock); setDetachedState(lock); } diff --git a/src/Interpreters/Cache/Metadata.cpp b/src/Interpreters/Cache/Metadata.cpp index 843ffd45b63..9dff77e2af8 100644 --- a/src/Interpreters/Cache/Metadata.cpp +++ b/src/Interpreters/Cache/Metadata.cpp @@ -145,15 +145,12 @@ String CacheMetadata::getFileNameForFileSegment(size_t offset, FileSegmentKind s return std::to_string(offset) + file_suffix; } -String CacheMetadata::getPathInLocalCache(const Key & key, size_t offset, FileSegmentKind segment_kind) const +String CacheMetadata::getPathForFileSegment(const Key & key, size_t offset, FileSegmentKind segment_kind) const { - String file_suffix; - - const auto key_str = key.toString(); - return fs::path(path) / key_str.substr(0, 3) / key_str / getFileNameForFileSegment(offset, segment_kind); + return fs::path(getPathForKey(key)) / getFileNameForFileSegment(offset, segment_kind); } -String CacheMetadata::getPathInLocalCache(const Key & key) const +String CacheMetadata::getPathForKey(const Key & key) const { const auto key_str = key.toString(); return fs::path(path) / key_str.substr(0, 3) / key_str; @@ -178,7 +175,7 @@ LockedKeyPtr CacheMetadata::lockKeyMetadata( it = emplace( key, std::make_shared( - key, getPathInLocalCache(key), *cleanup_queue, is_initial_load)).first; + key, getPathForKey(key), *cleanup_queue, is_initial_load)).first; } key_metadata = it->second; @@ -260,7 +257,7 @@ void CacheMetadata::doCleanup() erase(it); LOG_DEBUG(log, "Key {} is removed from metadata", cleanup_key); - const fs::path key_directory = getPathInLocalCache(cleanup_key); + const fs::path key_directory = getPathForKey(cleanup_key); const fs::path key_prefix_directory = key_directory.parent_path(); try @@ -370,8 +367,14 @@ KeyMetadata::iterator LockedKey::removeFileSegment(size_t offset, const FileSegm file_segment->queue_iterator->annul(); const auto path = key_metadata->getFileSegmentPath(*file_segment); - if (fs::exists(path)) + bool exists = fs::exists(path); + if (exists) + { + LOG_TEST(log, "Removed file segment at path: {}", path); fs::remove(path); + } + else if (file_segment->downloaded_size) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected path {} to exist"); file_segment->detach(segment_lock, *this); return key_metadata->erase(it); diff --git a/src/Interpreters/Cache/Metadata.h b/src/Interpreters/Cache/Metadata.h index 2e015b07ed0..a7e101c3d9d 100644 --- a/src/Interpreters/Cache/Metadata.h +++ b/src/Interpreters/Cache/Metadata.h @@ -85,12 +85,12 @@ public: const String & getBaseDirectory() const { return path; } - String getPathInLocalCache( + String getPathForFileSegment( const Key & key, size_t offset, FileSegmentKind segment_kind) const; - String getPathInLocalCache(const Key & key) const; + String getPathForKey(const Key & key) const; static String getFileNameForFileSegment(size_t offset, FileSegmentKind segment_kind); void iterate(IterateCacheMetadataFunc && func); From 572f15b2cd70fa7b4293c7ec9682e361c9989d77 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Fri, 2 Jun 2023 11:14:52 +0000 Subject: [PATCH 0304/1072] Fix typo --- docs/en/sql-reference/functions/functions-for-nulls.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/functions/functions-for-nulls.md b/docs/en/sql-reference/functions/functions-for-nulls.md index f4ca27e9b16..6f82fedaab7 100644 --- a/docs/en/sql-reference/functions/functions-for-nulls.md +++ b/docs/en/sql-reference/functions/functions-for-nulls.md @@ -245,7 +245,7 @@ assumeNotNull(x) **Returned values** - The input value as non-`Nullable` type, if it is not `NULL`. -- An arbirary value, if the input value is `NULL`. +- An arbitrary value, if the input value is `NULL`. **Example** From fa5f890a7ad43fde1cd75d6c07170a6df3ec119d Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Fri, 2 Jun 2023 12:03:50 +0000 Subject: [PATCH 0305/1072] Added ru function descriptions (docs) --- .../functions/type-conversion-functions.md | 2 +- .../functions/type-conversion-functions.md | 86 +++++++++++++++++++ src/Functions/DateTimeTransforms.h | 7 +- 3 files changed, 92 insertions(+), 3 deletions(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index a6fc6cd4dfc..eb210863c32 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -383,7 +383,7 @@ toDateTimeOrDefault(expr, [, time_zone [, default_value]]) **Arguments** - `expr` — The value. [String](/docs/en/sql-reference/data-types/string.md), [Int](/docs/en/sql-reference/data-types/int-uint.md), [Date](/docs/en/sql-reference/data-types/date.md) or [DateTime](/docs/en/sql-reference/data-types/datetime.md). -- `time_zone` — Time zone. +- `time_zone` — Time zone. [String](/docs/en/sql-reference/data-types/string.md). - `default_value` — The default value. [DateTime](/docs/en/sql-reference/data-types/datetime.md) If `expr` is a number, it is interpreted as the number of seconds since the beginning of the Unix Epoch (as Unix timestamp). diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index 298b7bbc93e..67d1732d34e 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -173,6 +173,49 @@ Cиноним: `DATE`. ## toDateOrDefault {#todateordefault} +Конвертирует аргумент в значение [Date](/docs/ru/sql-reference/data-types/date.md) data type. +Если получен недопустимый аргумент, то возвращает значение по умолчанию (нижняя граница [Date](/docs/ru/sql-reference/data-types/date.md). Значение по умолчанию может быть указано вторым аргументом. +Похожа на [toDate](#todate). + +**Синтаксис** + +``` sql +toDateOrDefault(expr [, default_value]) +``` + +**Аргументы** + +- `expr` — Значение для преобразования. [String](/docs/ru/sql-reference/data-types/string.md), [Int](/docs/ru/sql-reference/data-types/int-uint.md), [Date](/docs/ru/sql-reference/data-types/date.md) или [DateTime](/docs/ru/sql-reference/data-types/datetime.md). +- `default_value` — Значение по умолчанию. [Date](/docs/ru/sql-reference/data-types/date.md) + +Если `expr` является числом выглядит как UNIX timestamp (больше чем 65535), оно интерпретируется как DateTime, затем обрезается до Date учитывавая текущую часовой пояс. Если `expr` является числом и меньше чем 65536, оно интерпретируется как количество дней с 1970-01-01. + +**Возвращаемое значение** + +- Календарная дата. [Date](/docs/ru/sql-reference/data-types/date.md). + +**Пример** + +Запрос: + +``` sql +SELECT + toDateOrDefault('2021-01-01', '2023-01-01'::Date), + toDateOrDefault('xx2021-01-01', '2023-01-01'::Date); +``` + +Результат: + +```response +┌─toDateOrDefault('2021-01-01', CAST('2023-01-01', 'Date'))─┬─toDateOrDefault('xx2021-01-01', CAST('2023-01-01', 'Date'))─┐ +│ 2021-01-01 │ 2023-01-01 │ +└───────────────────────────────────────────────────────────┴─────────────────────────────────────────────────────────────┘ +``` + +**Смотрите также** +- [toDate](#todate) +- [toDate32OrDefault](#todate32ordefault) + ## toDateTime {#todatetime} ## toDateTimeOrZero {#todatetimeorzero} @@ -181,6 +224,49 @@ Cиноним: `DATE`. ## toDateTimeOrDefault {#todatetimeordefault} +Конвертирует аргумент в значение [DateTime](/docs/ru/sql-reference/data-types/datetime.md). +Если получен недопустимый аргумент, то возвращает значение по умолчанию (нижняя граница [DateTime](/docs/ru/sql-reference/data-types/datetime.md)). Значение по умолчанию может быть указано третьим аргументом. +Похожа на [toDateTime](#todatetime). + +**Синтаксис** + +``` sql +toDateTimeOrDefault(expr, [, time_zone [, default_value]]) +``` + +**Аргументы** + +- `expr` — Значение для преобразования. [String](/docs/ru/sql-reference/data-types/string.md), [Int](/docs/ru/sql-reference/data-types/int-uint.md), [Date](/docs/ru/sql-reference/data-types/date.md) или [DateTime](/docs/ru/sql-reference/data-types/datetime.md). +- `time_zone` — Часовой пояс. [String](/docs/ru/sql-reference/data-types/string.md). +- `default_value` — Значение по умолчанию. [DateTime](/docs/ru/sql-reference/data-types/datetime.md) + +Если `expr` является числом, оно интерпретируется как количество секунд от начала unix эпохи. + +**Возвращаемое значение** + +- Время. [DateTime](/docs/ru/sql-reference/data-types/datetime.md) + +**Пример** + +Запрос: + +``` sql +SELECT + toDateTimeOrDefault('2021-01-01', 'UTC', '2023-01-01'::DateTime('UTC')), + toDateTimeOrDefault('xx2021-01-01', 'UTC', '2023-01-01'::DateTime('UTC')); +``` + +Результат: + +```response +┌─toDateTimeOrDefault('2021-01-01', 'UTC', CAST('2023-01-01', 'DateTime(\'UTC\')'))─┬─toDateTimeOrDefault('xx2021-01-01', 'UTC', CAST('2023-01-01', 'DateTime(\'UTC\')'))─┐ +│ 2021-01-01 00:00:00 │ 2023-01-01 00:00:00 │ +└───────────────────────────────────────────────────────────────────────────────────┴─────────────────────────────────────────────────────────────────────────────────────┘ +``` + +**Смотрите также** +- [toDateTime](#todatetime) + ## toDate32 {#todate32} Конвертирует аргумент в значение типа [Date32](../../sql-reference/data-types/date32.md). Если значение выходит за границы диапазона, возвращается пограничное значение `Date32`. Если аргумент имеет тип [Date](../../sql-reference/data-types/date.md), учитываются границы типа `Date`. diff --git a/src/Functions/DateTimeTransforms.h b/src/Functions/DateTimeTransforms.h index 9f8f4df2465..d154dd9ffa2 100644 --- a/src/Functions/DateTimeTransforms.h +++ b/src/Functions/DateTimeTransforms.h @@ -1520,9 +1520,12 @@ struct DateTimeTransformImpl Op::vector(sources->getData(), col_to->getData(), time_zone, transform, vec_null_map_to); } - if (vec_null_map_to) + if constexpr (std::is_same_v) { - return ColumnNullable::create(std::move(mutable_result_col), std::move(col_null_map_to)); + if (vec_null_map_to) + { + return ColumnNullable::create(std::move(mutable_result_col), std::move(col_null_map_to)); + } } return mutable_result_col; From 73db383727550e040a939d367e6e59bb037780bd Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Fri, 2 Jun 2023 14:10:26 +0200 Subject: [PATCH 0306/1072] Limit the number of in-flight tasks for loading outdated parts (#50450) * Done * Update programs/local/LocalServer.cpp Co-authored-by: Alexander Tokmakov * Bump --------- Co-authored-by: Alexander Tokmakov --- programs/local/LocalServer.cpp | 2 +- programs/server/Server.cpp | 2 +- src/Core/ServerSettings.h | 1 - src/Interpreters/threadPoolCallbackRunner.h | 4 ++-- 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/programs/local/LocalServer.cpp b/programs/local/LocalServer.cpp index 944a60d4e4c..96c1ca261b5 100644 --- a/programs/local/LocalServer.cpp +++ b/programs/local/LocalServer.cpp @@ -138,7 +138,7 @@ void LocalServer::initialize(Poco::Util::Application & self) OutdatedPartsLoadingThreadPool::initialize( config().getUInt("max_outdated_parts_loading_thread_pool_size", 16), 0, // We don't need any threads one all the parts will be loaded - config().getUInt("outdated_part_loading_thread_pool_queue_size", 10000)); + config().getUInt("max_outdated_parts_loading_thread_pool_size", 16)); } diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index 03ada89e86f..9eb3e6c9ebc 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -696,7 +696,7 @@ try OutdatedPartsLoadingThreadPool::initialize( server_settings.max_outdated_parts_loading_thread_pool_size, 0, // We don't need any threads one all the parts will be loaded - server_settings.outdated_part_loading_thread_pool_queue_size); + server_settings.max_outdated_parts_loading_thread_pool_size); /// Initialize global local cache for remote filesystem. if (config().has("local_cache_for_remote_fs")) diff --git a/src/Core/ServerSettings.h b/src/Core/ServerSettings.h index 511b8d68f6d..cb43d62ecd1 100644 --- a/src/Core/ServerSettings.h +++ b/src/Core/ServerSettings.h @@ -22,7 +22,6 @@ namespace DB M(UInt64, max_io_thread_pool_free_size, 0, "Max free size for IO thread pool.", 0) \ M(UInt64, io_thread_pool_queue_size, 10000, "Queue size for IO thread pool.", 0) \ M(UInt64, max_outdated_parts_loading_thread_pool_size, 32, "The maximum number of threads that would be used for loading outdated data parts on startup", 0) \ - M(UInt64, outdated_part_loading_thread_pool_queue_size, 10000, "Queue size for parts loading thread pool.", 0) \ M(UInt64, max_replicated_fetches_network_bandwidth_for_server, 0, "The maximum speed of data exchange over the network in bytes per second for replicated fetches. Zero means unlimited.", 0) \ M(UInt64, max_replicated_sends_network_bandwidth_for_server, 0, "The maximum speed of data exchange over the network in bytes per second for replicated sends. Zero means unlimited.", 0) \ M(UInt64, max_remote_read_network_bandwidth_for_server, 0, "The maximum speed of data exchange over the network in bytes per second for read. Zero means unlimited.", 0) \ diff --git a/src/Interpreters/threadPoolCallbackRunner.h b/src/Interpreters/threadPoolCallbackRunner.h index 55c6a848b77..f7324bfafe6 100644 --- a/src/Interpreters/threadPoolCallbackRunner.h +++ b/src/Interpreters/threadPoolCallbackRunner.h @@ -13,7 +13,7 @@ namespace DB template > using ThreadPoolCallbackRunner = std::function(Callback &&, Priority)>; -/// Creates CallbackRunner that runs every callback with 'pool->scheduleOrThrow()'. +/// Creates CallbackRunner that runs every callback with 'pool->scheduleOrThrowOnError()'. template > ThreadPoolCallbackRunner threadPoolCallbackRunner(ThreadPool & pool, const std::string & thread_name) { @@ -44,7 +44,7 @@ ThreadPoolCallbackRunner threadPoolCallbackRunner(ThreadPool & auto future = task->get_future(); - my_pool->scheduleOrThrow([my_task = std::move(task)]{ (*my_task)(); }, priority); + my_pool->scheduleOrThrowOnError([my_task = std::move(task)]{ (*my_task)(); }, priority); return future; }; From c1958c8bed68529c04c37a4b81d139088da3f2f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Fri, 2 Jun 2023 14:24:32 +0200 Subject: [PATCH 0307/1072] Remove 02764_parallel_replicas_plain_merge_tree from list of broken tests --- tests/broken_tests.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/broken_tests.txt b/tests/broken_tests.txt index 02935712325..96219323700 100644 --- a/tests/broken_tests.txt +++ b/tests/broken_tests.txt @@ -135,6 +135,5 @@ 02703_row_policy_for_database 02721_url_cluster 02534_s3_cluster_insert_select_schema_inference -02764_parallel_replicas_plain_merge_tree 02765_parallel_replicas_final_modifier From 65cc92a78d89f088c6c160dd4cb1748f48ed726d Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Fri, 2 Jun 2023 11:30:05 +0000 Subject: [PATCH 0308/1072] CI: Fix aspell on nested docs --- .../database-engines/materialized-mysql.md | 2 +- .../materialized-postgresql.md | 4 +- docs/en/engines/database-engines/sqlite.md | 2 +- .../table-engines/integrations/hdfs.md | 4 +- .../table-engines/integrations/hive.md | 2 +- .../table-engines/integrations/nats.md | 4 +- .../table-engines/integrations/postgresql.md | 2 +- .../engines/table-engines/integrations/s3.md | 6 +- .../mergetree-family/annindexes.md | 4 +- .../custom-partitioning-key.md | 2 +- .../mergetree-family/mergetree.md | 4 +- .../table-engines/special/executable.md | 4 +- .../table-engines/special/keepermap.md | 2 +- .../example-datasets/amazon-reviews.md | 4 +- .../example-datasets/cell-towers.md | 4 +- .../example-datasets/github.md | 4 +- .../example-datasets/opensky.md | 2 +- .../example-datasets/reddit-comments.md | 4 +- .../example-datasets/youtube-dislikes.md | 6 +- docs/en/operations/settings/settings.md | 2 +- docs/en/operations/system-tables/columns.md | 2 +- .../system-tables/dropped_tables.md | 2 +- .../system-tables/information_schema.md | 2 +- docs/en/operations/system-tables/licenses.md | 2 +- docs/en/operations/system-tables/parts.md | 2 +- docs/en/operations/system-tables/tables.md | 2 +- docs/en/operations/system-tables/trace_log.md | 2 +- .../operations/system-tables/zookeeper_log.md | 4 +- .../utilities/clickhouse-obfuscator.md | 2 +- .../aggregate-functions/reference/cramersv.md | 4 +- .../reference/cramersvbiascorrected.md | 2 +- .../reference/groupbitand.md | 2 +- .../reference/groupbitor.md | 2 +- .../reference/groupbitxor.md | 2 +- .../reference/kolmogorovsmirnovtest.md | 4 +- .../reference/quantiletdigestweighted.md | 2 +- .../reference/stochasticlinearregression.md | 2 +- docs/en/sql-reference/data-types/datetime.md | 8 +- docs/en/sql-reference/data-types/index.md | 2 +- docs/en/sql-reference/dictionaries/index.md | 2 +- .../sql-reference/functions/bit-functions.md | 4 +- .../functions/encryption-functions.md | 8 +- docs/en/sql-reference/functions/geo/h3.md | 2 +- docs/en/sql-reference/functions/geo/s2.md | 2 +- .../sql-reference/functions/hash-functions.md | 2 +- .../functions/logical-functions.md | 2 +- .../sql-reference/functions/math-functions.md | 2 +- .../functions/other-functions.md | 4 +- .../functions/string-functions.md | 2 +- .../functions/string-replace-functions.md | 2 +- docs/en/sql-reference/functions/udf.md | 2 +- docs/en/sql-reference/operators/in.md | 2 +- .../sql-reference/statements/alter/quota.md | 2 +- .../sql-reference/statements/create/quota.md | 2 +- docs/en/sql-reference/statements/explain.md | 4 +- .../sql-reference/statements/select/from.md | 2 +- .../statements/select/order-by.md | 2 +- .../sql-reference/window-functions/index.md | 4 +- .../aspell-ignore/en/aspell-dict.txt | 2429 +++++++++++++++-- utils/check-style/check-doc-aspell | 3 + 60 files changed, 2300 insertions(+), 302 deletions(-) diff --git a/docs/en/engines/database-engines/materialized-mysql.md b/docs/en/engines/database-engines/materialized-mysql.md index 1f1c996d4bf..f7cc52e622e 100644 --- a/docs/en/engines/database-engines/materialized-mysql.md +++ b/docs/en/engines/database-engines/materialized-mysql.md @@ -119,7 +119,7 @@ When working with the `MaterializedMySQL` database engine, [ReplacingMergeTree]( The data of TIME type in MySQL is converted to microseconds in ClickHouse. -Other types are not supported. If MySQL table contains a column of such type, ClickHouse throws exception "Unhandled data type" and stops replication. +Other types are not supported. If MySQL table contains a column of such type, ClickHouse throws an exception and stops replication. ## Specifics and Recommendations {#specifics-and-recommendations} diff --git a/docs/en/engines/database-engines/materialized-postgresql.md b/docs/en/engines/database-engines/materialized-postgresql.md index 08e9f998626..33d75dc9582 100644 --- a/docs/en/engines/database-engines/materialized-postgresql.md +++ b/docs/en/engines/database-engines/materialized-postgresql.md @@ -55,7 +55,7 @@ ATTACH TABLE postgres_database.new_table; ``` :::warning -Before version 22.1, adding a table to replication left an unremoved temporary replication slot (named `{db_name}_ch_replication_slot_tmp`). If attaching tables in ClickHouse version before 22.1, make sure to delete it manually (`SELECT pg_drop_replication_slot('{db_name}_ch_replication_slot_tmp')`). Otherwise disk usage will grow. This issue is fixed in 22.1. +Before version 22.1, adding a table to replication left a non-removed temporary replication slot (named `{db_name}_ch_replication_slot_tmp`). If attaching tables in ClickHouse version before 22.1, make sure to delete it manually (`SELECT pg_drop_replication_slot('{db_name}_ch_replication_slot_tmp')`). Otherwise disk usage will grow. This issue is fixed in 22.1. ::: ## Dynamically removing tables from replication {#dynamically-removing-table-from-replication} @@ -257,7 +257,7 @@ Please note that this should be used only if it is actually needed. If there is 1. [CREATE PUBLICATION](https://postgrespro.ru/docs/postgresql/14/sql-createpublication) -- create query privilege. -2. [CREATE_REPLICATION_SLOT](https://postgrespro.ru/docs/postgrespro/10/protocol-replication#PROTOCOL-REPLICATION-CREATE-SLOT) -- replication privelege. +2. [CREATE_REPLICATION_SLOT](https://postgrespro.ru/docs/postgrespro/10/protocol-replication#PROTOCOL-REPLICATION-CREATE-SLOT) -- replication privilege. 3. [pg_drop_replication_slot](https://postgrespro.ru/docs/postgrespro/9.5/functions-admin#functions-replication) -- replication privilege or superuser. diff --git a/docs/en/engines/database-engines/sqlite.md b/docs/en/engines/database-engines/sqlite.md index fc2a6525a68..0fa3c0fff58 100644 --- a/docs/en/engines/database-engines/sqlite.md +++ b/docs/en/engines/database-engines/sqlite.md @@ -30,7 +30,7 @@ Allows to connect to [SQLite](https://www.sqlite.org/index.html) database and pe ## Specifics and Recommendations {#specifics-and-recommendations} -SQLite stores the entire database (definitions, tables, indices, and the data itself) as a single cross-platform file on a host machine. During writing SQLite locks the entire database file, therefore write operations are performed sequentially. Read operations can be multitasked. +SQLite stores the entire database (definitions, tables, indices, and the data itself) as a single cross-platform file on a host machine. During writing SQLite locks the entire database file, therefore write operations are performed sequentially. Read operations can be multi-tasked. SQLite does not require service management (such as startup scripts) or access control based on `GRANT` and passwords. Access control is handled by means of file-system permissions given to the database file itself. ## Usage Example {#usage-example} diff --git a/docs/en/engines/table-engines/integrations/hdfs.md b/docs/en/engines/table-engines/integrations/hdfs.md index b9db0fae68f..08cd88826e5 100644 --- a/docs/en/engines/table-engines/integrations/hdfs.md +++ b/docs/en/engines/table-engines/integrations/hdfs.md @@ -156,7 +156,7 @@ Similar to GraphiteMergeTree, the HDFS engine supports extended configuration us | rpc\_client\_connect\_timeout | 600 * 1000 | | rpc\_client\_read\_timeout | 3600 * 1000 | | rpc\_client\_write\_timeout | 3600 * 1000 | -| rpc\_client\_socekt\_linger\_timeout | -1 | +| rpc\_client\_socket\_linger\_timeout | -1 | | rpc\_client\_connect\_retry | 10 | | rpc\_client\_timeout | 3600 * 1000 | | dfs\_default\_replica | 3 | @@ -176,7 +176,7 @@ Similar to GraphiteMergeTree, the HDFS engine supports extended configuration us | output\_write\_timeout | 3600 * 1000 | | output\_close\_timeout | 3600 * 1000 | | output\_packetpool\_size | 1024 | -| output\_heeartbeat\_interval | 10 * 1000 | +| output\_heartbeat\_interval | 10 * 1000 | | dfs\_client\_failover\_max\_attempts | 15 | | dfs\_client\_read\_shortcircuit\_streams\_cache\_size | 256 | | dfs\_client\_socketcache\_expiryMsec | 3000 | diff --git a/docs/en/engines/table-engines/integrations/hive.md b/docs/en/engines/table-engines/integrations/hive.md index adcb73605bb..5d10e417ae3 100644 --- a/docs/en/engines/table-engines/integrations/hive.md +++ b/docs/en/engines/table-engines/integrations/hive.md @@ -6,7 +6,7 @@ sidebar_label: Hive # Hive -The Hive engine allows you to perform `SELECT` quries on HDFS Hive table. Currently it supports input formats as below: +The Hive engine allows you to perform `SELECT` queries on HDFS Hive table. Currently it supports input formats as below: - Text: only supports simple scalar column types except `binary` diff --git a/docs/en/engines/table-engines/integrations/nats.md b/docs/en/engines/table-engines/integrations/nats.md index 7f09c516d6f..a82d74e0d95 100644 --- a/docs/en/engines/table-engines/integrations/nats.md +++ b/docs/en/engines/table-engines/integrations/nats.md @@ -10,7 +10,7 @@ This engine allows integrating ClickHouse with [NATS](https://nats.io/). `NATS` lets you: -- Publish or subcribe to message subjects. +- Publish or subscribe to message subjects. - Process new messages as they become available. ## Creating a Table {#table_engine-redisstreams-creating-a-table} @@ -46,7 +46,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] Required parameters: - `nats_url` – host:port (for example, `localhost:5672`).. -- `nats_subjects` – List of subject for NATS table to subscribe/publsh to. Supports wildcard subjects like `foo.*.bar` or `baz.>` +- `nats_subjects` – List of subject for NATS table to subscribe/publish to. Supports wildcard subjects like `foo.*.bar` or `baz.>` - `nats_format` – Message format. Uses the same notation as the SQL `FORMAT` function, such as `JSONEachRow`. For more information, see the [Formats](../../../interfaces/formats.md) section. Optional parameters: diff --git a/docs/en/engines/table-engines/integrations/postgresql.md b/docs/en/engines/table-engines/integrations/postgresql.md index f27d4d48f75..51b3048706f 100644 --- a/docs/en/engines/table-engines/integrations/postgresql.md +++ b/docs/en/engines/table-engines/integrations/postgresql.md @@ -57,7 +57,7 @@ or via config (since version 21.11): ``` -Some parameters can be overriden by key value arguments: +Some parameters can be overridden by key value arguments: ``` sql SELECT * FROM postgresql(postgres1, schema='schema1', table='table1'); ``` diff --git a/docs/en/engines/table-engines/integrations/s3.md b/docs/en/engines/table-engines/integrations/s3.md index 595bc0c344f..f2eaacd92a5 100644 --- a/docs/en/engines/table-engines/integrations/s3.md +++ b/docs/en/engines/table-engines/integrations/s3.md @@ -23,7 +23,7 @@ CREATE TABLE s3_engine_table (name String, value UInt32) - `NOSIGN` - If this keyword is provided in place of credentials, all the requests will not be signed. - `format` — The [format](../../../interfaces/formats.md#formats) of the file. - `aws_access_key_id`, `aws_secret_access_key` - Long-term credentials for the [AWS](https://aws.amazon.com/) account user. You can use these to authenticate your requests. Parameter is optional. If credentials are not specified, they are used from the configuration file. For more information see [Using S3 for Data Storage](../mergetree-family/mergetree.md#table_engine-mergetree-s3). -- `compression` — Compression type. Supported values: `none`, `gzip/gz`, `brotli/br`, `xz/LZMA`, `zstd/zst`. Parameter is optional. By default, it will autodetect compression by file extension. +- `compression` — Compression type. Supported values: `none`, `gzip/gz`, `brotli/br`, `xz/LZMA`, `zstd/zst`. Parameter is optional. By default, it will auto-detect compression by file extension. ### PARTITION BY @@ -140,8 +140,8 @@ The following settings can be set before query execution or placed into configur - `s3_max_get_rps` — Maximum GET requests per second rate before throttling. Default value is `0` (unlimited). - `s3_max_get_burst` — Max number of requests that can be issued simultaneously before hitting request per second limit. By default (`0` value) equals to `s3_max_get_rps`. - `s3_upload_part_size_multiply_factor` - Multiply `s3_min_upload_part_size` by this factor each time `s3_multiply_parts_count_threshold` parts were uploaded from a single write to S3. Default values is `2`. -- `s3_upload_part_size_multiply_parts_count_threshold` - Each time this number of parts was uploaded to S3 `s3_min_upload_part_size multiplied` by `s3_upload_part_size_multiply_factor`. DEfault value us `500`. -- `s3_max_inflight_parts_for_one_file` - Limits the number of put requests that can be run concurenly for one object. Its number should be limited. The value `0` means unlimited. Default value is `20`. Each inflight part has a buffer with size `s3_min_upload_part_size` for the first `s3_upload_part_size_multiply_factor` parts and more when file is big enought, see `upload_part_size_multiply_factor`. With default settings one uploaded file consumes not more than `320Mb` for a file which is less than `8G`. The consumption is greater for a larger file. +- `s3_upload_part_size_multiply_parts_count_threshold` - Each time this number of parts was uploaded to S3 `s3_min_upload_part_size multiplied` by `s3_upload_part_size_multiply_factor`. Default value us `500`. +- `s3_max_inflight_parts_for_one_file` - Limits the number of put requests that can be run concurrently for one object. Its number should be limited. The value `0` means unlimited. Default value is `20`. Each in-flight part has a buffer with size `s3_min_upload_part_size` for the first `s3_upload_part_size_multiply_factor` parts and more when file is big enough, see `upload_part_size_multiply_factor`. With default settings one uploaded file consumes not more than `320Mb` for a file which is less than `8G`. The consumption is greater for a larger file. Security consideration: if malicious user can specify arbitrary S3 URLs, `s3_max_redirects` must be set to zero to avoid [SSRF](https://en.wikipedia.org/wiki/Server-side_request_forgery) attacks; or alternatively, `remote_host_filter` must be specified in server configuration. diff --git a/docs/en/engines/table-engines/mergetree-family/annindexes.md b/docs/en/engines/table-engines/mergetree-family/annindexes.md index f841f157376..03617a1a709 100644 --- a/docs/en/engines/table-engines/mergetree-family/annindexes.md +++ b/docs/en/engines/table-engines/mergetree-family/annindexes.md @@ -78,7 +78,7 @@ ENGINE = MergeTree ORDER BY id; ``` -With greater `GRANULARITY` indexes remember the data structure better. The `GRANULARITY` indicates how many granules will be used to construct the index. The more data is provided for the index, the more of it can be handled by one index and the more chances that with the right hyperparameters the index will remember the data structure better. But some indexes can't be built if they don't have enough data, so this granule will always participate in the query. For more information, see the description of indexes. +With greater `GRANULARITY` indexes remember the data structure better. The `GRANULARITY` indicates how many granules will be used to construct the index. The more data is provided for the index, the more of it can be handled by one index and the more chances that with the right hyper parameters the index will remember the data structure better. But some indexes can't be built if they don't have enough data, so this granule will always participate in the query. For more information, see the description of indexes. As the indexes are built only during insertions into table, `INSERT` and `OPTIMIZE` queries are slower than for ordinary table. At this stage indexes remember all the information about the given data. ANNIndexes should be used if you have immutable or rarely changed data and many read requests. @@ -135,7 +135,7 @@ ORDER BY id; Annoy supports `L2Distance` and `cosineDistance`. -In the `SELECT` in the settings (`ann_index_select_query_params`) you can specify the size of the internal buffer (more details in the description above or in the [original repository](https://github.com/spotify/annoy)). During the query it will inspect up to `search_k` nodes which defaults to `n_trees * n` if not provided. `search_k` gives you a run-time tradeoff between better accuracy and speed. +In the `SELECT` in the settings (`ann_index_select_query_params`) you can specify the size of the internal buffer (more details in the description above or in the [original repository](https://github.com/spotify/annoy)). During the query it will inspect up to `search_k` nodes which defaults to `n_trees * n` if not provided. `search_k` gives you a run-time trade-off between better accuracy and speed. __Example__: ``` sql diff --git a/docs/en/engines/table-engines/mergetree-family/custom-partitioning-key.md b/docs/en/engines/table-engines/mergetree-family/custom-partitioning-key.md index edb320a2507..7e564b23676 100644 --- a/docs/en/engines/table-engines/mergetree-family/custom-partitioning-key.md +++ b/docs/en/engines/table-engines/mergetree-family/custom-partitioning-key.md @@ -165,7 +165,7 @@ Performance of such a query heavily depends on the table layout. Because of that The key factors for a good performance: -- number of partitions involved in the query should be sufficiently large (more than `max_threads / 2`), otherwise query will underutilize the machine +- number of partitions involved in the query should be sufficiently large (more than `max_threads / 2`), otherwise query will under-utilize the machine - partitions shouldn't be too small, so batch processing won't degenerate into row-by-row processing - partitions should be comparable in size, so all threads will do roughly the same amount of work diff --git a/docs/en/engines/table-engines/mergetree-family/mergetree.md b/docs/en/engines/table-engines/mergetree-family/mergetree.md index 79ced0b6ce5..1ab0f4057ff 100644 --- a/docs/en/engines/table-engines/mergetree-family/mergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/mergetree.md @@ -856,7 +856,7 @@ Tags: - `perform_ttl_move_on_insert` — Disables TTL move on data part INSERT. By default if we insert a data part that already expired by the TTL move rule it immediately goes to a volume/disk declared in move rule. This can significantly slowdown insert in case if destination volume/disk is slow (e.g. S3). - `load_balancing` - Policy for disk balancing, `round_robin` or `least_used`. -Cofiguration examples: +Configuration examples: ``` xml @@ -1224,7 +1224,7 @@ Limit parameters (mainly for internal usage): * `max_single_read_retries` - Limits the number of attempts to read a chunk of data from Blob Storage. * `max_single_download_retries` - Limits the number of attempts to download a readable buffer from Blob Storage. * `thread_pool_size` - Limits the number of threads with which `IDiskRemote` is instantiated. -* `s3_max_inflight_parts_for_one_file` - Limits the number of put requests that can be run concurenly for one object. +* `s3_max_inflight_parts_for_one_file` - Limits the number of put requests that can be run concurrently for one object. Other parameters: * `metadata_path` - Path on local FS to store metadata files for Blob Storage. Default value is `/var/lib/clickhouse/disks//`. diff --git a/docs/en/engines/table-engines/special/executable.md b/docs/en/engines/table-engines/special/executable.md index 25049d7b46e..d976beeab6c 100644 --- a/docs/en/engines/table-engines/special/executable.md +++ b/docs/en/engines/table-engines/special/executable.md @@ -65,7 +65,7 @@ if __name__ == "__main__": main() ``` -The following `my_executable_table` is built from the output of `my_script.py`, which will generate 10 random strings everytime you run a `SELECT` from `my_executable_table`: +The following `my_executable_table` is built from the output of `my_script.py`, which will generate 10 random strings every time you run a `SELECT` from `my_executable_table`: ```sql CREATE TABLE my_executable_table ( @@ -223,4 +223,4 @@ SETTINGS pool_size = 4; ``` -ClickHouse will maintain 4 processes on-demand when your client queries the `sentiment_pooled` table. \ No newline at end of file +ClickHouse will maintain 4 processes on-demand when your client queries the `sentiment_pooled` table. diff --git a/docs/en/engines/table-engines/special/keepermap.md b/docs/en/engines/table-engines/special/keepermap.md index a1c7009b712..6ce56adbae9 100644 --- a/docs/en/engines/table-engines/special/keepermap.md +++ b/docs/en/engines/table-engines/special/keepermap.md @@ -72,7 +72,7 @@ Additionally, number of keys will have a soft limit of 4 for the number of keys. If multiple tables are created on the same ZooKeeper path, the values are persisted until there exists at least 1 table using it. As a result, it is possible to use `ON CLUSTER` clause when creating the table and sharing the data from multiple ClickHouse instances. -Of course, it's possible to manually run `CREATE TABLE` with same path on nonrelated ClickHouse instances to have same data sharing effect. +Of course, it's possible to manually run `CREATE TABLE` with same path on unrelated ClickHouse instances to have same data sharing effect. ## Supported operations {#table_engine-KeeperMap-supported-operations} diff --git a/docs/en/getting-started/example-datasets/amazon-reviews.md b/docs/en/getting-started/example-datasets/amazon-reviews.md index f35806aa66f..75e4549cb78 100644 --- a/docs/en/getting-started/example-datasets/amazon-reviews.md +++ b/docs/en/getting-started/example-datasets/amazon-reviews.md @@ -87,7 +87,7 @@ ORDER BY (marketplace, review_date, product_category); 3. We are now ready to insert the data into ClickHouse. Before we do, check out the [list of files in the dataset](https://s3.amazonaws.com/amazon-reviews-pds/tsv/index.txt) and decide which ones you want to include. -4. We will insert all of the US reviews - which is about 151M rows. The following `INSERT` command uses the `s3Cluster` table function, which allows the processing of mulitple S3 files in parallel using all the nodes of your cluster. We also use a wildcard to insert any file that starts with the name `https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_`: +4. We will insert all of the US reviews - which is about 151M rows. The following `INSERT` command uses the `s3Cluster` table function, which allows the processing of multiple S3 files in parallel using all the nodes of your cluster. We also use a wildcard to insert any file that starts with the name `https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_`: ```sql INSERT INTO amazon_reviews @@ -473,4 +473,4 @@ It runs quite a bit faster - which means the cache is helping us out here: └────────────┴───────────────────────────────────────────────────────────────────────┴────────────────────┴───────┘ 50 rows in set. Elapsed: 33.954 sec. Processed 150.96 million rows, 68.95 GB (4.45 million rows/s., 2.03 GB/s.) -``` \ No newline at end of file +``` diff --git a/docs/en/getting-started/example-datasets/cell-towers.md b/docs/en/getting-started/example-datasets/cell-towers.md index 048eecb285b..a84eb5d561f 100644 --- a/docs/en/getting-started/example-datasets/cell-towers.md +++ b/docs/en/getting-started/example-datasets/cell-towers.md @@ -317,7 +317,7 @@ To build a Superset dashboard using the OpenCelliD dataset you should: Make sure that you set **SSL** on when connecting to ClickHouse Cloud or other ClickHouse systems that enforce the use of SSL. ::: - ![Add ClickHouse as a Superset datasource](@site/docs/en/getting-started/example-datasets/images/superset-connect-a-database.png) + ![Add ClickHouse as a Superset data source](@site/docs/en/getting-started/example-datasets/images/superset-connect-a-database.png) ### Add the table **cell_towers** as a Superset **dataset** @@ -364,5 +364,5 @@ The data is also available for interactive queries in the [Playground](https://p This [example](https://play.clickhouse.com/play?user=play#U0VMRUNUIG1jYywgY291bnQoKSBGUk9NIGNlbGxfdG93ZXJzIEdST1VQIEJZIG1jYyBPUkRFUiBCWSBjb3VudCgpIERFU0M=) will populate the username and even the query for you. -Although you cannot create tables in the Playground, you can run all of the queries and even use Superset (adjust the hostname and port number). +Although you cannot create tables in the Playground, you can run all of the queries and even use Superset (adjust the host name and port number). ::: diff --git a/docs/en/getting-started/example-datasets/github.md b/docs/en/getting-started/example-datasets/github.md index 02965ed5e33..9ed8782e512 100644 --- a/docs/en/getting-started/example-datasets/github.md +++ b/docs/en/getting-started/example-datasets/github.md @@ -806,7 +806,7 @@ FROM 31 rows in set. Elapsed: 0.043 sec. Processed 7.54 million rows, 40.53 MB (176.71 million rows/s., 950.40 MB/s.) ``` -Maybe a little more near the end of the month, but overall we keep a good even distribution. Again this is unrealiable due to the filtering of the docs filter during data insertion. +Maybe a little more near the end of the month, but overall we keep a good even distribution. Again this is unreliable due to the filtering of the docs filter during data insertion. ## Authors with the most diverse impact @@ -940,7 +940,7 @@ LIMIT 10 10 rows in set. Elapsed: 0.106 sec. Processed 798.15 thousand rows, 13.97 MB (7.51 million rows/s., 131.41 MB/s.) ``` -This makes sense because Alexey has been responsible for maintaining the Change log. But what if we use the basename of the file to identify his popular files - this allows for renames and should focus on code contributions. +This makes sense because Alexey has been responsible for maintaining the Change log. But what if we use the base name of the file to identify his popular files - this allows for renames and should focus on code contributions. [play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBiYXNlLAogICAgY291bnQoKSBBUyBjCkZST00gZ2l0X2NsaWNraG91c2UuZmlsZV9jaGFuZ2VzCldIRVJFIChhdXRob3IgPSAnQWxleGV5IE1pbG92aWRvdicpIEFORCAoZmlsZV9leHRlbnNpb24gSU4gKCdoJywgJ2NwcCcsICdzcWwnKSkKR1JPVVAgQlkgYmFzZW5hbWUocGF0aCkgQVMgYmFzZQpPUkRFUiBCWSBjIERFU0MKTElNSVQgMTA=) diff --git a/docs/en/getting-started/example-datasets/opensky.md b/docs/en/getting-started/example-datasets/opensky.md index 7093a2df04f..df28809495c 100644 --- a/docs/en/getting-started/example-datasets/opensky.md +++ b/docs/en/getting-started/example-datasets/opensky.md @@ -9,7 +9,7 @@ The data in this dataset is derived and cleaned from the full OpenSky dataset to Source: https://zenodo.org/record/5092942#.YRBCyTpRXYd -Martin Strohmeier, Xavier Olive, Jannis Lübbe, Matthias Schäfer, and Vincent Lenders +Martin Strohmeier, Xavier Olive, Jannis Luebbe, Matthias Schaefer, and Vincent Lenders "Crowdsourced air traffic data from the OpenSky Network 2019–2020" Earth System Science Data 13(2), 2021 https://doi.org/10.5194/essd-13-357-2021 diff --git a/docs/en/getting-started/example-datasets/reddit-comments.md b/docs/en/getting-started/example-datasets/reddit-comments.md index e1e372746c9..49c7bd25f9f 100644 --- a/docs/en/getting-started/example-datasets/reddit-comments.md +++ b/docs/en/getting-started/example-datasets/reddit-comments.md @@ -469,7 +469,7 @@ The response is: 10 rows in set. Elapsed: 27.824 sec. Processed 6.74 billion rows, 53.26 GB (242.22 million rows/s., 1.91 GB/s.) ``` -11. Let's see which subreddits had the biggest increase in commnents from 2018 to 2019: +11. Let's see which subreddits had the biggest increase in comments from 2018 to 2019: ```sql SELECT @@ -633,4 +633,4 @@ ORDER BY quarter ASC; └────────────┴────────────┴───────────┴──────────┘ 58 rows in set. Elapsed: 2663.751 sec. Processed 6.74 billion rows, 1.21 TB (2.53 million rows/s., 454.37 MB/s.) -``` \ No newline at end of file +``` diff --git a/docs/en/getting-started/example-datasets/youtube-dislikes.md b/docs/en/getting-started/example-datasets/youtube-dislikes.md index 5f4ef696b8b..e24c6e5a6dc 100644 --- a/docs/en/getting-started/example-datasets/youtube-dislikes.md +++ b/docs/en/getting-started/example-datasets/youtube-dislikes.md @@ -22,7 +22,7 @@ The steps below will easily work on a local install of ClickHouse too. The only ## Step-by-step instructions -1. Let's see what the data looks like. The `s3cluster` table function returns a table, so we can `DESCRIBE` the reult: +1. Let's see what the data looks like. The `s3cluster` table function returns a table, so we can `DESCRIBE` the result: ```sql DESCRIBE s3Cluster( @@ -322,7 +322,7 @@ ORDER BY month ASC; A spike of uploaders [around covid is noticeable](https://www.theverge.com/2020/3/27/21197642/youtube-with-me-style-videos-views-coronavirus-cook-workout-study-home-beauty). -### More subtitiles over time and when +### More subtitles over time and when With advances in speech recognition, it’s easier than ever to create subtitles for video with youtube adding auto-captioning in late 2009 - was the jump then? @@ -484,4 +484,4 @@ ARRAY JOIN │ 20th │ 16 │ │ 10th │ 6 │ └────────────┴─────────┘ -``` \ No newline at end of file +``` diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 4f3b4e43358..ad113c58bce 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -3492,7 +3492,7 @@ Default value: `0`. ## database_replicated_initial_query_timeout_sec {#database_replicated_initial_query_timeout_sec} -Sets how long initial DDL query should wait for Replicated database to precess previous DDL queue entries in seconds. +Sets how long initial DDL query should wait for Replicated database to process previous DDL queue entries in seconds. Possible values: diff --git a/docs/en/operations/system-tables/columns.md b/docs/en/operations/system-tables/columns.md index ccdc2d8c742..2915b053458 100644 --- a/docs/en/operations/system-tables/columns.md +++ b/docs/en/operations/system-tables/columns.md @@ -28,7 +28,7 @@ The `system.columns` table contains the following columns (the column type is sh - `is_in_sampling_key` ([UInt8](../../sql-reference/data-types/int-uint.md)) — Flag that indicates whether the column is in the sampling key expression. - `compression_codec` ([String](../../sql-reference/data-types/string.md)) — Compression codec name. - `character_octet_length` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Maximum length in bytes for binary data, character data, or text data and images. In ClickHouse makes sense only for `FixedString` data type. Otherwise, the `NULL` value is returned. -- `numeric_precision` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Accuracy of approximate numeric data, exact numeric data, integer data, or monetary data. In ClickHouse it is bitness for integer types and decimal precision for `Decimal` types. Otherwise, the `NULL` value is returned. +- `numeric_precision` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Accuracy of approximate numeric data, exact numeric data, integer data, or monetary data. In ClickHouse it is bit width for integer types and decimal precision for `Decimal` types. Otherwise, the `NULL` value is returned. - `numeric_precision_radix` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — The base of the number system is the accuracy of approximate numeric data, exact numeric data, integer data or monetary data. In ClickHouse it's 2 for integer types and 10 for `Decimal` types. Otherwise, the `NULL` value is returned. - `numeric_scale` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — The scale of approximate numeric data, exact numeric data, integer data, or monetary data. In ClickHouse makes sense only for `Decimal` types. Otherwise, the `NULL` value is returned. - `datetime_precision` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Decimal precision of `DateTime64` data type. For other data types, the `NULL` value is returned. diff --git a/docs/en/operations/system-tables/dropped_tables.md b/docs/en/operations/system-tables/dropped_tables.md index 144c03109ac..e2a09094c87 100644 --- a/docs/en/operations/system-tables/dropped_tables.md +++ b/docs/en/operations/system-tables/dropped_tables.md @@ -12,7 +12,7 @@ Columns: - `table` ([String](../../sql-reference/data-types/string.md)) — Table name. - `uuid` ([UUID](../../sql-reference/data-types/uuid.md)) — Table uuid. - `engine` ([String](../../sql-reference/data-types/string.md)) — Table engine name. -- `metadata_dropped_path` ([String](../../sql-reference/data-types/string.md)) — Path of table's metadata file in metadate_dropped directory. +- `metadata_dropped_path` ([String](../../sql-reference/data-types/string.md)) — Path of table's metadata file in metadata_dropped directory. - `table_dropped_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — The time when the next attempt to remove table's data is scheduled on. Usually it's the table when the table was dropped plus `database_atomic_delay_before_drop_table_sec` **Example** diff --git a/docs/en/operations/system-tables/information_schema.md b/docs/en/operations/system-tables/information_schema.md index 07e9a9e2f58..35fd3a753b5 100644 --- a/docs/en/operations/system-tables/information_schema.md +++ b/docs/en/operations/system-tables/information_schema.md @@ -43,7 +43,7 @@ Columns: - `data_type` ([String](../../sql-reference/data-types/string.md)) — Column type. - `character_maximum_length` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Maximum length in bytes for binary data, character data, or text data and images. In ClickHouse makes sense only for `FixedString` data type. Otherwise, the `NULL` value is returned. - `character_octet_length` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Maximum length in bytes for binary data, character data, or text data and images. In ClickHouse makes sense only for `FixedString` data type. Otherwise, the `NULL` value is returned. -- `numeric_precision` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Accuracy of approximate numeric data, exact numeric data, integer data, or monetary data. In ClickHouse it is bitness for integer types and decimal precision for `Decimal` types. Otherwise, the `NULL` value is returned. +- `numeric_precision` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Accuracy of approximate numeric data, exact numeric data, integer data, or monetary data. In ClickHouse it is bit width for integer types and decimal precision for `Decimal` types. Otherwise, the `NULL` value is returned. - `numeric_precision_radix` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — The base of the number system is the accuracy of approximate numeric data, exact numeric data, integer data or monetary data. In ClickHouse it's 2 for integer types and 10 for `Decimal` types. Otherwise, the `NULL` value is returned. - `numeric_scale` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — The scale of approximate numeric data, exact numeric data, integer data, or monetary data. In ClickHouse makes sense only for `Decimal` types. Otherwise, the `NULL` value is returned. - `datetime_precision` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Decimal precision of `DateTime64` data type. For other data types, the `NULL` value is returned. diff --git a/docs/en/operations/system-tables/licenses.md b/docs/en/operations/system-tables/licenses.md index 9296e78c797..0f09d559d8b 100644 --- a/docs/en/operations/system-tables/licenses.md +++ b/docs/en/operations/system-tables/licenses.md @@ -3,7 +3,7 @@ slug: /en/operations/system-tables/licenses --- # licenses -Сontains licenses of third-party libraries that are located in the [contrib](https://github.com/ClickHouse/ClickHouse/tree/master/contrib) directory of ClickHouse sources. +Contains licenses of third-party libraries that are located in the [contrib](https://github.com/ClickHouse/ClickHouse/tree/master/contrib) directory of ClickHouse sources. Columns: diff --git a/docs/en/operations/system-tables/parts.md b/docs/en/operations/system-tables/parts.md index e61c6ed2ba4..9159d1e9284 100644 --- a/docs/en/operations/system-tables/parts.md +++ b/docs/en/operations/system-tables/parts.md @@ -100,7 +100,7 @@ Columns: - `move_ttl_info.expression` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — Array of expressions. Each expression defines a [TTL MOVE rule](../../engines/table-engines/mergetree-family/mergetree.md/#table_engine-mergetree-ttl). :::note -The `move_ttl_info.expression` array is kept mostly for backward compatibility, now the simpliest way to check `TTL MOVE` rule is to use the `move_ttl_info.min` and `move_ttl_info.max` fields. +The `move_ttl_info.expression` array is kept mostly for backward compatibility, now the simplest way to check `TTL MOVE` rule is to use the `move_ttl_info.min` and `move_ttl_info.max` fields. ::: - `move_ttl_info.min` ([Array](../../sql-reference/data-types/array.md)([DateTime](../../sql-reference/data-types/datetime.md))) — Array of date and time values. Each element describes the minimum key value for a [TTL MOVE rule](../../engines/table-engines/mergetree-family/mergetree.md/#table_engine-mergetree-ttl). diff --git a/docs/en/operations/system-tables/tables.md b/docs/en/operations/system-tables/tables.md index 82e9fa206ea..e4461e14236 100644 --- a/docs/en/operations/system-tables/tables.md +++ b/docs/en/operations/system-tables/tables.md @@ -50,7 +50,7 @@ Columns: - [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-multiple-volumes) - [Distributed](../../engines/table-engines/special/distributed.md#distributed) -- `total_rows` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) - Total number of rows, if it is possible to quickly determine exact number of rows in the table, otherwise `NULL` (including underying `Buffer` table). +- `total_rows` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) - Total number of rows, if it is possible to quickly determine exact number of rows in the table, otherwise `NULL` (including underlying `Buffer` table). - `total_bytes` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) - Total number of bytes, if it is possible to quickly determine exact number of bytes for the table on storage, otherwise `NULL` (does not includes any underlying storage). diff --git a/docs/en/operations/system-tables/trace_log.md b/docs/en/operations/system-tables/trace_log.md index a5aae422be7..89d54adc30d 100644 --- a/docs/en/operations/system-tables/trace_log.md +++ b/docs/en/operations/system-tables/trace_log.md @@ -43,7 +43,7 @@ Columns: - `event` ([LowCardinality(String)](../../sql-reference/data-types/lowcardinality.md)) - For trace type `ProfileEvent` is the name of updated profile event, for other trace types is an empty string. -- `increment` ([UInt64](../../sql-reference/data-types/int-uint.md)) - For trace type `ProfileEvent` is the amount of incremnt of profile event, for other trace types is 0. +- `increment` ([UInt64](../../sql-reference/data-types/int-uint.md)) - For trace type `ProfileEvent` is the amount of increment of profile event, for other trace types is 0. **Example** diff --git a/docs/en/operations/system-tables/zookeeper_log.md b/docs/en/operations/system-tables/zookeeper_log.md index 970ed192a48..b7cc4e22cd6 100644 --- a/docs/en/operations/system-tables/zookeeper_log.md +++ b/docs/en/operations/system-tables/zookeeper_log.md @@ -33,7 +33,7 @@ Columns with request response parameters: - `zxid` ([Int64](../../sql-reference/data-types/int-uint.md)) — ZooKeeper transaction ID. The serial number issued by the ZooKeeper server in response to a successfully executed request (`0` if the request was not executed/returned an error/the client does not know whether the request was executed). - `error` ([Nullable(Enum)](../../sql-reference/data-types/nullable.md)) — Error code. Can have many values, here are just some of them: - - `ZOK` — The request was executed seccessfully. + - `ZOK` — The request was executed successfully. - `ZCONNECTIONLOSS` — The connection was lost. - `ZOPERATIONTIMEOUT` — The request execution timeout has expired. - `ZSESSIONEXPIRED` — The session has expired. @@ -43,7 +43,7 @@ Columns with request response parameters: - `path_created` ([String](../../sql-reference/data-types/string.md)) — The path to the created ZooKeeper node (for responses to the `CREATE` request), may differ from the `path` if the node is created as a `sequential`. - `stat_czxid` ([Int64](../../sql-reference/data-types/int-uint.md)) — The `zxid` of the change that caused this ZooKeeper node to be created. - `stat_mzxid` ([Int64](../../sql-reference/data-types/int-uint.md)) — The `zxid` of the change that last modified this ZooKeeper node. -- `stat_pzxid` ([Int64](../../sql-reference/data-types/int-uint.md)) — The transaction ID of the change that last modified childern of this ZooKeeper node. +- `stat_pzxid` ([Int64](../../sql-reference/data-types/int-uint.md)) — The transaction ID of the change that last modified children of this ZooKeeper node. - `stat_version` ([Int32](../../sql-reference/data-types/int-uint.md)) — The number of changes to the data of this ZooKeeper node. - `stat_cversion` ([Int32](../../sql-reference/data-types/int-uint.md)) — The number of changes to the children of this ZooKeeper node. - `stat_dataLength` ([Int32](../../sql-reference/data-types/int-uint.md)) — The length of the data field of this ZooKeeper node. diff --git a/docs/en/operations/utilities/clickhouse-obfuscator.md b/docs/en/operations/utilities/clickhouse-obfuscator.md index 077489ba76e..ad51e9c7776 100644 --- a/docs/en/operations/utilities/clickhouse-obfuscator.md +++ b/docs/en/operations/utilities/clickhouse-obfuscator.md @@ -24,7 +24,7 @@ It is designed to retain the following properties of data: Most of the properties above are viable for performance testing: -reading data, filtering, aggregatio, and sorting will work at almost the same speed +reading data, filtering, aggregation, and sorting will work at almost the same speed as on original data due to saved cardinalities, magnitudes, compression ratios, etc. It works in a deterministic fashion: you define a seed value and the transformation is determined by input data and by seed. diff --git a/docs/en/sql-reference/aggregate-functions/reference/cramersv.md b/docs/en/sql-reference/aggregate-functions/reference/cramersv.md index f412724ea08..e9e2c367610 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/cramersv.md +++ b/docs/en/sql-reference/aggregate-functions/reference/cramersv.md @@ -5,7 +5,7 @@ sidebar_position: 351 # cramersV -[Cramér's V](https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V) (sometimes referred to as Cramér's phi) is a measure of association between two columns in a table. The result of the `cramersV` function ranges from 0 (corresponding to no association between the variables) to 1 and can reach 1 only when each value is completely determined by the other. It may be viewed as the association between two variables as a percentage of their maximum possible variation. +[Cramer's V](https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V) (sometimes referred to as Cramer's phi) is a measure of association between two columns in a table. The result of the `cramersV` function ranges from 0 (corresponding to no association between the variables) to 1 and can reach 1 only when each value is completely determined by the other. It may be viewed as the association between two variables as a percentage of their maximum possible variation. **Syntax** @@ -69,4 +69,4 @@ Result: ┌─────cramersV(a, b)─┐ │ 0.8944271909999159 │ └────────────────────┘ -``` \ No newline at end of file +``` diff --git a/docs/en/sql-reference/aggregate-functions/reference/cramersvbiascorrected.md b/docs/en/sql-reference/aggregate-functions/reference/cramersvbiascorrected.md index 8e577efbc4d..f5ad3a8a937 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/cramersvbiascorrected.md +++ b/docs/en/sql-reference/aggregate-functions/reference/cramersvbiascorrected.md @@ -6,7 +6,7 @@ sidebar_position: 352 # cramersVBiasCorrected -Cramér's V is a measure of association between two columns in a table. The result of the [`cramersV` function](./cramersv.md) ranges from 0 (corresponding to no association between the variables) to 1 and can reach 1 only when each value is completely determined by the other. The function can be heavily biased, so this version of Cramér's V uses the [bias correction](https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V#Bias_correction). +Cramer's V is a measure of association between two columns in a table. The result of the [`cramersV` function](./cramersv.md) ranges from 0 (corresponding to no association between the variables) to 1 and can reach 1 only when each value is completely determined by the other. The function can be heavily biased, so this version of Cramer's V uses the [bias correction](https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V#Bias_correction). diff --git a/docs/en/sql-reference/aggregate-functions/reference/groupbitand.md b/docs/en/sql-reference/aggregate-functions/reference/groupbitand.md index 5f57407a419..3d833555a43 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/groupbitand.md +++ b/docs/en/sql-reference/aggregate-functions/reference/groupbitand.md @@ -5,7 +5,7 @@ sidebar_position: 125 # groupBitAnd -Applies bitwise `AND` for series of numbers. +Applies bit-wise `AND` for series of numbers. ``` sql groupBitAnd(expr) diff --git a/docs/en/sql-reference/aggregate-functions/reference/groupbitor.md b/docs/en/sql-reference/aggregate-functions/reference/groupbitor.md index 59be69540b0..138ee998405 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/groupbitor.md +++ b/docs/en/sql-reference/aggregate-functions/reference/groupbitor.md @@ -5,7 +5,7 @@ sidebar_position: 126 # groupBitOr -Applies bitwise `OR` for series of numbers. +Applies bit-wise `OR` for series of numbers. ``` sql groupBitOr(expr) diff --git a/docs/en/sql-reference/aggregate-functions/reference/groupbitxor.md b/docs/en/sql-reference/aggregate-functions/reference/groupbitxor.md index b00876a2fdf..168335a010c 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/groupbitxor.md +++ b/docs/en/sql-reference/aggregate-functions/reference/groupbitxor.md @@ -5,7 +5,7 @@ sidebar_position: 127 # groupBitXor -Applies bitwise `XOR` for series of numbers. +Applies bit-wise `XOR` for series of numbers. ``` sql groupBitXor(expr) diff --git a/docs/en/sql-reference/aggregate-functions/reference/kolmogorovsmirnovtest.md b/docs/en/sql-reference/aggregate-functions/reference/kolmogorovsmirnovtest.md index 3da9645181e..d159eec7ce6 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/kolmogorovsmirnovtest.md +++ b/docs/en/sql-reference/aggregate-functions/reference/kolmogorovsmirnovtest.md @@ -30,11 +30,11 @@ Samples must belong to continuous, one-dimensional probability distributions. The null hypothesis is that samples come from the same distribution, e.g. F(x) = G(x) for all x. And the alternative is that the distributions are not identical. - `'greater'` - The null hypothesis is that values in the first sample are *stohastically smaller* than those in the second one, + The null hypothesis is that values in the first sample are *stochastically smaller* than those in the second one, e.g. the CDF of first distribution lies above and hence to the left of that for the second one. Which in fact means that F(x) >= G(x) for all x. And the alternative in this case is that F(x) < G(x) for at least one x. - `'less'`. - The null hypothesis is that values in the first sample are *stohastically greater* than those in the second one, + The null hypothesis is that values in the first sample are *stochastically greater* than those in the second one, e.g. the CDF of first distribution lies below and hence to the right of that for the second one. Which in fact means that F(x) <= G(x) for all x. And the alternative in this case is that F(x) > G(x) for at least one x. - `computation_method` — the method used to compute p-value. (Optional, default: `'auto'`.) [String](../../../sql-reference/data-types/string.md). diff --git a/docs/en/sql-reference/aggregate-functions/reference/quantiletdigestweighted.md b/docs/en/sql-reference/aggregate-functions/reference/quantiletdigestweighted.md index 7b9addbbdde..b3e21e0e69e 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/quantiletdigestweighted.md +++ b/docs/en/sql-reference/aggregate-functions/reference/quantiletdigestweighted.md @@ -14,7 +14,7 @@ The result depends on the order of running the query, and is nondeterministic. When using multiple `quantile*` functions with different levels in a query, the internal states are not combined (that is, the query works less efficiently than it could). In this case, use the [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) function. :::note -Using `quantileTDigestWeighted` [is not recommended for tiny data sets](https://github.com/tdunning/t-digest/issues/167#issuecomment-828650275) and can lead to significat error. In this case, consider possibility of using [`quantileTDigest`](../../../sql-reference/aggregate-functions/reference/quantiletdigest.md) instead. +Using `quantileTDigestWeighted` [is not recommended for tiny data sets](https://github.com/tdunning/t-digest/issues/167#issuecomment-828650275) and can lead to significant error. In this case, consider possibility of using [`quantileTDigest`](../../../sql-reference/aggregate-functions/reference/quantiletdigest.md) instead. ::: **Syntax** diff --git a/docs/en/sql-reference/aggregate-functions/reference/stochasticlinearregression.md b/docs/en/sql-reference/aggregate-functions/reference/stochasticlinearregression.md index 9481172c25b..f7615d90790 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/stochasticlinearregression.md +++ b/docs/en/sql-reference/aggregate-functions/reference/stochasticlinearregression.md @@ -18,7 +18,7 @@ stochasticLinearRegression(1.0, 1.0, 10, 'SGD') 1. `learning rate` is the coefficient on step length, when gradient descent step is performed. Too big learning rate may cause infinite weights of the model. Default is `0.00001`. 2. `l2 regularization coefficient` which may help to prevent overfitting. Default is `0.1`. 3. `mini-batch size` sets the number of elements, which gradients will be computed and summed to perform one step of gradient descent. Pure stochastic descent uses one element, however having small batches(about 10 elements) make gradient steps more stable. Default is `15`. -4. `method for updating weights`, they are: `Adam` (by default), `SGD`, `Momentum`, `Nesterov`. `Momentum` and `Nesterov` require little bit more computations and memory, however they happen to be useful in terms of speed of convergance and stability of stochastic gradient methods. +4. `method for updating weights`, they are: `Adam` (by default), `SGD`, `Momentum`, `Nesterov`. `Momentum` and `Nesterov` require little bit more computations and memory, however they happen to be useful in terms of speed of convergence and stability of stochastic gradient methods. ### Usage diff --git a/docs/en/sql-reference/data-types/datetime.md b/docs/en/sql-reference/data-types/datetime.md index 059c6acdb9e..0da273e01ad 100644 --- a/docs/en/sql-reference/data-types/datetime.md +++ b/docs/en/sql-reference/data-types/datetime.md @@ -22,7 +22,7 @@ Resolution: 1 second. The point in time is saved as a [Unix timestamp](https://en.wikipedia.org/wiki/Unix_time), regardless of the time zone or daylight saving time. The time zone affects how the values of the `DateTime` type values are displayed in text format and how the values specified as strings are parsed (‘2020-01-01 05:00:01’). -Timezone agnostic unix timestamp is stored in tables, and the timezone is used to transform it to text format or back during data import/export or to make calendar calculations on the values (example: `toDate`, `toHour` functions et cetera). The time zone is not stored in the rows of the table (or in resultset), but is stored in the column metadata. +Timezone agnostic Unix timestamp is stored in tables, and the timezone is used to transform it to text format or back during data import/export or to make calendar calculations on the values (example: `toDate`, `toHour` functions etc.). The time zone is not stored in the rows of the table (or in resultset), but is stored in the column metadata. A list of supported time zones can be found in the [IANA Time Zone Database](https://www.iana.org/time-zones) and also can be queried by `SELECT * FROM system.time_zones`. [The list](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones) is also available at Wikipedia. @@ -30,7 +30,7 @@ You can explicitly set a time zone for `DateTime`-type columns when creating a t The [clickhouse-client](../../interfaces/cli.md) applies the server time zone by default if a time zone isn’t explicitly set when initializing the data type. To use the client time zone, run `clickhouse-client` with the `--use_client_time_zone` parameter. -ClickHouse outputs values depending on the value of the [date_time_output_format](../../operations/settings/settings.md#settings-date_time_output_format) setting. `YYYY-MM-DD hh:mm:ss` text format by default. Additionaly you can change the output with the [formatDateTime](../../sql-reference/functions/date-time-functions.md#formatdatetime) function. +ClickHouse outputs values depending on the value of the [date_time_output_format](../../operations/settings/settings.md#settings-date_time_output_format) setting. `YYYY-MM-DD hh:mm:ss` text format by default. Additionally, you can change the output with the [formatDateTime](../../sql-reference/functions/date-time-functions.md#formatdatetime) function. When inserting data into ClickHouse, you can use different formats of date and time strings, depending on the value of the [date_time_input_format](../../operations/settings/settings.md#settings-date_time_input_format) setting. @@ -120,9 +120,9 @@ FROM dt As timezone conversion only changes the metadata, the operation has no computation cost. -## Limitations on timezones support +## Limitations on time zones support -Some timezones may not be supported completely. There are a few cases: +Some time zones may not be supported completely. There are a few cases: If the offset from UTC is not a multiple of 15 minutes, the calculation of hours and minutes can be incorrect. For example, the time zone in Monrovia, Liberia has offset UTC -0:44:30 before 7 Jan 1972. If you are doing calculations on the historical time in Monrovia timezone, the time processing functions may give incorrect results. The results after 7 Jan 1972 will be correct nevertheless. diff --git a/docs/en/sql-reference/data-types/index.md b/docs/en/sql-reference/data-types/index.md index 88663968e50..508307a0543 100644 --- a/docs/en/sql-reference/data-types/index.md +++ b/docs/en/sql-reference/data-types/index.md @@ -27,7 +27,7 @@ ClickHouse data types include: - **Aggregation function types**: use [`SimpleAggregateFunction`](./simpleaggregatefunction.md) and [`AggregateFunction`](./aggregatefunction.md) for storing the intermediate status of aggregate function results - **Nested data structures**: A [`Nested` data structure](./nested-data-structures/index.md) is like a table inside a cell - **Tuples**: A [`Tuple` of elements](./tuple.md), each having an individual type. -- **Nullable**: [`Nullable`](./nullable.md) allows you to store a value as `NULL` when a value is "missing" (instead of the column gettings its default value for the data type) +- **Nullable**: [`Nullable`](./nullable.md) allows you to store a value as `NULL` when a value is "missing" (instead of the column settings its default value for the data type) - **IP addresses**: use [`IPv4`](./domains/ipv4.md) and [`IPv6`](./domains/ipv6.md) to efficiently store IP addresses - **Geo types**: for [geographical data](./geo.md), including `Point`, `Ring`, `Polygon` and `MultiPolygon` - **Special data types**: including [`Expression`](./special-data-types/expression.md), [`Set`](./special-data-types/set.md), [`Nothing`](./special-data-types/nothing.md) and [`Interval`](./special-data-types/interval.md) diff --git a/docs/en/sql-reference/dictionaries/index.md b/docs/en/sql-reference/dictionaries/index.md index f7b4be64851..3a968992c13 100644 --- a/docs/en/sql-reference/dictionaries/index.md +++ b/docs/en/sql-reference/dictionaries/index.md @@ -984,7 +984,7 @@ SOURCE(ODBC(... invalidate_query 'SELECT update_time FROM dictionary_source wher ... ``` -For `Cache`, `ComplexKeyCache`, `SSDCache`, and `SSDComplexKeyCache` dictionaries both synchronious and asynchronous updates are supported. +For `Cache`, `ComplexKeyCache`, `SSDCache`, and `SSDComplexKeyCache` dictionaries both synchronous and asynchronous updates are supported. It is also possible for `Flat`, `Hashed`, `ComplexKeyHashed` dictionaries to only request data that was changed after the previous update. If `update_field` is specified as part of the dictionary source configuration, value of the previous update time in seconds will be added to the data request. Depends on source type (Executable, HTTP, MySQL, PostgreSQL, ClickHouse, or ODBC) different logic will be applied to `update_field` before request data from an external source. diff --git a/docs/en/sql-reference/functions/bit-functions.md b/docs/en/sql-reference/functions/bit-functions.md index 5b342fe4f24..3c07fe8bcbe 100644 --- a/docs/en/sql-reference/functions/bit-functions.md +++ b/docs/en/sql-reference/functions/bit-functions.md @@ -226,7 +226,7 @@ Result: Returns result of [logical conjuction](https://en.wikipedia.org/wiki/Logical_conjunction) (AND operator) of all bits at given positions. The countdown starts from 0 from the right to the left. -The conjuction for bitwise operations: +The conjuction for bit-wise operations: 0 AND 0 = 0 @@ -291,7 +291,7 @@ Result: Returns result of [logical disjunction](https://en.wikipedia.org/wiki/Logical_disjunction) (OR operator) of all bits at given positions. The countdown starts from 0 from the right to the left. -The disjunction for bitwise operations: +The disjunction for bit-wise operations: 0 OR 0 = 0 diff --git a/docs/en/sql-reference/functions/encryption-functions.md b/docs/en/sql-reference/functions/encryption-functions.md index 1224b7bc92b..b11bee83582 100644 --- a/docs/en/sql-reference/functions/encryption-functions.md +++ b/docs/en/sql-reference/functions/encryption-functions.md @@ -31,9 +31,9 @@ encrypt('mode', 'plaintext', 'key' [, iv, aad]) **Arguments** - `mode` — Encryption mode. [String](../../sql-reference/data-types/string.md#string). -- `plaintext` — Text thats need to be encrypted. [String](../../sql-reference/data-types/string.md#string). +- `plaintext` — Text that need to be encrypted. [String](../../sql-reference/data-types/string.md#string). - `key` — Encryption key. [String](../../sql-reference/data-types/string.md#string). -- `iv` — Initialization vector. Required for `-gcm` modes, optinal for others. [String](../../sql-reference/data-types/string.md#string). +- `iv` — Initialization vector. Required for `-gcm` modes, optional for others. [String](../../sql-reference/data-types/string.md#string). - `aad` — Additional authenticated data. It isn't encrypted, but it affects decryption. Works only in `-gcm` modes, for others would throw an exception. [String](../../sql-reference/data-types/string.md#string). **Returned value** @@ -233,7 +233,7 @@ decrypt('mode', 'ciphertext', 'key' [, iv, aad]) - `mode` — Decryption mode. [String](../../sql-reference/data-types/string.md#string). - `ciphertext` — Encrypted text that needs to be decrypted. [String](../../sql-reference/data-types/string.md#string). - `key` — Decryption key. [String](../../sql-reference/data-types/string.md#string). -- `iv` — Initialization vector. Required for `-gcm` modes, optinal for others. [String](../../sql-reference/data-types/string.md#string). +- `iv` — Initialization vector. Required for `-gcm` modes, Optional for others. [String](../../sql-reference/data-types/string.md#string). - `aad` — Additional authenticated data. Won't decrypt if this value is incorrect. Works only in `-gcm` modes, for others would throw an exception. [String](../../sql-reference/data-types/string.md#string). **Returned value** @@ -364,7 +364,7 @@ aes_decrypt_mysql('mode', 'ciphertext', 'key' [, iv]) - `mode` — Decryption mode. [String](../../sql-reference/data-types/string.md#string). - `ciphertext` — Encrypted text that needs to be decrypted. [String](../../sql-reference/data-types/string.md#string). - `key` — Decryption key. [String](../../sql-reference/data-types/string.md#string). -- `iv` — Initialization vector. Optinal. [String](../../sql-reference/data-types/string.md#string). +- `iv` — Initialization vector. Optional. [String](../../sql-reference/data-types/string.md#string). **Returned value** diff --git a/docs/en/sql-reference/functions/geo/h3.md b/docs/en/sql-reference/functions/geo/h3.md index 1f695a13598..29486c58e6a 100644 --- a/docs/en/sql-reference/functions/geo/h3.md +++ b/docs/en/sql-reference/functions/geo/h3.md @@ -12,7 +12,7 @@ A latitude and longitude pair can be transformed to a 64-bit H3 index, identifyi The H3 index is used primarily for bucketing locations and other geospatial manipulations. -The full description of the H3 system is available at [the Uber Engeneering site](https://eng.uber.com/h3/). +The full description of the H3 system is available at [the Uber Engineering site](https://eng.uber.com/h3/). ## h3IsValid diff --git a/docs/en/sql-reference/functions/geo/s2.md b/docs/en/sql-reference/functions/geo/s2.md index 63fe5ca8530..f4702eff44b 100644 --- a/docs/en/sql-reference/functions/geo/s2.md +++ b/docs/en/sql-reference/functions/geo/s2.md @@ -249,7 +249,7 @@ s2RectAdd(s2pointLow, s2pointHigh, s2Point) **Returned values** - `s2PointLow` — Low S2 cell id corresponding to the grown rectangle. Type: [UInt64](../../../sql-reference/data-types/int-uint.md). -- `s2PointHigh` — Hight S2 cell id corresponding to the grown rectangle. Type: [UInt64](../../../sql-reference/data-types/float.md). +- `s2PointHigh` — Height S2 cell id corresponding to the grown rectangle. Type: [UInt64](../../../sql-reference/data-types/float.md). **Example** diff --git a/docs/en/sql-reference/functions/hash-functions.md b/docs/en/sql-reference/functions/hash-functions.md index 89afcca3799..2cf3408534f 100644 --- a/docs/en/sql-reference/functions/hash-functions.md +++ b/docs/en/sql-reference/functions/hash-functions.md @@ -1161,7 +1161,7 @@ wordShingleSimHashUTF8(string[, shinglesize]) **Arguments** - `string` — String. [String](/docs/en/sql-reference/data-types/string.md). -- `shinglesize` — The size of a word shingle. Optinal. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](/docs/en/sql-reference/data-types/int-uint.md). +- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](/docs/en/sql-reference/data-types/int-uint.md). **Returned value** diff --git a/docs/en/sql-reference/functions/logical-functions.md b/docs/en/sql-reference/functions/logical-functions.md index f5a1a6aac12..17a9fbb19fe 100644 --- a/docs/en/sql-reference/functions/logical-functions.md +++ b/docs/en/sql-reference/functions/logical-functions.md @@ -31,7 +31,7 @@ Alias: The [AND Operator](../../sql-reference/operators/index.md#logical-and-ope **Returned value** - `0`, if there at least one argument evaluates to `false`, -- `NULL`, if no argumetn evaluates to `false` and at least one argument is `NULL`, +- `NULL`, if no argument evaluates to `false` and at least one argument is `NULL`, - `1`, otherwise. Type: [UInt8](../../sql-reference/data-types/int-uint.md) or [Nullable](../../sql-reference/data-types/nullable.md)([UInt8](../../sql-reference/data-types/int-uint.md)). diff --git a/docs/en/sql-reference/functions/math-functions.md b/docs/en/sql-reference/functions/math-functions.md index 9851378d4fd..22492f2830b 100644 --- a/docs/en/sql-reference/functions/math-functions.md +++ b/docs/en/sql-reference/functions/math-functions.md @@ -52,7 +52,7 @@ Alias: `ln(x)` ## exp2 -Returns 2 to the power of the given argumetn +Returns 2 to the power of the given argument **Syntax** diff --git a/docs/en/sql-reference/functions/other-functions.md b/docs/en/sql-reference/functions/other-functions.md index efe1a77c285..41ccfe121a4 100644 --- a/docs/en/sql-reference/functions/other-functions.md +++ b/docs/en/sql-reference/functions/other-functions.md @@ -206,7 +206,7 @@ Type: [UInt64](../../sql-reference/data-types/int-uint.md). **Examples** -For [String](../../sql-reference/data-types/string.md) arguments the funtion returns the string length + 9 (terminating zero + length). +For [String](../../sql-reference/data-types/string.md) arguments the function returns the string length + 9 (terminating zero + length). Query: @@ -1352,7 +1352,7 @@ ORDER BY k ASC ClickHouse used the index in the same way as the previous time (`Processed 32.74 thousand rows`). The expression `k = '2017-09-15'` was not used when generating the result. -In examle the `indexHint` function allows to see adjacent dates. +In example the `indexHint` function allows to see adjacent dates. Result: diff --git a/docs/en/sql-reference/functions/string-functions.md b/docs/en/sql-reference/functions/string-functions.md index 8662d08431c..5175bbf0615 100644 --- a/docs/en/sql-reference/functions/string-functions.md +++ b/docs/en/sql-reference/functions/string-functions.md @@ -393,7 +393,7 @@ Reverses a sequence of Unicode code points in a string. Assumes that the string ## format -Format the `pattern` string with the strings listed in the arguments, similar to formatting in Python. The pattern string can contain replacement fields surrounded by curly braces `{}`. Anything not contained in braces is considered literal text and copied verbatim into the output. Literal brace character can be escaped by two braces: `{{ '{{' }}` and `{{ '}}' }}`. Field names can be numbers (starting from zero) or empty (then they are implicitely given monotonically increasing numbers). +Format the `pattern` string with the strings listed in the arguments, similar to formatting in Python. The pattern string can contain replacement fields surrounded by curly braces `{}`. Anything not contained in braces is considered literal text and copied verbatim into the output. Literal brace character can be escaped by two braces: `{{ '{{' }}` and `{{ '}}' }}`. Field names can be numbers (starting from zero) or empty (then they are implicitly given monotonically increasing numbers). **Syntax** diff --git a/docs/en/sql-reference/functions/string-replace-functions.md b/docs/en/sql-reference/functions/string-replace-functions.md index 56c527d734e..74d5d747193 100644 --- a/docs/en/sql-reference/functions/string-replace-functions.md +++ b/docs/en/sql-reference/functions/string-replace-functions.md @@ -6,7 +6,7 @@ sidebar_label: Replacing in Strings # Functions for Replacing in Strings -[General strings functions](string-functions.md) and [functions for searchin in strings](string-search-functions.md) are described separately. +[General strings functions](string-functions.md) and [functions for searching in strings](string-search-functions.md) are described separately. ## replaceOne diff --git a/docs/en/sql-reference/functions/udf.md b/docs/en/sql-reference/functions/udf.md index a58c1364780..9c6b1b0c66b 100644 --- a/docs/en/sql-reference/functions/udf.md +++ b/docs/en/sql-reference/functions/udf.md @@ -19,7 +19,7 @@ A function configuration contains the following settings: - `argument` - argument description with the `type`, and optional `name` of an argument. Each argument is described in a separate setting. Specifying name is necessary if argument names are part of serialization for user defined function format like [Native](../../interfaces/formats.md#native) or [JSONEachRow](../../interfaces/formats.md#jsoneachrow). Default argument name value is `c` + argument_number. - `format` - a [format](../../interfaces/formats.md) in which arguments are passed to the command. - `return_type` - the type of a returned value. -- `return_name` - name of retuned value. Specifying return name is necessary if return name is part of serialization for user defined function format like [Native](../../interfaces/formats.md#native) or [JSONEachRow](../../interfaces/formats.md#jsoneachrow). Optional. Default value is `result`. +- `return_name` - name of returned value. Specifying return name is necessary if return name is part of serialization for user defined function format like [Native](../../interfaces/formats.md#native) or [JSONEachRow](../../interfaces/formats.md#jsoneachrow). Optional. Default value is `result`. - `type` - an executable type. If `type` is set to `executable` then single command is started. If it is set to `executable_pool` then a pool of commands is created. - `max_command_execution_time` - maximum execution time in seconds for processing block of data. This setting is valid for `executable_pool` commands only. Optional. Default value is `10`. - `command_termination_timeout` - time in seconds during which a command should finish after its pipe is closed. After that time `SIGTERM` is sent to the process executing the command. Optional. Default value is `10`. diff --git a/docs/en/sql-reference/operators/in.md b/docs/en/sql-reference/operators/in.md index 8a8c86624d2..bfad16f8365 100644 --- a/docs/en/sql-reference/operators/in.md +++ b/docs/en/sql-reference/operators/in.md @@ -222,7 +222,7 @@ It also makes sense to specify a local table in the `GLOBAL IN` clause, in case ### Distributed Subqueries and max_rows_in_set -You can use [`max_rows_in_set`](../../operations/settings/query-complexity.md#max-rows-in-set) and [`max_bytes_in_set`](../../operations/settings/query-complexity.md#max-rows-in-set) to control how much data is tranferred during distributed queries. +You can use [`max_rows_in_set`](../../operations/settings/query-complexity.md#max-rows-in-set) and [`max_bytes_in_set`](../../operations/settings/query-complexity.md#max-rows-in-set) to control how much data is transferred during distributed queries. This is specially important if the `global in` query returns a large amount of data. Consider the following sql - ```sql diff --git a/docs/en/sql-reference/statements/alter/quota.md b/docs/en/sql-reference/statements/alter/quota.md index 74a184c1479..d41e2ff0f61 100644 --- a/docs/en/sql-reference/statements/alter/quota.md +++ b/docs/en/sql-reference/statements/alter/quota.md @@ -32,7 +32,7 @@ Limit the maximum number of queries for the current user with 123 queries in 15 ALTER QUOTA IF EXISTS qA FOR INTERVAL 15 month MAX queries = 123 TO CURRENT_USER; ``` -For the default user limit the maximum execution time with half a second in 30 minutes, and limit the maximum number of queries with 321 and the maximum number of errors with 10 in 5 quaters: +For the default user limit the maximum execution time with half a second in 30 minutes, and limit the maximum number of queries with 321 and the maximum number of errors with 10 in 5 quarters: ``` sql ALTER QUOTA IF EXISTS qB FOR INTERVAL 30 minute MAX execution_time = 0.5, FOR INTERVAL 5 quarter MAX queries = 321, errors = 10 TO default; diff --git a/docs/en/sql-reference/statements/create/quota.md b/docs/en/sql-reference/statements/create/quota.md index 7c31f93fff7..c69285171ab 100644 --- a/docs/en/sql-reference/statements/create/quota.md +++ b/docs/en/sql-reference/statements/create/quota.md @@ -32,7 +32,7 @@ Limit the maximum number of queries for the current user with 123 queries in 15 CREATE QUOTA qA FOR INTERVAL 15 month MAX queries = 123 TO CURRENT_USER; ``` -For the default user limit the maximum execution time with half a second in 30 minutes, and limit the maximum number of queries with 321 and the maximum number of errors with 10 in 5 quaters: +For the default user limit the maximum execution time with half a second in 30 minutes, and limit the maximum number of queries with 321 and the maximum number of errors with 10 in 5 quarters: ``` sql CREATE QUOTA qB FOR INTERVAL 30 minute MAX execution_time = 0.5, FOR INTERVAL 5 quarter MAX queries = 321, errors = 10 TO default; diff --git a/docs/en/sql-reference/statements/explain.md b/docs/en/sql-reference/statements/explain.md index 1c93707402f..2d7204c2796 100644 --- a/docs/en/sql-reference/statements/explain.md +++ b/docs/en/sql-reference/statements/explain.md @@ -115,7 +115,7 @@ CROSS JOIN system.numbers AS c Settings: -- `run_passes` — Run all query tree passes before dumping the query tree. Defaul: `1`. +- `run_passes` — Run all query tree passes before dumping the query tree. Default: `1`. - `dump_passes` — Dump information about used passes before dumping the query tree. Default: `0`. - `passes` — Specifies how many passes to run. If set to `-1`, runs all the passes. Default: `-1`. @@ -463,5 +463,5 @@ Result: ``` :::note -The validation is not complete, so a successfull query does not guarantee that the override would not cause issues. +The validation is not complete, so a successful query does not guarantee that the override would not cause issues. ::: diff --git a/docs/en/sql-reference/statements/select/from.md b/docs/en/sql-reference/statements/select/from.md index 4ca8e8287c0..a4f449ad321 100644 --- a/docs/en/sql-reference/statements/select/from.md +++ b/docs/en/sql-reference/statements/select/from.md @@ -34,7 +34,7 @@ Queries that use `FINAL` are executed slightly slower than similar queries that - Data is merged during query execution. - Queries with `FINAL` read primary key columns in addition to the columns specified in the query. -**In most cases, avoid using `FINAL`.** The common approach is to use different queries that assume the background processes of the `MergeTree` engine have’t happened yet and deal with it by applying aggregation (for example, to discard duplicates). +**In most cases, avoid using `FINAL`.** The common approach is to use different queries that assume the background processes of the `MergeTree` engine haven’t happened yet and deal with it by applying aggregation (for example, to discard duplicates). `FINAL` can be applied automatically using [FINAL](../../../operations/settings/settings.md#final) setting to all tables in a query using a session or a user profile. diff --git a/docs/en/sql-reference/statements/select/order-by.md b/docs/en/sql-reference/statements/select/order-by.md index 712395a0357..3dfbd133364 100644 --- a/docs/en/sql-reference/statements/select/order-by.md +++ b/docs/en/sql-reference/statements/select/order-by.md @@ -289,7 +289,7 @@ When `FROM const_expr` not defined sequence of filling use minimal `expr` field When `TO const_expr` not defined sequence of filling use maximum `expr` field value from `ORDER BY`. When `STEP const_numeric_expr` defined then `const_numeric_expr` interprets `as is` for numeric types, as `days` for Date type, as `seconds` for DateTime type. It also supports [INTERVAL](https://clickhouse.com/docs/en/sql-reference/data-types/special-data-types/interval/) data type representing time and date intervals. When `STEP const_numeric_expr` omitted then sequence of filling use `1.0` for numeric type, `1 day` for Date type and `1 second` for DateTime type. -`INTERPOLATE` can be applied to columns not participating in `ORDER BY WITH FILL`. Such columns are filled based on previous fields values by applying `expr`. If `expr` is not present will repeate previous value. Omitted list will result in including all allowed columns. +`INTERPOLATE` can be applied to columns not participating in `ORDER BY WITH FILL`. Such columns are filled based on previous fields values by applying `expr`. If `expr` is not present will repeat previous value. Omitted list will result in including all allowed columns. Example of a query without `WITH FILL`: diff --git a/docs/en/sql-reference/window-functions/index.md b/docs/en/sql-reference/window-functions/index.md index 7ee2102c14d..a8f494a5afc 100644 --- a/docs/en/sql-reference/window-functions/index.md +++ b/docs/en/sql-reference/window-functions/index.md @@ -21,7 +21,7 @@ ClickHouse supports the standard grammar for defining windows and window functio | `lag/lead(value, offset)` | Not supported. Workarounds: | | | 1) replace with `any(value) over (.... rows between preceding and preceding)`, or `following` for `lead` | | | 2) use `lagInFrame/leadInFrame`, which are analogous, but respect the window frame. To get behavior identical to `lag/lead`, use `rows between unbounded preceding and unbounded following` | -| ntile(buckets) | Supported. Specify window like, (partition by x order by y rows between unbounded preceding and unounded following). | +| ntile(buckets) | Supported. Specify window like, (partition by x order by y rows between unbounded preceding and unrounded following). | ## ClickHouse-specific Window Functions @@ -39,7 +39,7 @@ The computed value is the following for each row: The roadmap for the initial support of window functions is [in this issue](https://github.com/ClickHouse/ClickHouse/issues/18097). -All GitHub issues related to window funtions have the [comp-window-functions](https://github.com/ClickHouse/ClickHouse/labels/comp-window-functions) tag. +All GitHub issues related to window functions have the [comp-window-functions](https://github.com/ClickHouse/ClickHouse/labels/comp-window-functions) tag. ### Tests diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt index ded7a4643a9..583a49631a3 100644 --- a/utils/check-style/aspell-ignore/en/aspell-dict.txt +++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt @@ -1,376 +1,1159 @@ personal_ws-1.1 en 543 -AArch -ACLs -AMQP -ARMv -ASLR -ASan -Actian -AddressSanitizer -AppleClang -ArrowStream -AvroConfluent -BSON -BSONEachRow -Bool -BuilderBinAarch -BuilderBinAmd -CCTOOLS -CLion -CMake -CMakeLists -CPUs -CSVWithNames -CSVWithNamesAndTypes -CamelCase -CapnProto -CentOS -ClickHouse -ClickHouse's -ClickableSquare -CodeBlock -CodeLLDB -Compat -Config -ConnectionDetails -Contrib -Ctrl -CustomSeparated -CustomSeparatedWithNames -CustomSeparatedWithNamesAndTypes -DBMSs -DateTime -DateTimes -DockerHub -Doxygen -Encodings -Enum -Eoan -FixedString -FreeBSD -Fuzzer -Fuzzers -GTest -Gb -Gcc -GoogleTest -HDDs -Heredoc -Homebrew -Homebrew's -HorizontalDivide -Hostname -INSERTed -IPv -IntN -Integrations -JSONAsObject -JSONAsString -JSONColumns -JSONColumnsWithMetadata -JSONCompact -JSONCompactColumns -JSONCompactEachRow -JSONCompactEachRowWithNames -JSONCompactEachRowWithNamesAndTypes -JSONCompactStrings -JSONCompactStringsEachRow -JSONCompactStringsEachRowWithNames -JSONCompactStringsEachRowWithNamesAndTypes -JSONEachRow -JSONEachRowWithProgress -JSONObjectEachRow -JSONStrings -JSONStringsEachRow -JSONStringsEachRowWithProgress -JSONs -Jaeger -Jemalloc -Jepsen -KDevelop -LGPL -LLDB -LLVM's -LOCALTIME -LOCALTIMESTAMP -LibFuzzer -LineAsString -LinksDeployment -LowCardinality -MEMTABLE -MSan -MVCC -MacOS -Memcheck -MemorySanitizer -MergeTree -MessagePack -MiB -MsgPack -Multiline -Multithreading -MySQLDump -NEKUDOTAYIM -NULLIF -NVME -NYPD -NuRaft -OLAP -OLTP -ObjectId -Observability -Ok -OpenSSL -OpenSUSE -OpenStack -OpenTelemetry -PAAMAYIM -ParquetMetadata -Parsers -Postgres -Precompiled -PrettyCompact -PrettyCompactMonoBlock -PrettyCompactNoEscapes -PrettyCompactNoEscapesMonoBlock -PrettyJSONEachRow -PrettyMonoBlock -PrettyNoEscapes -PrettyNoEscapesMonoBlock -PrettySpace -PrettySpaceMonoBlock -PrettySpaceNoEscapes -PrettySpaceNoEscapesMonoBlock -Protobuf -ProtobufSingle -QEMU -QTCreator -QueryCacheHits -QueryCacheMisses -RBAC -RawBLOB -RedHat -ReplicatedMergeTree -RowBinary -RowBinaryWithNames -RowBinaryWithNamesAndTypes -Runtime -SATA -SELECTs -SERIALIZABLE -SIMD -SLES -SMALLINT -SQLInsert -SQLSTATE -SSSE -Schemas -SelfManaged -Stateful -Submodules -Subqueries -TSVRaw -TSan -TabItem -TabSeparated -TabSeparatedRaw -TabSeparatedRawWithNames -TabSeparatedRawWithNamesAndTypes -TabSeparatedWithNames -TabSeparatedWithNamesAndTypes -TargetSpecific -TemplateIgnoreSpaces -Testflows -Tgz -Toolset -Tradeoff -Transactional -TwoColumnList -UBSan -UInt -UIntN -UPDATEs -Uint -Updatable -Util -Valgrind -Vectorized -VideoContainer -ViewAllLink -VirtualBox -Werror -WithNamesAndTypes -Woboq -WriteBuffer -WriteBuffers -XCode -YAML -YYYY -Yasm -Zipkin -ZooKeeper -ZooKeeper's aarch +AArch +accurateCast +accurateCastOrDefault +accurateCastOrNull +ACLs +acos +acosh +Actian +ActionsMenu +activecube +activerecord +ActiveRecord +addDays +addHours +Additionaly +addMinutes +addMonths +addQuarters +addr +AddressSanitizer +addressToLine +addressToLineWithInlines +addressToSymbol +addSeconds +addWeeks +addYears +adhoc +adviced +aggregatefunction +Aggregatefunction +AggregateFunction +aggregatingmergetree +AggregatingMergeTree +aggregatio +AggregatorThreads +AggregatorThreadsActive +aggretate +aiochclient +Akka +alculates +AlertManager +Alexey allocator +alphaTokens +ALTERs +amplab +AMPLab +AMQP analytics +ANNIndex +ANNIndexes +anonymize anonymized ansi +AnyEvent +anyheavy +anyHeavy +anyIf +anylast +anyLast +AORM +APIs +appendTrailingCharIfAbsent +AppleClang +approximatly +argmax +argMax +argmin +argMin +arguments's +argumetn +arithmetics +ARMv +arrayAll +arrayAUC +arrayAvg +arrayCompact +arrayConcat +arrayCount +arrayCumSum +arrayCumSumNonNegative +arrayDifference +arrayDistinct +arrayElement +arrayEnumerate +arrayEnumerateDense +arrayEnumerateUniq +arrayExists +arrayFill +arrayFilter +arrayFirst +arrayFirstIndex +arrayFlatten +arrayIntersect +arrayJoin +ArrayJoin +arrayLast +arrayLastIndex +arrayMap +arrayMax +arrayMin +arrayPartialReverseSort +arrayPartialSort +arrayPopBack +arrayPopFront +arrayProduct +arrayPushBack +arrayPushFront +arrayReduce +arrayReduceInRanges +arrayResize +arrayReverse +arrayReverseFill +arrayReverseSort +arrayReverseSplit +arraySlice +arraySort +arraySplit +arrayStringConcat +arraySum +arrayUniq +arrayWithConstant +arrayZip +ArrowStream +ASan +ascii +asin +asinh +ASLR +ASOF +assumeNotNull +asterics async +asynch +AsynchronousHeavyMetricsCalculationTimeSpent +AsynchronousHeavyMetricsUpdateInterval +AsynchronousInsert +AsynchronousInsertThreads +AsynchronousInsertThreadsActive +AsynchronousMetricsCalculationTimeSpent +AsynchronousMetricsUpdateInterval +AsynchronousReadWait +AsyncInsertCacheSize +atan +atanh atomicity +auth +authenticator +Authenticator +authenticators +Authenticators +autocompletion +Autocompletion +autodetect +autodetected autogenerated autogeneration +AutoML autostart +avgweighted +avgWeighted avro +AvroConfluent avx aws backend +BackgroundBufferFlushSchedulePool +BackgroundBufferFlushSchedulePoolSize +BackgroundBufferFlushSchedulePoolTask +BackgroundCommonPoolSize +BackgroundCommonPoolTask +BackgroundDistributedSchedulePool +BackgroundDistributedSchedulePoolSize +BackgroundDistributedSchedulePoolTask +BackgroundFetchesPoolSize +BackgroundFetchesPoolTask +BackgroundMergesAndMutationsPoolSize +BackgroundMergesAndMutationsPoolTask +BackgroundMessageBrokerSchedulePoolSize +BackgroundMessageBrokerSchedulePoolTask +BackgroundMovePoolSize +BackgroundMovePoolTask +BackgroundProcessingPool +BackgroundSchedulePool +BackgroundSchedulePoolSize +BackgroundSchedulePoolTask backoff backticks +BackupsIO +BackupsIOThreads +BackupsIOThreadsActive +BackupsThreads +BackupsThreadsActive +balancer +basename +bcrypt benchmarking +BestEffort +BestEffortOrNull +BestEffortOrZero +BestEffortUS +BestEffortUSOrNull +BestEffortUSOrZero +bfloat +BIGINT +BIGSERIAL +binlog +bitAnd +bitCount +bitHammingDistance +bitmapAnd +bitmapAndCardinality +bitmapAndnot +bitmapAndnotCardinality +bitmapBuild +bitmapCardinality +bitmapContains +bitmapHasAll +bitmapHasAny +bitmapMax +bitmapMin +bitmapOr +bitmapOrCardinality +bitmapSubsetInRange +bitmapSubsetLimit +bitmapToArray +bitmapTransform +bitmapXor +bitmapXorCardinality +bitmask +bitmaskToArray +bitmaskToList +bitNot +bitOr +bitov +bitPositionsToArray +bitRotateLeft +bitRotateRight +bitShiftLeft +bitShiftRight +bitSlice +bitTest +bitTestAll +bitTestAny +bitXor blake +Blazingly +BlockActiveTime +BlockDiscardBytes +BlockDiscardMerges +BlockDiscardOps +BlockDiscardTime +BlockInFlightOps +blockinfo +blockNumber +BlockQueueTime +BlockReadBytes +blockreader +BlockReadMerges +BlockReadOps +BlockReadTime +blockSerializedSize +blocksize blockSize +BlockWriteBytes +BlockWriteMerges +BlockWriteOps +BlockWriteTime +bool +Bool boolean bools boringssl +BORO +bozerkins +broadcasted +BrokenDistributedFilesToInsert brotli bson +BSON bsoneachrow +BSONEachRow +buffersize buildable +BuilderBinAarch +BuilderBinAmd +buildId +BuildID +builtins +bytebase +Bytebase +byteSize +bytesToCutForIPv +CacheDetachedFileSegments +CacheDictionaries +CacheDictionary +CacheDictionaryThreads +CacheDictionaryThreadsActive +CacheDictionaryUpdateQueueBatches +CacheDictionaryUpdateQueueKeys +CacheFileSegments +cacheSessions +cachesize +caConfig camelCase +CamelCase +CapContains capn +Cap'n capnproto +CapnProto +CapUnion +cardinalities cardinality +cartesian cassandra +casted +catboost +CatBoost +catboostEvaluate +categoricalinformationvalue +categoricalInformationValue +cathetus cbindgen +cbrt ccache +CCTOOLS cctz +CDATA +CDFs +CDMA +ceil +CellAreaM +CellAreaRads +CellsIntersect +CentOS +centroid +certificateFile +CertificateHandler +CESU +cetera cfg +chadmin +Chadmin changelog changelogs +ChannelID charset charsets +chconn checkouting checksummed checksumming checksums +childern +chproxy +chunksize +cickhouse +Cidr +CIDR +CIDRToRange +cipherList +ciphertext +Ciphertext cityhash +cityHash +CityHash +CLang cli +ClickableSquare +clickcat +ClickCat clickhouse +ClickHouse +ClickHouseClient +clickhousedb +ClickHouseMigrator +ClickHouseNIO +ClickHouse's +ClickHouseVapor +clickhousex +clickmate clickstream +clickvisual +ClickVisual +CLion +CLOB +clockhour +cLoki +CloudDetails +clusterAllReplicas cmake +CMake +CMakeLists +CMPLNT codebase +CodeBlock codec +codecs +Codecs +CODECS +CodeLLDB +codepoint +codepoints +collapsingmergetree +CollapsingMergeTree +combinator +combinators +Combinators +commnents comparising +Compat +compatitalbe +CompiledExpressionCacheBytes +CompiledExpressionCacheCount +ComplexKeyCache +ComplexKeyDirect +ComplexKeyHashed +compressability +concat +concatAssumeInjective +concatWithSeparator +concatWithSeparatorAssumeInjective +concurenly +cond +conf config +Config configs +congruential +conjuction +conjuctive +ConnectionDetails +const +Const +ContextLockWait contrib +Contrib +convergance +convertCharset coroutines +cosineDistance +countDigits +countEqual +countMatches +countSubstrings +covariates +covarpop +covarPop +covarsamp +covarSamp +covid +Covid +COVID cpp cppkafka cpu +CPUFrequencyMHz +CPUs +Cramer's +cramersv +cramersV +cramersvbiascorrected +cramersVBiasCorrected +criteo +Criteo crlf croaring cronjob +Crotty +Crowdsourced +cryptocurrencies +cryptocurrency +cryptographic csv +CSVs csvwithnames +CSVWithNames csvwithnamesandtypes +CSVWithNamesAndTypes +CTEs +Ctrl +currentDatabase +CurrentMetrics +currentProfiles +currentRoles +currentUser +customizable +customizations customseparated +CustomSeparated customseparatedwithnames +CustomSeparatedWithNames customseparatedwithnamesandtypes +CustomSeparatedWithNamesAndTypes +cutFragment +cutIPv +cutQueryString +cutQueryStringAndFragment +cutToFirstSignificantSubdomain +cutToFirstSignificantSubdomainCustom +cutToFirstSignificantSubdomainCustomWithWWW +cutToFirstSignificantSubdomainWithWWW +cutURLParameter +cutWWW cyrus +DatabaseCatalog +DatabaseCatalogThreads +DatabaseCatalogThreadsActive +DatabaseOnDisk +DatabaseOnDiskThreads +DatabaseOnDiskThreadsActive +DatabaseOrdinaryThreads +DatabaseOrdinaryThreadsActive datacenter +datacenters datafiles +datagrip +DataGrip +datalens +DataLens +datanode dataset datasets +datasource +DataTime +datatypes +DataTypes +dateName datetime +dateTime +DateTime datetimes +DateTimes +dateTimeToSnowflake +dayofyear +dbal +DBAs +DbCL +dbeaver +DBeaver +dbgen dbms +DBMSs ddl +DDLWorker +DDLWORKER +DDLWorkerThreads +DDLWorkerThreadsActive deallocation +deallocations debian +decodeURLComponent +decodeURLFormComponent +decodeXMLComponent decompressor +decrypt +DECRYPT +decrypted +Decrypted +decrypts +deduplicate +Deduplicate +deduplicated +deduplicating +deduplication +Deduplication +defaultProfiles +defaultRoles +defaultValueOfArgumentType +defaultValueOfTypeName +DelayedInserts +DeliveryTag +deltalake +deltaLake +DeltaLake +deltasum +deltaSum +deltasumtimestamp +deltaSumTimestamp +demangle +denormalize +Denormalize +denormalized denormalizing denormals +DESC deserialization deserialized +deserializing +DestroyAggregatesThreads +DestroyAggregatesThreadsActive destructor destructors +detectCharset +detectLanguage +detectLanguageMixed +detectLanguageUnknown +determinator +deterministically +DictCacheRequests +dictGet +dictGetChildren +dictGetDescendant +dictGetHierarchy +dictGetOrDefault +dictGetOrNull +dictGetUUID +dictHas +dictIsIn +disableProtocols +disjunction +disjunctions +DiskAvailable +DiskObjectStorage +DiskObjectStorageAsyncThreads +DiskObjectStorageAsyncThreadsActive +DiskSpaceReservedForMerge +DiskTotal +DiskUnreserved +DiskUsed +displaySecretsInShowAndSelect +DistributedFilesToInsert +DistributedSend +distro +divideDecimal dmesg +DockerHub +DOGEFI +domainWithoutWWW dont +dotProduct +DoubleDelta +Doxygen +dplyr dragonbox +dropoff +dumpColumnStructure durations +ECMA +ecto +Ecto +EdgeAngle +EdgeLengthKm +EdgeLengthM +EmbeddedRocksDB +embeddings +Embeddings +emptyArray +emptyArrayDate +emptyArrayDateTime +emptyArrayFloat +emptyArrayInt +emptyArrayString +emptyArrayToSingle +emptyArrayUInt +enabledProfiles +enabledRoles +encodeURLComponent +encodeURLFormComponent +encodeXMLComponent encodings +Encodings +encryptions endian +endsWith +Engeneering enum +Enum +enum's +enums +Enums +Eoan +EphemeralNode +erfc +errorCodeToName +Ethereum +evalMLMethod +everytime +ExactEdgeLengthKm +ExactEdgeLengthM +ExactEdgeLengthRads +ExecutablePool exFAT +expiryMsec +exponentialmovingaverage +exponentialMovingAverage +expr +exprN +extendedVerification +extention +ExternalDistributed +extractAll +extractAllGroups +extractAllGroupsHorizontal +extractAllGroupsVertical +extractKeyValuePairs +extractKeyValuePairsWithEscaping +extractTextFromHTML +extractURLParameter +extractURLParameterNames +extractURLParameters +ExtType +failover +Failover +farmFingerprint +farmHash +FarmHash fastops fcoverage +FFFD filesystem +filesystemAvailable +FilesystemCacheBytes +FilesystemCacheElements +FilesystemCacheFiles +FilesystemCacheReadBuffers +FilesystemCacheSize +filesystemCapacity +filesystemFree +FilesystemLogsPathAvailableBytes +FilesystemLogsPathAvailableINodes +FilesystemLogsPathTotalBytes +FilesystemLogsPathTotalINodes +FilesystemLogsPathUsedBytes +FilesystemLogsPathUsedINodes +FilesystemMainPathAvailableBytes +FilesystemMainPathAvailableINodes +FilesystemMainPathTotalBytes +FilesystemMainPathTotalINodes +FilesystemMainPathUsedBytes +FilesystemMainPathUsedINodes filesystems +finalizeAggregation +fips +FIPS +firstSignificantSubdomain +firstSignificantSubdomainCustom +fixedstring +FixedString +flamegraph flatbuffers +flink +Flink +fluentd fmtlib +ForEach +formatDateTime +formatDateTimeInJoda +formatDateTimeInJodaSyntax +formated +formatReadableDecimalSize +formatReadableQuantity +formatReadableSize +formatReadableTimeDelta +formatRow +formatRowNoNewline formatschema formatter +FOSDEM +FQDN +FreeBSD +freezed +fromModifiedJulianDay +fromModifiedJulianDayOrNull +fromUnixTimestamp +fromUnixTimestampInJodaSyntax fsync +func +funtion +fuzzBits fuzzer +Fuzzer fuzzers -gRPC +Fuzzers +Gb +Gbit +Gcc +gccMurmurHash gcem +generateRandom +GenerateRandom +generateULID +generateUUIDv +geobase +geobases +Geobases +geocode +GeoCoord +geoDistance +geohash +Geohash +geohashDecode +geohashEncode +geohashesInBox +Geoid +geoip +geospatial +geoToH +geoToS +GetBaseCell +getblockinfo +GetDestinationIndexFromUnidirectionalEdge +getevents +GetFaces +GetIndexesFromUnidirectionalEdge +getMacro +GetNeighbors +GetOriginIndexFromUnidirectionalEdge +getOSKernelVersion +GetPentagonIndexes +GetRes +GetResolution +getServerPort +getSetting +getSizeOfEnumType +GetUnidirectionalEdge +GetUnidirectionalEdgeBoundary +GetUnidirectionalEdgesFromHexagon github +GitLab glibc +globalIn +globalNotIn +GlobalThread +GlobalThreadActive +glushkovds +GoLand +golang googletest +GoogleTest +grafana +Grafana +graphitemergetree +GraphiteMergeTree +graphouse +graphql +GraphQL +greatCircleAngle +greatCircleDistance +greaterOrEquals +greenspace +Greenwald +grouparray +groupArray +grouparrayinsertat +groupArrayInsertAt +grouparraylast +groupArrayLast +grouparraymovingavg +groupArrayMovingAvg +grouparraymovingsum +groupArrayMovingSum +grouparraysample +groupArraySample +groupbitand +groupBitAnd +groupbitmap +groupBitmap +groupbitmapand +groupBitmapAnd +groupbitmapor +groupBitmapOr +groupbitmapxor +groupBitmapXor +groupbitor +groupBitOr +groupbitxor +groupBitXor +groupuniqarray +groupUniqArray grpc +gRPC grpcio gtest +GTest +gtid +GTID +gzip +gzipped +hadoop +halfday +halfMD hardlinks +hasAll +hasAny +hasColumnInTable +HashedDictionary +HashedDictionaryThreads +HashedDictionaryThreadsActive +hashtables +hasSubstr +hasToken +hasTokenCaseInsensitive +hasTokenCaseInsensitiveOrNull +hasTokenOrNull +haversine +Haversine +have't +hdbc +HDDs hdfs +hdfsCluster heredoc +Heredoc heredocs +HexAreaKm +HexAreaM +HexRing +HHMM +Hight +hiveHash +HMAC +holistics +Holistics homebrew +Homebrew +Homebrew's +hopEnd +hopStart +horgh +HorizontalDivide +hostname +hostName +Hostname +hostnames +houseops +HouseOps +hsts +HSTS +html http +HTTPConnection https +HTTPThreads +hudi +Hudi +HyperLogLog hyperscan +hypot +Hypot +hyvor +IANA +icosahedron icudata +idempotency +identifiant +Identifiant +ifNotFinite +ifNull +iframe +ilike +IMDS +implicitely +incrementing +Incrementing +incremnt +IndexesAreNeighbors +indexHint +indexOf +infi +INFILE +InfluxDB +initializeAggregation +initialQueryID +injective +innogames +inodes +INSERTed +INSERTs +Instana instantiation +intDiv +intDivOrZero integrational integrations +Integrations +IntelliJ interserver +InterserverConnection +InterserverThreads +intervalLengthSum +intExp +intHash +IntN +introspections +invalidCertificateHandler invariants +invertedindexes +IOPrefetchThreads +IOPrefetchThreadsActive +IOThreads +IOThreadsActive +IOUringInFlightEvents +IOUringPendingEvents +IOWriterThreads +IOWriterThreadsActive +IPTrie +IPv +isConstant +isDecimalOverflow +isFinite +isInfinite +isIPAddressInRange +isIPv +isNaN +isNotNull +isNull +IsPentagon +IsResClassIII +IsValid +isValidJSON +isValidUTF +iteratively +Jaeger +Jannis +javaHash +JavaHash +javaHashUTF +jbod +JBOD jdbc jemalloc +Jemalloc +Jepsen +JetBrains +Jitter +Joda +JOINed +joinGet +JOINs json +JSONArrayLength +JSONAsObject jsonasstring +JSONAsString jsoncolumns +JSONColumns jsoncolumnsmonoblock +JSONColumnsWithMetadata jsoncompact +JSONCompact jsoncompactcolumns +JSONCompactColumns jsoncompacteachrow +JSONCompactEachRow jsoncompacteachrowwithnames +JSONCompactEachRowWithNames jsoncompacteachrowwithnamesandtypes +JSONCompactEachRowWithNamesAndTypes jsoncompactstrings +JSONCompactStrings jsoncompactstringseachrow +JSONCompactStringsEachRow jsoncompactstringseachrowwithnames +JSONCompactStringsEachRowWithNames jsoncompactstringseachrowwithnamesandtypes +JSONCompactStringsEachRowWithNamesAndTypes jsoneachrow +JSONEachRow jsoneachrowwithprogress +JSONEachRowWithProgress +JSONExtract +JSONExtractArrayRaw +JSONExtractBool +JSONExtractFloat +JSONExtractInt +JSONExtractKeys +JSONExtractKeysAndValues +JSONExtractKeysAndValuesRaw +JSONExtractRaw +JSONExtractString +JSONExtractUInt +JSONHas +JSONLength jsonobjecteachrow +JSONObjectEachRow +JSONs jsonstrings +JSONStrings jsonstringseachrow +JSONStringsEachRow jsonstringseachrowwithprogress +JSONStringsEachRowWithProgress +JSONType +jumpConsistentHash +JumpConsistentHash +Jupyter kafka +KafkaAssignedPartitions +KafkaBackgroundReads kafkacat +KafkaConsumers +KafkaConsumersInUse +KafkaConsumersWithAssignment +KafkaLibrdkafkaThreads +kafkaMurmurHash +KafkaProducers +KafkaWrites +Kahan +KDevelop +KeeperAliveConnections +keepermap +KeeperMap +KeeperOutstandingRequets +kerberized +kerberos +Kerberos +kernal +keyspace +keytab +Khanna +kittenhouse +KittenHouse +Klickhouse +Kolmogorov +kolmogorovsmirnovtest +kolmogorovSmirnovTest +kolya konsole +kRing +Kubernetes +kurtosis +kurtpop +kurtPop +kurtsamp +kurtSamp laion +lang +laravel latencies +ldap +LDAP +learing +leftPad +leftPadUTF +lemmatization +lemmatize +lemmatized +lengthUTF +lessOrEquals lexicographically -libFuzzer +lgamma +LGPL libc +libcatboost libcpuid libcxx libcxxabi libdivide libfarmhash libfuzzer +libFuzzer +LibFuzzer libgsasl libhdfs libmetrohash @@ -381,193 +1164,1405 @@ libs libunwind libuv libvirt +LightHouse linearizability linearizable +linearized lineasstring +LineAsString linefeeds lineorder +Linf +LinfDistance +LinfNorm +LinfNormalize +LinksDeployment +Linq linux +LLDB llvm +LLVM's +LoadAverage +loadDefaultCAFile localhost +localread +LocalThread +LocalThreadActive +LOCALTIME +LOCALTIMESTAMP +logagent +loghouse +LogQL +Logstash +logTrace +london +LONGLONG +LookML +lowcardinality +LowCardinality +lowercased +lowerUTF +LpDistance +LpNorm +LpNormalize +Luebbe +Lyft +lzma +MacBook +MACNumToString macOS +MacOS +MACStringToNum +MACStringToOUI +mailrugo +mailto +makeDate +makeDateTime +mannwhitneyutest +mannWhitneyUTest +mapAdd +mapAll +mapApply +mapConcat +mapContains +mapContainsKeyLike +mapExists +mapExtractKeyLike +mapFilter +mapFromArrays +mapKeys +mappedfile +mapPopulateSeries +mapReverseSort +mapSort +mapSubtract +mapUpdate +mapValues mariadb +MarkCacheBytes +MarkCacheFiles +MarksLoaderThreads +MarksLoaderThreadsActive +matcher +MaterializedMySQL +MaterializedPostgreSQL +materializedview +MaterializedView +MaxDDLEntryID +maxintersections +maxIntersections +maxintersectionsposition +maxIntersectionsPosition +maxmap +maxMap +maxmind +MaxMind +MaxPartCountForPartition +MaxPushedDDLEntryID +Mbps mdadm +meanztest +meanZTest +mebibytes +MEDIUMINT +Memcheck +MemoryCode +MemoryDataAndStack +MemoryResident +MemorySanitizer +MemoryShared +MemoryTracking +MemoryVirtual +MEMTABLE +mergeable +MergeJoin +MergeState +mergetree +MergeTree +MergeTreeAllRangesAnnouncementsSent +MergeTreeBackgroundExecutor +MergeTreeBackgroundExecutorThreads +MergeTreeBackgroundExecutorThreadsActive +MergeTreeDataSelectExecutor +MergeTreeDataSelectExecutorThreads +MergeTreeDataSelectExecutorThreadsActive +MergeTreePartsCleanerThreads +MergeTreePartsCleanerThreadsActive +MergeTreePartsLoaderThreads +MergeTreePartsLoaderThreadsActive +MergeTreeReadTaskRequestsSent +MergeTreeSettings +messageID +MessagePack +metacharacters +Metastore +metasymbols +metrica +metroHash +MetroHash +mfedotov +MiB +Milli +Milovidov +mindsdb +MindsDB +MinHash +minimalistic +mininum +MinIO miniselect +minmap +minMap +minmax +MinMax +mins +misconfiguration +mispredictions +mmap +MMapCacheCells +mmapped +MMappedAllocBytes +MMappedAllocs +MMappedFileBytes +MMappedFiles +moduloOrZero +mongodb +Mongodb +monthName +moscow +MSan msgpack +MsgPack msgpk +MSSQL +multibyte +multiFuzzyMatchAllIndices +multiFuzzyMatchAny +multiFuzzyMatchAnyIndex +multiIf multiline +Multiline +multiMatchAllIndices +multiMatchAny +multiMatchAnyIndex +multiplyDecimal +multipolygon +MultiPolygon +Multiqueries +multiSearchAllPositions +multiSearchAllPositionsUTF +multiSearchAny +multiSearchFirstIndex +multiSearchFirstPosition +multisets multithread +Multithreading +multiword +Multiword +munmap murmurhash +murmurHash +MurmurHash +musqldump mutex +MVCC +mydb +myfilter mysql +MySQLConnection mysqldump +MySQLDump mysqljs +MySQLThreads +mytable +Nagios +namedatabases +namenetworks +namenode +Namenode +namepassword +nameprofile +namequota +NamesAndTypesList +namespaces +Nano +NaNs natively +nats +NATS +NCHAR +negtive +NEKUDOTAYIM +Nesterov +nestjs +netloc +NetworkReceive +NetworkReceiveBytes +NetworkReceiveDrop +NetworkReceiveErrors +NetworkReceivePackets +NetworkSend +NetworkSendBytes +NetworkSendDrop +NetworkSendErrors +NetworkSendPackets +NEWDATE +NEWDECIMAL +NFKC +NFKD +ngram +ngrambf +ngramDistance +ngramMinHash +ngramMinHashArg +ngramMinHashArgCaseInsensitive +ngramMinHashArgCaseInsensitiveUTF +ngramMinHashArgUTF +ngramMinHashCaseInsensitive +ngramMinHashCaseInsensitiveUTF +ngramMinHashUTF +ngrams +ngramSearch +ngramSimHash +ngramSimHashCaseInsensitive +ngramSimHashCaseInsensitiveUTF +ngramSimHashUTF +NodeJs +nonNegativeDerivative noop +normalizedQueryHash +normalizeQuery +normalizeUTF +notEmpty +notEquals +notILike +notIn +notLike +notretry +nowInBlock +ntile nullability nullable +nullables +nullIf +NULLIF num +NumberOfDatabases +NumberOfDetachedByUserParts +NumberOfDetachedParts +NumberOfTables +numerics +NumHexagons +NumToString +NumToStringClassC +NuRaft +NVMe +NVME +nypd +NYPD obfuscator +ObjectId +observability +Observability +Octonica odbc +OFNS ok -openSUSE +Ok +OLAP +OLTP +omclickhouse +onstraints +ontime +OnTime +OpenCelliD +OpenFileForRead +OpenFileForWrite openldap +opensky +OpenSky +openssl +openSSL +OpenSSL +OpenStack +openSUSE +OpenSUSE opentelemetry +OpenTelemetry +optinal +Optinal +OrDefault +OrNull +OrZero +OSContextSwitches +OSGuestNiceTime +OSGuestNiceTimeCPU +OSGuestNiceTimeNormalized +OSGuestTime +OSGuestTimeCPU +OSGuestTimeNormalized +OSIdleTime +OSIdleTimeCPU +OSIdleTimeNormalized +OSInterrupts +OSIOWaitTime +OSIOWaitTimeCPU +OSIOWaitTimeNormalized +OSIrqTime +OSIrqTimeCPU +OSIrqTimeNormalized +OSMemoryAvailable +OSMemoryBuffers +OSMemoryCached +OSMemoryFreePlusCached +OSMemoryFreeWithoutCached +OSMemoryTotal +OSNiceTime +OSNiceTimeCPU +OSNiceTimeNormalized +OSOpenFiles +OSProcessesBlocked +OSProcessesCreated +OSProcessesRunning +OSSoftIrqTime +OSSoftIrqTimeCPU +OSSoftIrqTimeNormalized +OSStealTime +OSStealTimeCPU +OSStealTimeNormalized +OSSystemTime +OSSystemTimeCPU +OSSystemTimeNormalized +OSThreadsRunnable +OSThreadsTotal +OSUptime +OSUserTime +OSUserTimeCPU +OSUserTimeNormalized +OTLP +outfile +OUTFILE overcommit +overcommitted +OvercommitTracker +overfitting +PAAMAYIM +packetpool +packetsize +PagerDuty +pageviews +pandahouse +ParallelFormattingOutputFormatThreads +ParallelFormattingOutputFormatThreadsActive parallelization parallelize parallelized +ParallelParsingInputFormat +ParallelParsingInputFormatThreads +ParallelParsingInputFormatThreadsActive +Parametrized +params +paratemer +ParquetMetadata +parsable +parseable +parseDateTime +parseDateTimeBestEffort +parseDateTimeBestEffortOrNull +parseDateTimeBestEffortOrZero +parseDateTimeBestEffortUS +parseDateTimeBestEffortUSOrNull +parseDateTimeBestEffortUSOrZero +parseDateTimeInJodaSyntax +parseDateTimeInJodaSyntaxOrNull +parseDateTimeInJodaSyntaxOrZero +parseDateTimeOrNull +parseDateTimeOrZero parsers +Parsers +parseTimeDelta +Partitioner +PartMutation +PartsActive +PartsCommitted +PartsCompact +PartsDeleteOnDestroy +PartsDeleting +PartsInMemory +PartsOutdated +PartsPreActive +PartsPreCommitted +PartsTemporary +PartsWide +pathFull pclmulqdq +pcre +PCRE +PendingAsyncInsert +Percona performant +perl +persistency +phpclickhouse +PhpStorm +pipelining +plaintext +plantuml +PlantUML poco +PointDistKm +PointDistM +PointDistRads +pointInEllipses +pointInPolygon +polygonAreaCartesian +polygonAreaSpherical +polygonConvexHullCartesian +polygonPerimeterCartesian +polygonPerimeterSpherical +polygonsDistanceCartesian +polygonsDistanceSpherical +polygonsEqualsCartesian +polygonsIntersectionCartesian +polygonsIntersectionSpherical +polygonsSymDifferenceCartesian +polygonsSymDifferenceSpherical +polygonsUnionCartesian +polygonsUnionSpherical +polygonsWithinCartesian +polygonsWithinSpherical popcnt +porthttps +positionCaseInsensitive +positionCaseInsensitiveUTF +positionUTF +positiveModulo postfix postfixes +Postgres postgresql +PostgreSQLConnection +PostgreSQLThreads +PostgresSQL pre +pread +preallocate prebuild prebuilt +Precompiled preemptable +preferServerCiphers +prefertch +prefetch +prefetchsize preloaded +prepend +prepended +prepends +preprocess +Preprocess preprocessed +preprocessing preprocessor presentational prestable prettycompact +PrettyCompact prettycompactmonoblock +PrettyCompactMonoBlock prettycompactnoescapes +PrettyCompactNoEscapes prettycompactnoescapesmonoblock +PrettyCompactNoEscapesMonoBlock prettyjsoneachrow +PrettyJSONEachRow prettymonoblock +PrettyMonoBlock prettynoescapes +PrettyNoEscapes prettynoescapesmonoblock +PrettyNoEscapesMonoBlock prettyspace +PrettySpace prettyspacemonoblock +PrettySpaceMonoBlock prettyspacenoescapes +PrettySpaceNoEscapes prettyspacenoescapesmonoblock +PrettySpaceNoEscapesMonoBlock +prewhere +Prewhere +PREWHERE +privateKeyFile +privateKeyPassphraseHandler +PrivateKeyPassphraseHandler prlimit +PROCESSLIST +procfs +ProfileEvents +profiler +Profiler +profuct +Proleptic prometheus +PromHouse +Promql +PromQL +Promtail proto protobuf +Protobuf protobufsingle +ProtobufSingle +proxied +ProxySQL +pseudorandom +pseudorandomize psql ptrs +publsh +pushdown +pwrite py +PyCharm +QEMU +qouta +qryn +QTCreator +quantile +Quantile +quantilebfloat +quantileBFloat +quantiledeterministic +quantileDeterministic +quantileexact +quantileExact +quantileExactExclusive +quantileExactHigh +quantileExactInclusive +quantileExactLow +quantileexactweighted +quantileExactWeighted +quantileGK +quantileInterpolatedWeighted +quantiles +quantilesExactExclusive +quantilesExactInclusive +quantilesGK +quantilesTimingWeighted +quantiletdigest +quantileTDigest +quantiletdigestweighted +quantileTDigestWeighted +quantiletiming +quantileTiming +quantiletimingweighted +quantileTimingWeighted +quartile +quaters +QueryCacheHits +QueryCacheMisses +queryID +QueryPreempted +queryString +queryStringAndFragment +QueryThread +QuoteMeta +rabbitmq +RabbitMQ +raduis +randBernoulli +randBinomial +randCanonical +randChiSquared +randConstant +randExponential +randFisherF +randLogNormal +randNegativeBinomial +randNormal +randomFixedString +randomPrintableASCII +randomString +randomStringUTF +randPoisson +randStudentT +randUniform +RangeHashed +rankCorr rapidjson rawblob +RawBLOB +RBAC +RClickHouse readahead readline readme readonly +ReadonlyReplica +ReadTaskRequestsSent +readWKTMultiPolygon +readWKTPolygon +reate +rebalance rebalanced +recency +RecipeNLG +recompress +recompressed +recompressing +Recompressing +recompression +Recompression +reconnection +RectAdd +RectContains +RectIntersection +RectUnion +recurse +redash +Redash +reddit +Reddit +RedHat +redisstreams +ReDoS +Refactorings +refcounter +ReferenceKeyed +regexpExtract +regexpQuoteMeta +RegexpTree +regionHierarchy +regionIn +regionToArea +regionToCity +regionToContinent +regionToCountry +regionToDistrict +regionToName +regionToPopulation +regionToTopContinent +reinitialization +reinitializing +reinterpretAs +reinterpretAsDate +reinterpretAsDateTime +reinterpretAsFixedString +reinterpretAsFloat +reinterpretAsInt +reinterpretAsString +reinterpretAsUInt +reinterpretAsUUID +RemoteRead +remoteSecure +replaceAll +replaceOne +replaceRegexpAll +replaceRegexpOne +replacingmergetree +ReplacingMergeTree +ReplicasMaxAbsoluteDelay +ReplicasMaxInsertsInQueue +ReplicasMaxMergesInQueue +ReplicasMaxQueueSize +ReplicasMaxRelativeDelay +ReplicasSumInsertsInQueue +ReplicasSumMergesInQueue +ReplicasSumQueueSize +replicatable +ReplicatedAggregatingMergeTree +ReplicatedChecks +ReplicatedCollapsingMergeTree +ReplicatedFetch +ReplicatedGraphiteMergeTree +replicatedmergetree +ReplicatedMergeTree +ReplicatedReplacingMergeTree +ReplicatedSend +ReplicatedSummingMergeTree +ReplicatedVersionedCollapsingMergeTree replxx repo representable requestor +requireTLSv +Resample +resharding +reshards +RestartReplicaThreads +RestartReplicaThreadsActive +RestoreThreads +RestoreThreadsActive resultset +retentions rethrow +retransmit retriable +retuned +Returnes +reult +reverseDNSQuery +reverseUTF +RHEL +rightPad +rightPadUTF risc riscv ro +roadmap +RoaringBitmap rocksdb -rowNumberInBlock +RocksDB +rollup +Rollup +ROLLUP +roundAge +roundBankers +roundDown +roundDuration +roundToExp +routineley rowbinary +RowBinary rowbinarywithnames +RowBinaryWithNames rowbinarywithnamesandtypes +RowBinaryWithNamesAndTypes +rowNumberInAllBlocks +rowNumberInBlock rsync +rsyslog runnable runningAccumulate +runningConcurrency +runningDifference +runningDifferenceStartingWithFirstValue runtime +Runtime russian rw +RWLock +RWLockActiveReaders +RWLockActiveWriters +RWLockWaitingReaders +RWLockWaitingWriters +SaaS +Sanjeev +Sankey sasl +SATA +satisfiable +scala +Scalable +Scatterplot +Schaefer schemas +Schemas +Schwartzian +searchin +SeasClick +seccessfully +seekable +seektable +SeekTable +SELECTs +SelfManaged +Sematext +SendExternalTables +SendScalars +separatelly +sequenceCount +sequenceMatch +sequenceNextNode +SERIALIZABLE +serverUUID +sessionCacheSize +sessionIdContext +sessionTimeout +seva +shardCount +sharded +sharding +shardNum +ShareAlike +shortcircuit +shoutout +SIGTERM +SIMD simdjson +Simhash +SimHash +simpleaggregatefunction +SimpleAggregateFunction +simplelinearregression +simpleLinearRegression +SimpleState +simpliest +simpod +singlepart +sinh +siphash +sipHash +SipHash +skewness +skewpop +skewPop +skewsamp +skewSamp skippingerrors +sleepEachRow +SLES +SLRU +SMALLINT +Smirnov's +Smirnov'test +snowflakeToDateTime +socketcache +soundex +Soundex +SpanKind +sparkbar sparsehash +Spearman's +speedscope +splitByChar +splitByNonAlpha +splitByRegexp +splitByString +splitByWhitespace +SPNEGO +SQEs sql +sqlalchemy +SQLAlchemy +SQLConsoleDetail sqlinsert +SQLInsert +sqlite +SQLSTATE +sqrt src +SSDCache +SSDComplexKeyCache +SSDs +SSLManager +SSRF +SSSE +stacktrace stacktraces +startsWith +StartTime +StartTLS +StartupSystemTables +StartupSystemTablesThreads +StartupSystemTablesThreadsActive statbox stateful +Stateful +stddev +stddevpop +stddevPop +stddevsamp +stddevSamp stderr stdin stdout +stochastically +stochasticlinearregression +stochasticLinearRegression +stochasticlogisticregression +stochasticLogisticRegression +StorageBufferBytes +StorageBufferRows +StorageDistributed +StorageDistributedThreads +StorageDistributedThreadsActive +StorageHive +StorageHiveThreads +StorageHiveThreadsActive +StorageODBC +storages +StorageS +storig +stringToH +StringToNum +StringToNumOrDefault +StringToNumOrNull +stripelog +Stripelog +StripeLog +Strohmeier strtod strtoll strtoull +struct structs +studentttest +studentTTest +subarray +subarrays +subBitmap +subcolumn +subcolumns +Subcolumns +subcribe subdirectories +subdirectory +subexpression +Subexpression subexpressions +subfolder +subinterval +subintervals +subkey +submatch submodule submodules +Submodules +subnet +subnetwork subpattern subpatterns subqueries +Subqueries subquery +subranges +subreddits subseconds +substracted substring +substrings +Substrings +substringUTF +subtitiles +subtractDays +subtractHours +subtractMinutes +subtractMonths +subtractQuarters +subtractSeconds +subtractWeeks +subtractYears subtree subtype sudo +sumcount +sumCount +sumkahan +sumKahan +summap +sumMap +sumMapFiltered +summingmergetree +SummingMergeTree +sumwithoverflow +sumWithOverflow +superaggregates +Superset +SuperSet +SupersetDocker +supremum symlink symlinks +synchronious syntaxes +syscall +syscalls +syslog +syslogd systemd +SystemReplicasThreads +SystemReplicasThreadsActive +TabItem +tabix +Tabix +TablesLoaderThreads +TablesLoaderThreadsActive +TablesToDropQueueSize +tablum +TABLUM tabseparated +TabSeparated tabseparatedraw +TabSeparatedRaw tabseparatedrawwithnames +TabSeparatedRawWithNames tabseparatedrawwithnamesandtypes +TabSeparatedRawWithNamesAndTypes tabseparatedwithnames +TabSeparatedWithNames tabseparatedwithnamesandtypes +TabSeparatedWithNamesAndTypes +TargetSpecific tcp +TCPConnection +tcpnodelay +tcpPort +TCPThreads +Telegraf templateignorespaces +TemplateIgnoreSpaces +TemporaryFilesForAggregation +TemporaryFilesForJoin +TemporaryFilesForSort +TemporaryFilesUnknown +Testflows +tgamma tgz +Tgz th +thats +Theil's +theilsu +theilsU +themself +threadpool +ThreadPoolFSReaderThreads +ThreadPoolFSReaderThreadsActive +ThreadPoolRemoteFSReaderThreads +ThreadPoolRemoteFSReaderThreadsActive +ThreadsActive +ThreadsInOvercommitTracker +throwIf +timeSlot +timeSlots +Timeunit +timeZone +timeZoneOf +timeZoneOffset +timezones +TINYINT +tinylog +TinyLog +Tkachenko +TKSV +TLSv tmp +ToCenterChild +ToChildren +toColumnTypeName +toDate +toDateOrDefault +toDateOrNull +toDateOrZero +toDateTime +toDateTimeOrDefault +toDateTimeOrNull +toDateTimeOrZero +toDayOfMonth +toDayOfWeek +toDayOfYear +toDecimal +toDecimalString +toFixedString +toFloat +ToGeo +ToGeoBoundary +toHour +toInt +toInterval +toIPv +ToIPv +toISOWeek +toISOYear +toJSONString +tokenbf tokenization +tokenized +tokenizer +toLastDayOfMonth +toLowCardinality +toMinute toml +toModifiedJulianDay +toModifiedJulianDayOrNull +toMonday +toMonth +toNullable toolchain toolset +Toolset +ToParent +topk +topK +TopK +topkweighted +topKWeighted +topLevelDomain +toQuarter +toRelativeDayNum +toRelativeHourNum +toRelativeMinuteNum +toRelativeMonthNum +toRelativeQuarterNum +toRelativeSecondNum +toRelativeWeekNum +toRelativeYearNum +toSecond +ToSnowflake +toStartOfDay +toStartOfFifteenMinutes +toStartOfFiveMinutes +toStartOfHour +toStartOfInterval +toStartOfISOYear +toStartOfMinute +toStartOfMonth +toStartOfQuarter +toStartOfSecond +toStartOfTenMinutes +toStartOfWeek +toStartOfYear +toString +ToString +toStringCutToZero +TotalBytesOfMergeTreeTables +TotalPartsOfMergeTreeTables +TotalRowsOfMergeTreeTables +TotalTemporaryFiles +toTime +toTimeZone +toType +toTypeName +toUInt +toUnixTimestamp +toUUID +toUUIDOrDefault +toUUIDOrNull +toUUIDOrZero +toValidUTF +toWeek +toYear +toYearWeek +toYYYYMM +toYYYYMMDD +toYYYYMMDDhhmmss +TPCH +Tradeoff transactional +Transactional transactionally +translateUTF +translocality +trie +trimBoth +trimLeft +trimRight +trunc +tryBase +tryDecrypt +TSan +TSDB tskv tsv +TSVRaw +TSVs +TThe tui +tumbleEnd +tumbleStart +tupleDivide +tupleDivideByNumber +tupleElement +tupleHammingDistance +tupleMinus +tupleMultiply +tupleMultiplyByNumber +tupleNegate +tuplePlus +tupleToNameValuePairs turbostat +TwoColumnList txt +typename +Uber +UBSan ubuntu +UDFs uint +Uint +UInt +UIntN +ulid +ULID +ULIDStringToDateTime +UMTS unary +unbin +uncomment +UncompressedCacheBytes +UncompressedCacheCells +underying +undrop +UNDROP +unencoded unencrypted +unescaped +unescaping +unhex +unicode +unidimensional +UnidirectionalEdgeIsValid +uniq +uniqcombined +uniqCombined +uniqexact +uniqExact +uniqhll +uniqHLL +uniqtheta +uniqTheta +uniqThetaIntersect +uniqThetaNot +uniqthetasketch +uniqThetaSketch +UniqThetaSketch +uniqThetaUnion +uniqUpTo +unix unixodbc +unixODBC unoptimized +unparsed +unrealiable +unreplicated +unresolvable +unrounded +untracked +untrusted +untuple +Updatable +UPDATEs +uploaders +Uppercased +upperUTF +uptime +Uptime +uptrace +Uptrace +uring +URIs url +urlCluster +URLHash +URLHierarchy +URLPathHierarchy +urls +URL's +UserID userspace userver +Util utils uuid +UUid +UUIDNumToString +UUIDs +UUIDStringToNum +Vadim +Valgrind +VARCHAR variadic varint +varpop +varPop +varsamp +varSamp vectorized +Vectorized vectorscan +verificationDepth +verificationMode +versionedcollapsingmergetree +VersionedCollapsingMergeTree +VersionInteger +vhost +VideoContainer +ViewAllLink +VIEWs +VirtualBox +virtualized +visibleWidth +visitParam +visitParamExtractBool +visitParamExtractFloat +visitParamExtractInt +visitParamExtractRaw +visitParamExtractString +visitParamExtractUInt +visitParamHas +WALs wchc wchs webpage webserver +weekyear +Welch's +welchttest +welchTTest +Werror +Wether wget +which's whitespace whitespaces +wikistat +WikiStat +windowFunnel +WindowView +WithNames +WithNamesAndTypes +Woboq +WordNet +wordshingleMinHash +wordShingleMinHash +wordShingleMinHashArg +wordShingleMinHashArgCaseInsensitive +wordShingleMinHashArgCaseInsensitiveUTF +wordShingleMinHashArgUTF +wordShingleMinHashCaseInsensitive +wordShingleMinHashCaseInsensitiveUTF +wordShingleMinHashUTF +wordShingleSimHash +wordShingleSimHashCaseInsensitive +wordShingleSimHashCaseInsensitiveUTF +wordShingleSimHashUTF +WriteBuffer +WriteBuffers wrt xcode +XCode +Xeon +xeus +XHTML +xkcd +xlarge xml +XORs +xxHash xz -zLib -zLinux +YAML +YAMLRegExpTree +yandex +Yandex +Yasm +youtube +YYYY +zabbix +Zabbix +Zipkin zkcopy zlib +zLib +zLinux +znode znodes +ZooKeeper +ZooKeeperRequest +ZooKeeper's +ZooKeepers +ZooKeeperSession +zookeeperSessionUptime +ZooKeeperWatch zstd diff --git a/utils/check-style/check-doc-aspell b/utils/check-style/check-doc-aspell index d39769aa930..952dbd5b507 100755 --- a/utils/check-style/check-doc-aspell +++ b/utils/check-style/check-doc-aspell @@ -1,5 +1,8 @@ #!/usr/bin/env bash +# force-enable double star globbing +shopt -s globstar + # Perform spell checking on the docs if [[ ${1:-} == "--help" ]] || [[ ${1:-} == "-h" ]]; then From c3a888a47bfd68c3737dee55fe1a0a2c92fa9341 Mon Sep 17 00:00:00 2001 From: Roman Vlasenko Date: Fri, 2 Jun 2023 15:41:43 +0300 Subject: [PATCH 0309/1072] Remove needless minus sign --- docs/ru/faq/integration/json-import.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/ru/faq/integration/json-import.md b/docs/ru/faq/integration/json-import.md index bc65b5a614a..a3c89aed429 100644 --- a/docs/ru/faq/integration/json-import.md +++ b/docs/ru/faq/integration/json-import.md @@ -19,7 +19,7 @@ $ echo '{"foo":"bar"}' | curl 'http://localhost:8123/?query=INSERT%20INTO%20test При помощи [интефейса CLI](../../interfaces/cli.md): ``` bash -$ echo '{"foo":"bar"}' | clickhouse-client ---query="INSERT INTO test FORMAT JSONEachRow" +$ echo '{"foo":"bar"}' | clickhouse-client --query="INSERT INTO test FORMAT JSONEachRow" ``` Чтобы не вставлять данные вручную, используйте одну из [готовых библиотек](../../interfaces/index.md). @@ -31,4 +31,4 @@ $ echo '{"foo":"bar"}' | clickhouse-client ---query="INSERT INTO test FORMAT JS :::note "Примечание" В HTTP-интерфейсе настройки передаются через параметры `GET` запроса, в `CLI` interface — как дополнительные аргументы командной строки, начинающиеся с `--`. - ::: \ No newline at end of file + ::: From 079008058adbefea63edc8afffa0dd20d694e5e5 Mon Sep 17 00:00:00 2001 From: DanRoscigno Date: Fri, 2 Jun 2023 08:49:26 -0400 Subject: [PATCH 0310/1072] move from server to user settings --- .../settings.md | 51 ++----------------- docs/en/operations/settings/settings.md | 42 +++++++++++++++ 2 files changed, 46 insertions(+), 47 deletions(-) diff --git a/docs/en/operations/server-configuration-parameters/settings.md b/docs/en/operations/server-configuration-parameters/settings.md index 82be1c10dcc..d07fb80d1da 100644 --- a/docs/en/operations/server-configuration-parameters/settings.md +++ b/docs/en/operations/server-configuration-parameters/settings.md @@ -275,9 +275,9 @@ Type: UInt64 Default: 1000 -## max_concurrent_insert_queries +## max_concurrent_queries -Limit on total number of concurrent insert queries. Zero means Unlimited. +Limit on total number of concurrently executed queries. Zero means Unlimited. Note that limits on insert and select queries, and on the maximum number of queries for users must also be considered. See also max_concurrent_insert_queries, max_concurrent_select_queries, max_concurrent_queries_for_all_users. Zero means unlimited. :::note These settings can be modified at runtime and will take effect immediately. Queries that are already running will remain unchanged. @@ -287,9 +287,9 @@ Type: UInt64 Default: 0 -## max_concurrent_queries +## max_concurrent_insert_queries -Limit on total number of concurrently executed queries. Zero means Unlimited. Note that limits on insert and select queries, and on the maximum number of queries for users must also be considered. See also max_concurrent_insert_queries, max_concurrent_select_queries, max_concurrent_queries_for_all_users. Zero means unlimited. +Limit on total number of concurrent insert queries. Zero means Unlimited. :::note These settings can be modified at runtime and will take effect immediately. Queries that are already running will remain unchanged. @@ -1277,49 +1277,6 @@ For more information, see the section [Creating replicated tables](../../engines ``` - -## max_concurrent_queries_for_user {#max-concurrent-queries-for-user} - -The maximum number of simultaneously processed queries related to MergeTree table per user. - -Possible values: - -- Positive integer. -- 0 — No limit. - -Default value: `0`. - -**Example** - -``` xml -5 -``` - -## max_concurrent_queries_for_all_users {#max-concurrent-queries-for-all-users} - -Throw exception if the value of this setting is less or equal than the current number of simultaneously processed queries. - -Example: `max_concurrent_queries_for_all_users` can be set to 99 for all users and database administrator can set it to 100 for itself to run queries for investigation even when the server is overloaded. - -Modifying the setting for one query or user does not affect other queries. - -Possible values: - -- Positive integer. -- 0 — No limit. - -Default value: `0`. - -**Example** - -``` xml -99 -``` - -**See Also** - -- [max_concurrent_queries](#max-concurrent-queries) - ## max_open_files {#max-open-files} The maximum number of open files. diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index b868299aeff..374afb6bed7 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -646,6 +646,48 @@ Used for the same purpose as `max_block_size`, but it sets the recommended block However, the block size cannot be more than `max_block_size` rows. By default: 1,000,000. It only works when reading from MergeTree engines. +## max_concurrent_queries_for_user {#max-concurrent-queries-for-user} + +The maximum number of simultaneously processed queries related to MergeTree table per user. + +Possible values: + +- Positive integer. +- 0 — No limit. + +Default value: `0`. + +**Example** + +``` xml +5 +``` + +## max_concurrent_queries_for_all_users {#max-concurrent-queries-for-all-users} + +Throw exception if the value of this setting is less or equal than the current number of simultaneously processed queries. + +Example: `max_concurrent_queries_for_all_users` can be set to 99 for all users and database administrator can set it to 100 for itself to run queries for investigation even when the server is overloaded. + +Modifying the setting for one query or user does not affect other queries. + +Possible values: + +- Positive integer. +- 0 — No limit. + +Default value: `0`. + +**Example** + +``` xml +99 +``` + +**See Also** + +- [max_concurrent_queries](/docs/en/operations/server-configuration-parameters/settings.md/#max_concurrent_queries) + ## merge_tree_min_rows_for_concurrent_read {#setting-merge-tree-min-rows-for-concurrent-read} If the number of rows to be read from a file of a [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) table exceeds `merge_tree_min_rows_for_concurrent_read` then ClickHouse tries to perform a concurrent reading from this file on several threads. From 5cb4363e58bf1553bd80a930779cda2a79ef34b0 Mon Sep 17 00:00:00 2001 From: kssenii Date: Fri, 2 Jun 2023 14:44:28 +0200 Subject: [PATCH 0311/1072] Remove assertion --- src/Interpreters/Cache/FileSegment.cpp | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/Interpreters/Cache/FileSegment.cpp b/src/Interpreters/Cache/FileSegment.cpp index fb0ba0eba14..f3a21749086 100644 --- a/src/Interpreters/Cache/FileSegment.cpp +++ b/src/Interpreters/Cache/FileSegment.cpp @@ -313,12 +313,7 @@ void FileSegment::write(const char * from, size_t size, size_t offset) if (!size) throw Exception(ErrorCodes::LOGICAL_ERROR, "Writing zero size is not allowed"); - auto file_segment_path = getPathInLocalCache(); - if (offset == range().left && fs::exists(file_segment_path)) - { - fs::remove(file_segment_path); - chassert(false); - } + const auto file_segment_path = getPathInLocalCache(); { auto lock = segment_guard.lock(); @@ -358,7 +353,7 @@ void FileSegment::write(const char * from, size_t size, size_t offset) "Cache writer was finalized (downloaded size: {}, state: {})", current_downloaded_size, stateToString(download_state)); - cache_writer = std::make_unique(getPathInLocalCache()); + cache_writer = std::make_unique(file_segment_path); } } @@ -385,6 +380,7 @@ void FileSegment::write(const char * from, size_t size, size_t offset) const auto file_size = fs::file_size(file_segment_path); chassert(downloaded_size <= file_size); chassert(reserved_size >= file_size); + chassert(file_size <= range().right + 1); if (downloaded_size != file_size) downloaded_size = file_size; } From 21e9877098c24fc90319fae2682a815da8fef389 Mon Sep 17 00:00:00 2001 From: kssenii Date: Fri, 2 Jun 2023 15:08:04 +0200 Subject: [PATCH 0312/1072] Add assertion --- src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp b/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp index a60f5dffa96..561a66a826f 100644 --- a/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp +++ b/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp @@ -932,18 +932,23 @@ bool CachedOnDiskReadBufferFromFile::nextImplStep() ProfileEvents::increment(ProfileEvents::CachedReadBufferReadFromCacheBytes, size); ProfileEvents::increment(ProfileEvents::CachedReadBufferReadFromCacheMicroseconds, elapsed); -#ifdef ABORT_ON_LOGICAL_ERROR const size_t new_file_offset = file_offset_of_buffer_end + size; - chassert(new_file_offset - 1 <= file_segment.range().right); const size_t file_segment_write_offset = file_segment.getCurrentWriteOffset(true); + if (new_file_offset > file_segment.range().right + 1) + { + auto file_segment_path = file_segment.getPathInLocalCache(); + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Read unexpected size. File size: {}, file path: {}, file segment info: {}", + fs::file_size(file_segment_path), file_segment_path, file_segment.getInfoForLog()); + } if (new_file_offset > file_segment_write_offset) { - LOG_TRACE( - log, "Read {} bytes, file offset: {}, segment: {}, segment write offset: {}", + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Read unexpected size. Read {} bytes, file offset: {}, segment: {}, segment write offset: {}", size, file_offset_of_buffer_end, file_segment.range().toString(), file_segment_write_offset); - chassert(false); } -#endif } else { From e148c60d5a308b2ea86128021beba0e8321fef9b Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Fri, 2 Jun 2023 16:18:32 +0300 Subject: [PATCH 0313/1072] Fixes for MergeTree with readonly disks (#50244) * fixes for MergeTree with readonly disks * Automatic style fix * Update test.py * Automatic style fix * Update test.py * Update test.py * Automatic style fix * Update test.py --------- Co-authored-by: robot-clickhouse Co-authored-by: alesapin --- src/Storages/MergeTree/MergeTreeData.cpp | 6 +++ src/Storages/MergeTree/MergeTreeData.h | 2 +- src/Storages/StorageMergeTree.cpp | 5 +- .../test_disk_over_web_server/test.py | 50 ++++++++++++++++--- ...02435_rollback_cancelled_queries.reference | 1 - .../02435_rollback_cancelled_queries.sh | 11 ++-- 6 files changed, 61 insertions(+), 14 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 338a221e45e..32665429051 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -4854,6 +4854,9 @@ void MergeTreeData::checkAlterPartitionIsPossible( void MergeTreeData::checkPartitionCanBeDropped(const ASTPtr & partition, ContextPtr local_context) { + if (!supportsReplication() && isStaticStorage()) + return; + DataPartsVector parts_to_remove; const auto * partition_ast = partition->as(); if (partition_ast && partition_ast->all) @@ -4874,6 +4877,9 @@ void MergeTreeData::checkPartitionCanBeDropped(const ASTPtr & partition, Context void MergeTreeData::checkPartCanBeDropped(const String & part_name) { + if (!supportsReplication() && isStaticStorage()) + return; + auto part = getPartIfExists(part_name, {MergeTreeDataPartState::Active}); if (!part) throw Exception(ErrorCodes::NO_SUCH_DATA_PART, "No part {} in committed state", part_name); diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index fce7d989a2f..1c41de6fa19 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -865,7 +865,7 @@ public: DiskPtr tryGetDiskForDetachedPart(const String & part_name) const; DiskPtr getDiskForDetachedPart(const String & part_name) const; - bool storesDataOnDisk() const override { return true; } + bool storesDataOnDisk() const override { return !isStaticStorage(); } Strings getDataPaths() const override; /// Reserves space at least 1MB. diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index a721dd30cd7..3da4724471d 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -114,7 +114,7 @@ StorageMergeTree::StorageMergeTree( loadDataParts(has_force_restore_data_flag); - if (!attach && !getDataPartsForInternalUsage().empty()) + if (!attach && !getDataPartsForInternalUsage().empty() && !isStaticStorage()) throw Exception(ErrorCodes::INCORRECT_DATA, "Data directory for table already containing data parts - probably " "it was unclean DROP table or manual intervention. " @@ -283,6 +283,9 @@ StorageMergeTree::write(const ASTPtr & /*query*/, const StorageMetadataPtr & met void StorageMergeTree::checkTableCanBeDropped() const { + if (!supportsReplication() && isStaticStorage()) + return; + auto table_id = getStorageID(); getContext()->checkTableCanBeDropped(table_id.database_name, table_id.table_name, getTotalActiveSizeInBytes()); } diff --git a/tests/integration/test_disk_over_web_server/test.py b/tests/integration/test_disk_over_web_server/test.py index fd71389f71a..719de5e8bef 100644 --- a/tests/integration/test_disk_over_web_server/test.py +++ b/tests/integration/test_disk_over_web_server/test.py @@ -10,16 +10,22 @@ def cluster(): try: cluster = ClickHouseCluster(__file__) cluster.add_instance( - "node1", main_configs=["configs/storage_conf.xml"], with_nginx=True + "node1", + main_configs=["configs/storage_conf.xml"], + with_nginx=True, ) cluster.add_instance( "node2", main_configs=["configs/storage_conf_web.xml"], with_nginx=True, stay_alive=True, + with_zookeeper=True, ) cluster.add_instance( - "node3", main_configs=["configs/storage_conf_web.xml"], with_nginx=True + "node3", + main_configs=["configs/storage_conf_web.xml"], + with_nginx=True, + with_zookeeper=True, ) cluster.add_instance( @@ -95,7 +101,7 @@ def test_usage(cluster, node_name): for i in range(3): node2.query( """ - ATTACH TABLE test{} UUID '{}' + CREATE TABLE test{} UUID '{}' (id Int32) ENGINE = MergeTree() ORDER BY id SETTINGS storage_policy = 'web'; """.format( @@ -140,7 +146,7 @@ def test_incorrect_usage(cluster): global uuids node2.query( """ - ATTACH TABLE test0 UUID '{}' + CREATE TABLE test0 UUID '{}' (id Int32) ENGINE = MergeTree() ORDER BY id SETTINGS storage_policy = 'web'; """.format( @@ -173,7 +179,7 @@ def test_cache(cluster, node_name): for i in range(3): node2.query( """ - ATTACH TABLE test{} UUID '{}' + CREATE TABLE test{} UUID '{}' (id Int32) ENGINE = MergeTree() ORDER BY id SETTINGS storage_policy = 'cached_web'; """.format( @@ -238,7 +244,7 @@ def test_unavailable_server(cluster): global uuids node2.query( """ - ATTACH TABLE test0 UUID '{}' + CREATE TABLE test0 UUID '{}' (id Int32) ENGINE = MergeTree() ORDER BY id SETTINGS storage_policy = 'web'; """.format( @@ -276,3 +282,35 @@ def test_unavailable_server(cluster): ) node2.start_clickhouse() node2.query("DROP TABLE test0 SYNC") + + +def test_replicated_database(cluster): + node1 = cluster.instances["node3"] + node1.query( + "CREATE DATABASE rdb ENGINE=Replicated('/test/rdb', 's1', 'r1')", + settings={"allow_experimental_database_replicated": 1}, + ) + + global uuids + node1.query( + """ + CREATE TABLE rdb.table0 UUID '{}' + (id Int32) ENGINE = MergeTree() ORDER BY id + SETTINGS storage_policy = 'web'; + """.format( + uuids[0] + ) + ) + + node2 = cluster.instances["node2"] + node2.query( + "CREATE DATABASE rdb ENGINE=Replicated('/test/rdb', 's1', 'r2')", + settings={"allow_experimental_database_replicated": 1}, + ) + node2.query("SYSTEM SYNC DATABASE REPLICA rdb") + + assert node1.query("SELECT count() FROM rdb.table0") == "5000000\n" + assert node2.query("SELECT count() FROM rdb.table0") == "5000000\n" + + node1.query("DROP DATABASE rdb SYNC") + node2.query("DROP DATABASE rdb SYNC") diff --git a/tests/queries/0_stateless/02435_rollback_cancelled_queries.reference b/tests/queries/0_stateless/02435_rollback_cancelled_queries.reference index 2d32c17ec7c..38ff81b2371 100644 --- a/tests/queries/0_stateless/02435_rollback_cancelled_queries.reference +++ b/tests/queries/0_stateless/02435_rollback_cancelled_queries.reference @@ -1,3 +1,2 @@ 1000000 0 -1 diff --git a/tests/queries/0_stateless/02435_rollback_cancelled_queries.sh b/tests/queries/0_stateless/02435_rollback_cancelled_queries.sh index 8f8e8cc7ee0..776d1f850b0 100755 --- a/tests/queries/0_stateless/02435_rollback_cancelled_queries.sh +++ b/tests/queries/0_stateless/02435_rollback_cancelled_queries.sh @@ -110,11 +110,12 @@ insert_data 1 $CLICKHOUSE_CLIENT --implicit_transaction=1 -q 'select throwIf(count() % 1000000 != 0 or count() = 0) from dedup_test' \ || $CLICKHOUSE_CLIENT -q "select name, rows, active, visible, creation_tid, creation_csn from system.parts where database=currentDatabase();" -# Ensure that thread_cancel actually did something -$CLICKHOUSE_CLIENT -q "select count() > 0 from system.text_log where event_date >= yesterday() and query_id like '$TEST_MARK%' and ( - message_format_string in ('Unexpected end of file while reading chunk header of HTTP chunked data', 'Unexpected EOF, got {} of {} bytes', - 'Query was cancelled or a client has unexpectedly dropped the connection') or - message like '%Connection reset by peer%' or message like '%Broken pipe, while writing to socket%')" +# Ensure that thread_cancel actually did something (useful when editing this test) +# We cannot check it in the CI, because sometimes it fails due to randomization +# $CLICKHOUSE_CLIENT -q "select count() > 0 from system.text_log where event_date >= yesterday() and query_id like '$TEST_MARK%' and ( +# message_format_string in ('Unexpected end of file while reading chunk header of HTTP chunked data', 'Unexpected EOF, got {} of {} bytes', +# 'Query was cancelled or a client has unexpectedly dropped the connection') or +# message like '%Connection reset by peer%' or message like '%Broken pipe, while writing to socket%')" wait_for_queries_to_finish 30 $CLICKHOUSE_CLIENT --database_atomic_wait_for_drop_and_detach_synchronously=0 -q "drop table dedup_test" From 8e076c33d51d6ecdfa209158d75305befd6ab308 Mon Sep 17 00:00:00 2001 From: avogar Date: Fri, 2 Jun 2023 13:19:13 +0000 Subject: [PATCH 0314/1072] Try fix flaky test test_async_query_sending --- tests/integration/test_hedged_requests/configs/logger.xml | 5 +++++ tests/integration/test_hedged_requests/test.py | 5 ++++- 2 files changed, 9 insertions(+), 1 deletion(-) create mode 100644 tests/integration/test_hedged_requests/configs/logger.xml diff --git a/tests/integration/test_hedged_requests/configs/logger.xml b/tests/integration/test_hedged_requests/configs/logger.xml new file mode 100644 index 00000000000..48fb4e91428 --- /dev/null +++ b/tests/integration/test_hedged_requests/configs/logger.xml @@ -0,0 +1,5 @@ + + + 20 + + \ No newline at end of file diff --git a/tests/integration/test_hedged_requests/test.py b/tests/integration/test_hedged_requests/test.py index 2ca37fbb7ee..be6cea80f87 100644 --- a/tests/integration/test_hedged_requests/test.py +++ b/tests/integration/test_hedged_requests/test.py @@ -23,7 +23,7 @@ def started_cluster(): NODES["node"] = cluster.add_instance( "node", stay_alive=True, - main_configs=["configs/remote_servers.xml"], + main_configs=["configs/remote_servers.xml", "configs/logger.xml"], user_configs=["configs/users.xml"], ) @@ -377,6 +377,9 @@ def test_async_connect(started_cluster): check_changing_replica_events(2) check_if_query_sending_was_not_suspended() + # Restart server to reset connection pool state + NODES["node"].restart_clickhouse() + NODES["node"].query( "SELECT hostName(), id FROM distributed_connect ORDER BY id LIMIT 1 SETTINGS prefer_localhost_replica = 0, connect_timeout_with_failover_ms=5000, async_query_sending_for_remote=1, max_threads=1" ) From d9a4f8115fb7f362e93dc59e38a8d649d016e0e7 Mon Sep 17 00:00:00 2001 From: avogar Date: Fri, 2 Jun 2023 13:19:41 +0000 Subject: [PATCH 0315/1072] Add new line delimiter --- tests/integration/test_hedged_requests/configs/logger.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_hedged_requests/configs/logger.xml b/tests/integration/test_hedged_requests/configs/logger.xml index 48fb4e91428..b341b14d43c 100644 --- a/tests/integration/test_hedged_requests/configs/logger.xml +++ b/tests/integration/test_hedged_requests/configs/logger.xml @@ -2,4 +2,4 @@ 20 - \ No newline at end of file + From 54872f9e7ea314da7f766c8929c212fd3d07ec21 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Fri, 2 Jun 2023 13:27:56 +0000 Subject: [PATCH 0316/1072] Typos: Follow-up to #50476 --- .../mergetree-family/mergetree.md | 2 +- .../settings.md | 2 +- .../operations/settings/settings-formats.md | 2 +- docs/en/operations/settings/settings.md | 8 ++-- docs/en/operations/system-tables/quotas.md | 4 +- .../parametric-functions.md | 2 +- .../reference/exponentialmovingaverage.md | 4 +- docs/en/sql-reference/dictionaries/index.md | 6 +-- .../functions/array-functions.md | 4 +- .../functions/distance-functions.md | 2 +- .../functions/encryption-functions.md | 2 +- .../sql-reference/functions/hash-functions.md | 2 +- .../sql-reference/functions/math-functions.md | 2 +- .../functions/other-functions.md | 4 +- .../functions/type-conversion-functions.md | 2 +- docs/en/sql-reference/operators/index.md | 2 +- .../table-functions/urlCluster.md | 4 +- .../aspell-ignore/en/aspell-dict.txt | 45 ------------------- 18 files changed, 27 insertions(+), 72 deletions(-) diff --git a/docs/en/engines/table-engines/mergetree-family/mergetree.md b/docs/en/engines/table-engines/mergetree-family/mergetree.md index 1ab0f4057ff..07f706af91d 100644 --- a/docs/en/engines/table-engines/mergetree-family/mergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/mergetree.md @@ -779,7 +779,7 @@ Disks, volumes and storage policies should be declared inside the ` Date: Fri, 2 Jun 2023 13:31:14 +0000 Subject: [PATCH 0317/1072] Apply uncommitted state after snapshot deser --- src/Coordination/KeeperStateMachine.cpp | 5 ++ src/Coordination/KeeperStorage.cpp | 51 ++++++++---- src/Coordination/KeeperStorage.h | 5 ++ src/Coordination/tests/gtest_coordination.cpp | 77 +++++++++++++++++++ 4 files changed, 124 insertions(+), 14 deletions(-) diff --git a/src/Coordination/KeeperStateMachine.cpp b/src/Coordination/KeeperStateMachine.cpp index 6635c74149a..a4568cbbdd3 100644 --- a/src/Coordination/KeeperStateMachine.cpp +++ b/src/Coordination/KeeperStateMachine.cpp @@ -363,6 +363,7 @@ bool KeeperStateMachine::apply_snapshot(nuraft::snapshot & s) else if (s.get_last_log_idx() < latest_snapshot_meta->get_last_log_idx()) { LOG_INFO(log, "A snapshot with a larger last log index ({}) was created, skipping applying this snapshot", latest_snapshot_meta->get_last_log_idx()); + return true; } latest_snapshot_ptr = latest_snapshot_buf; @@ -372,6 +373,10 @@ bool KeeperStateMachine::apply_snapshot(nuraft::snapshot & s) std::lock_guard lock(storage_and_responses_lock); auto snapshot_deserialization_result = snapshot_manager.deserializeSnapshotFromBuffer(snapshot_manager.deserializeSnapshotBufferFromDisk(s.get_last_log_idx())); + + /// maybe some logs were preprocessed with log idx larger than the snapshot idx + /// we have to apply them to the new storage + storage->applyUncommittedState(*snapshot_deserialization_result.storage, s.get_last_log_idx()); storage = std::move(snapshot_deserialization_result.storage); latest_snapshot_meta = snapshot_deserialization_result.snapshot_meta; cluster_config = snapshot_deserialization_result.cluster_config; diff --git a/src/Coordination/KeeperStorage.cpp b/src/Coordination/KeeperStorage.cpp index 7a1a5e42632..66d6b0f5843 100644 --- a/src/Coordination/KeeperStorage.cpp +++ b/src/Coordination/KeeperStorage.cpp @@ -375,23 +375,26 @@ void KeeperStorage::UncommittedState::applyDelta(const Delta & delta) delta.operation); } +void KeeperStorage::UncommittedState::addDelta(Delta new_delta) +{ + const auto & added_delta = deltas.emplace_back(std::move(new_delta)); + + if (!added_delta.path.empty()) + { + deltas_for_path[added_delta.path].push_back(&added_delta); + applyDelta(added_delta); + } + else if (const auto * auth_delta = std::get_if(&added_delta.operation)) + { + auto & uncommitted_auth = session_and_auth[auth_delta->session_id]; + uncommitted_auth.emplace_back(&auth_delta->auth_id); + } +} + void KeeperStorage::UncommittedState::addDeltas(std::vector new_deltas) { for (auto & delta : new_deltas) - { - const auto & added_delta = deltas.emplace_back(std::move(delta)); - - if (!added_delta.path.empty()) - { - deltas_for_path[added_delta.path].push_back(&added_delta); - applyDelta(added_delta); - } - else if (const auto * auth_delta = std::get_if(&added_delta.operation)) - { - auto & uncommitted_auth = session_and_auth[auth_delta->session_id]; - uncommitted_auth.emplace_back(&auth_delta->auth_id); - } - } + addDelta(std::move(delta)); } void KeeperStorage::UncommittedState::commit(int64_t commit_zxid) @@ -602,6 +605,26 @@ namespace } +void KeeperStorage::applyUncommittedState(KeeperStorage & other, int64_t last_zxid) +{ + for (const auto & transaction : uncommitted_transactions) + { + if (transaction.zxid <= last_zxid) + continue; + other.uncommitted_transactions.push_back(transaction); + } + + auto it = uncommitted_state.deltas.begin(); + + for (; it != uncommitted_state.deltas.end(); ++it) + { + if (it->zxid <= last_zxid) + continue; + + other.uncommitted_state.addDelta(*it); + } +} + Coordination::Error KeeperStorage::commit(int64_t commit_zxid) { // Deltas are added with increasing ZXIDs diff --git a/src/Coordination/KeeperStorage.h b/src/Coordination/KeeperStorage.h index 7eb10be3847..844cbf85c1e 100644 --- a/src/Coordination/KeeperStorage.h +++ b/src/Coordination/KeeperStorage.h @@ -222,6 +222,7 @@ public: { explicit UncommittedState(KeeperStorage & storage_) : storage(storage_) { } + void addDelta(Delta new_delta); void addDeltas(std::vector new_deltas); void commit(int64_t commit_zxid); void rollback(int64_t rollback_zxid); @@ -310,6 +311,10 @@ public: UncommittedState uncommitted_state{*this}; + // Apply uncommitted state to another storage using only transactions + // with zxid > last_zxid + void applyUncommittedState(KeeperStorage & other, int64_t last_zxid); + Coordination::Error commit(int64_t zxid); // Create node in the storage diff --git a/src/Coordination/tests/gtest_coordination.cpp b/src/Coordination/tests/gtest_coordination.cpp index 62217fb2dd3..453fd0f2e60 100644 --- a/src/Coordination/tests/gtest_coordination.cpp +++ b/src/Coordination/tests/gtest_coordination.cpp @@ -2524,6 +2524,83 @@ TEST_P(CoordinationTest, TestCheckNotExistsRequest) } } +TEST_P(CoordinationTest, TestReapplyingDeltas) +{ + using namespace DB; + using namespace Coordination; + + static constexpr int64_t initial_zxid = 100; + + const auto create_request = std::make_shared(); + create_request->path = "/test/data"; + create_request->is_sequential = true; + + const auto process_create = [](KeeperStorage & storage, const auto & request, int64_t zxid) + { + storage.preprocessRequest(request, 1, 0, zxid); + auto responses = storage.processRequest(request, 1, zxid); + EXPECT_GE(responses.size(), 1); + EXPECT_EQ(responses[0].response->error, Error::ZOK); + }; + + const auto commit_initial_data = [&](auto & storage) + { + int64_t zxid = 1; + + const auto root_create = std::make_shared(); + root_create->path = "/test"; + process_create(storage, root_create, zxid); + ++zxid; + + for (; zxid <= initial_zxid; ++zxid) + process_create(storage, create_request, zxid); + }; + + KeeperStorage storage1{500, "", keeper_context}; + commit_initial_data(storage1); + + for (int64_t zxid = initial_zxid + 1; zxid < initial_zxid + 50; ++zxid) + storage1.preprocessRequest(create_request, 1, 0, zxid); + + /// create identical new storage + KeeperStorage storage2{500, "", keeper_context}; + commit_initial_data(storage2); + + storage1.applyUncommittedState(storage2, initial_zxid); + + const auto commit_unprocessed = [&](KeeperStorage & storage) + { + for (int64_t zxid = initial_zxid + 1; zxid < initial_zxid + 50; ++zxid) + { + auto responses = storage.processRequest(create_request, 1, zxid); + EXPECT_GE(responses.size(), 1); + EXPECT_EQ(responses[0].response->error, Error::ZOK); + } + }; + + commit_unprocessed(storage1); + commit_unprocessed(storage2); + + const auto get_children = [&](KeeperStorage & storage) + { + const auto list_request = std::make_shared(); + list_request->path = "/test"; + auto responses = storage.processRequest(list_request, 1, std::nullopt, /*check_acl=*/true, /*is_local=*/true); + EXPECT_EQ(responses.size(), 1); + const auto * list_response = dynamic_cast(responses[0].response.get()); + EXPECT_TRUE(list_response); + return list_response->names; + }; + + auto children1 = get_children(storage1); + std::unordered_set children1_set(children1.begin(), children1.end()); + + auto children2 = get_children(storage2); + std::unordered_set children2_set(children2.begin(), children2.end()); + + ASSERT_TRUE(children1_set == children2_set); +} + INSTANTIATE_TEST_SUITE_P(CoordinationTestSuite, CoordinationTest, ::testing::ValuesIn(std::initializer_list{ From 2a4f1c82756b8937a2f87453f1bd76bc09a45114 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Fri, 2 Jun 2023 14:00:57 +0000 Subject: [PATCH 0318/1072] Fix typos --- docs/en/sql-reference/functions/other-functions.md | 6 +----- docs/en/sql-reference/functions/random-functions.md | 2 +- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/docs/en/sql-reference/functions/other-functions.md b/docs/en/sql-reference/functions/other-functions.md index 6d49527fd0d..254297451f7 100644 --- a/docs/en/sql-reference/functions/other-functions.md +++ b/docs/en/sql-reference/functions/other-functions.md @@ -2006,11 +2006,7 @@ isDecimalOverflow(d, [p]) **Arguments** - `d` — value. [Decimal](../../sql-reference/data-types/decimal.md). -<<<<<<< HEAD -- `p` — precision. Optional. If omitted, the initial precision of the first argument is used. This paratemer can be helpful to migrate data from/to another database or file. [UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges). -======= -- `p` — precision. Optional. If omitted, the initial precision of the first argument is used. Using of this parameter could be helpful for data extraction to another DBMS or file. [UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges). ->>>>>>> rschu1ze/master +- `p` — precision. Optional. If omitted, the initial precision of the first argument is used. This parameter can be helpful to migrate data from/to another database or file. [UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges). **Returned values** diff --git a/docs/en/sql-reference/functions/random-functions.md b/docs/en/sql-reference/functions/random-functions.md index 63d5174b494..e593d9458f0 100644 --- a/docs/en/sql-reference/functions/random-functions.md +++ b/docs/en/sql-reference/functions/random-functions.md @@ -67,7 +67,7 @@ randUniform(min, max) **Arguments** - `min` - `Float64` - left boundary of the range, -- `max` - `Float64` - reight boundary of the range. +- `max` - `Float64` - right boundary of the range. **Returned value** From 5e17adc9c0cab1ac911ccf6c7ad3cb1f8d8c7447 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A1nos=20Benjamin=20Antal?= Date: Fri, 2 Jun 2023 14:08:14 +0000 Subject: [PATCH 0319/1072] Add `system.user_processes` table --- .../System/StorageSystemUserProcesses.cpp | 60 +++++++++++++++++++ .../System/StorageSystemUserProcesses.h | 29 +++++++++ src/Storages/System/attachSystemTables.cpp | 2 + 3 files changed, 91 insertions(+) create mode 100644 src/Storages/System/StorageSystemUserProcesses.cpp create mode 100644 src/Storages/System/StorageSystemUserProcesses.h diff --git a/src/Storages/System/StorageSystemUserProcesses.cpp b/src/Storages/System/StorageSystemUserProcesses.cpp new file mode 100644 index 00000000000..5973f9e2af3 --- /dev/null +++ b/src/Storages/System/StorageSystemUserProcesses.cpp @@ -0,0 +1,60 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +NamesAndTypesList StorageSystemUserProcesses::getNamesAndTypes() +{ + return { + {"user", std::make_shared()}, + {"memory_usage", std::make_shared()}, + {"peak_memory_usage", std::make_shared()}, + {"ProfileEvents", std::make_shared(std::make_shared(), std::make_shared())}, + }; +} + +NamesAndAliases StorageSystemUserProcesses::getNamesAndAliases() +{ + return { + {"ProfileEvents.Names", {std::make_shared(std::make_shared())}, "mapKeys(ProfileEvents)"}, + {"ProfileEvents.Values", {std::make_shared(std::make_shared())}, "mapValues(ProfileEvents)"}}; +} + +void StorageSystemUserProcesses::fillData(MutableColumns & res_columns, ContextPtr context, const SelectQueryInfo &) const +{ + const auto user_info = context->getProcessList().getUserInfo(true); + + for (const auto & [user, info] : user_info) + { + size_t i = 0; + + res_columns[i++]->insert(user); + res_columns[i++]->insert(info.memory_usage); + res_columns[i++]->insert(info.peak_memory_usage); + { + IColumn * column = res_columns[i++].get(); + + if (info.profile_counters) + ProfileEvents::dumpToMapColumn(*info.profile_counters, column, true); + else + { + column->insertDefault(); + } + } + } +} +} diff --git a/src/Storages/System/StorageSystemUserProcesses.h b/src/Storages/System/StorageSystemUserProcesses.h new file mode 100644 index 00000000000..9bdc009d849 --- /dev/null +++ b/src/Storages/System/StorageSystemUserProcesses.h @@ -0,0 +1,29 @@ +#pragma once + +#include + + +namespace DB +{ + +class Context; + + +/** Implements `processes` system table, which allows you to get information about the queries that are currently executing. + */ +class StorageSystemUserProcesses final : public IStorageSystemOneBlock +{ +public: + std::string getName() const override { return "SystemUserProcesses"; } + + static NamesAndTypesList getNamesAndTypes(); + + static NamesAndAliases getNamesAndAliases(); + +protected: + using IStorageSystemOneBlock::IStorageSystemOneBlock; + + void fillData(MutableColumns & res_columns, ContextPtr context, const SelectQueryInfo & query_info) const override; +}; + +} diff --git a/src/Storages/System/attachSystemTables.cpp b/src/Storages/System/attachSystemTables.cpp index 424c74662ec..7d21d9e39d2 100644 --- a/src/Storages/System/attachSystemTables.cpp +++ b/src/Storages/System/attachSystemTables.cpp @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -185,6 +186,7 @@ void attachSystemTablesServer(ContextPtr context, IDatabase & system_database, b attach(context, system_database, "remote_data_paths"); attach(context, system_database, "certificates"); attach(context, system_database, "named_collections"); + attach(context, system_database, "user_processes"); if (has_zookeeper) { From 17cca6ed756eaaa58eae7ef6aa89e43dcda8ce24 Mon Sep 17 00:00:00 2001 From: DanRoscigno Date: Fri, 2 Jun 2023 10:08:48 -0400 Subject: [PATCH 0320/1072] add direct join docs --- .../integrations/embedded-rocksdb.md | 85 +++++++++++++++++++ 1 file changed, 85 insertions(+) diff --git a/docs/en/engines/table-engines/integrations/embedded-rocksdb.md b/docs/en/engines/table-engines/integrations/embedded-rocksdb.md index a3604b3c332..dab741a9f63 100644 --- a/docs/en/engines/table-engines/integrations/embedded-rocksdb.md +++ b/docs/en/engines/table-engines/integrations/embedded-rocksdb.md @@ -120,3 +120,88 @@ Values can be updated using the `ALTER TABLE` query. The primary key cannot be u ```sql ALTER TABLE test UPDATE v1 = v1 * 10 + 2 WHERE key LIKE 'some%' AND v3 > 3.1; ``` + +### Joins + +A special `direct` join with EmbeddedRocksDB tables is supported. +This direct join avoids forming a hash table in memory and accesses +the data directly from the EmbeddedRocksDB. + +To enable direct joins: +```sql +SET join_algorithm = 'direct' +``` + +:::tip +When the `join_algorithm` is set to `direct`, direct joins will be used +when possible. However, direct joins are not used for RIGHT or FULL JOINs. +ClickHouse will choose another join algorithm when direct joins are not possible. +::: + +#### Example + +##### Create and populate an EmbeddedRocksDB table: +```sql +CREATE TABLE rdb +( + `key` UInt32, + `value` Array(UInt32), + `value2` String +) +ENGINE = EmbeddedRocksDB +PRIMARY KEY key +``` + +```sql +INSERT INTO rdb + SELECT + toUInt32(sipHash64(number) % 10) as key, + [key, key+1] as value, + ('val2' || toString(key)) as value2 + FROM numbers_mt(10); +``` + +##### Create and populate a table to join with table `rdb`: + +```sql +CREATE TABLE t2 +( + `k` UInt16 +) +ENGINE = TinyLog +``` + +```sql +INSERT INTO t2 SELECT number AS k +FROM numbers_mt(10) +``` + +##### Set the join algorithm to `direct`: + +```sql +SET join_algorithm = 'direct' +``` + +##### An INNER JOIN: +```sql +SELECT * +FROM +( + SELECT k AS key + FROM t2 +) AS t2 +INNER JOIN rdb ON rdb.key = t2.key +ORDER BY key ASC +``` +```response +┌─key─┬─rdb.key─┬─value──┬─value2─┐ +│ 0 │ 0 │ [0,1] │ val20 │ +│ 2 │ 2 │ [2,3] │ val22 │ +│ 3 │ 3 │ [3,4] │ val23 │ +│ 6 │ 6 │ [6,7] │ val26 │ +│ 7 │ 7 │ [7,8] │ val27 │ +│ 8 │ 8 │ [8,9] │ val28 │ +│ 9 │ 9 │ [9,10] │ val29 │ +└─────┴─────────┴────────┴────────┘ +``` + From d0da370d1e18c69e533f5d85725b7db4aa87d884 Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Fri, 2 Jun 2023 16:19:24 +0200 Subject: [PATCH 0321/1072] Add a test for compound column identifier --- .../02771_resolve_compound_identifier.reference | 10 ++++++++++ .../0_stateless/02771_resolve_compound_identifier.sql | 11 +++++++++++ 2 files changed, 21 insertions(+) create mode 100644 tests/queries/0_stateless/02771_resolve_compound_identifier.reference create mode 100644 tests/queries/0_stateless/02771_resolve_compound_identifier.sql diff --git a/tests/queries/0_stateless/02771_resolve_compound_identifier.reference b/tests/queries/0_stateless/02771_resolve_compound_identifier.reference new file mode 100644 index 00000000000..8b1acc12b63 --- /dev/null +++ b/tests/queries/0_stateless/02771_resolve_compound_identifier.reference @@ -0,0 +1,10 @@ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 diff --git a/tests/queries/0_stateless/02771_resolve_compound_identifier.sql b/tests/queries/0_stateless/02771_resolve_compound_identifier.sql new file mode 100644 index 00000000000..db4d443379e --- /dev/null +++ b/tests/queries/0_stateless/02771_resolve_compound_identifier.sql @@ -0,0 +1,11 @@ +DROP DATABASE IF EXISTS test_02771; + +CREATE DATABASE test_02771; + +CREATE TABLE test_02771.t (x UInt8) ENGINE = MergeTree() ORDER BY x; + +INSERT INTO test_02771.t SELECT number FROM numbers(10); + +SELECT t.x FROM test_02771.t ORDER BY t.x; + +DROP DATABASE IF EXISTS test_02771; From ae497d398a5bcb48fef1d6b90e24fb0818e2bd05 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Fri, 2 Jun 2023 14:23:04 +0000 Subject: [PATCH 0322/1072] Remove unneeded test --- .../__init__.py | 0 .../configs/enable_keeper1.xml | 41 -- .../configs/enable_keeper2.xml | 41 -- .../configs/enable_keeper3.xml | 41 -- .../configs/use_keeper.xml | 16 - .../test.py | 473 ------------------ 6 files changed, 612 deletions(-) delete mode 100644 tests/integration/test_keeper_multinode_blocade_leader/__init__.py delete mode 100644 tests/integration/test_keeper_multinode_blocade_leader/configs/enable_keeper1.xml delete mode 100644 tests/integration/test_keeper_multinode_blocade_leader/configs/enable_keeper2.xml delete mode 100644 tests/integration/test_keeper_multinode_blocade_leader/configs/enable_keeper3.xml delete mode 100644 tests/integration/test_keeper_multinode_blocade_leader/configs/use_keeper.xml delete mode 100644 tests/integration/test_keeper_multinode_blocade_leader/test.py diff --git a/tests/integration/test_keeper_multinode_blocade_leader/__init__.py b/tests/integration/test_keeper_multinode_blocade_leader/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/tests/integration/test_keeper_multinode_blocade_leader/configs/enable_keeper1.xml b/tests/integration/test_keeper_multinode_blocade_leader/configs/enable_keeper1.xml deleted file mode 100644 index 17455ed12f5..00000000000 --- a/tests/integration/test_keeper_multinode_blocade_leader/configs/enable_keeper1.xml +++ /dev/null @@ -1,41 +0,0 @@ - - - 9181 - 1 - /var/lib/clickhouse/coordination/log - /var/lib/clickhouse/coordination/snapshots - - - 5000 - 10000 - 75 - trace - - - - - 1 - node1 - 9234 - true - 3 - - - 2 - node2 - 9234 - true - true - 2 - - - 3 - node3 - 9234 - true - true - 1 - - - - diff --git a/tests/integration/test_keeper_multinode_blocade_leader/configs/enable_keeper2.xml b/tests/integration/test_keeper_multinode_blocade_leader/configs/enable_keeper2.xml deleted file mode 100644 index 03a23984cc2..00000000000 --- a/tests/integration/test_keeper_multinode_blocade_leader/configs/enable_keeper2.xml +++ /dev/null @@ -1,41 +0,0 @@ - - - 9181 - 2 - /var/lib/clickhouse/coordination/log - /var/lib/clickhouse/coordination/snapshots - - - 5000 - 10000 - 75 - trace - - - - - 1 - node1 - 9234 - true - 3 - - - 2 - node2 - 9234 - true - true - 2 - - - 3 - node3 - 9234 - true - true - 1 - - - - diff --git a/tests/integration/test_keeper_multinode_blocade_leader/configs/enable_keeper3.xml b/tests/integration/test_keeper_multinode_blocade_leader/configs/enable_keeper3.xml deleted file mode 100644 index a3196ac3061..00000000000 --- a/tests/integration/test_keeper_multinode_blocade_leader/configs/enable_keeper3.xml +++ /dev/null @@ -1,41 +0,0 @@ - - - 9181 - 3 - /var/lib/clickhouse/coordination/log - /var/lib/clickhouse/coordination/snapshots - - - 5000 - 10000 - 75 - trace - - - - - 1 - node1 - 9234 - true - 3 - - - 2 - node2 - 9234 - true - true - 2 - - - 3 - node3 - 9234 - true - true - 1 - - - - diff --git a/tests/integration/test_keeper_multinode_blocade_leader/configs/use_keeper.xml b/tests/integration/test_keeper_multinode_blocade_leader/configs/use_keeper.xml deleted file mode 100644 index 384e984f210..00000000000 --- a/tests/integration/test_keeper_multinode_blocade_leader/configs/use_keeper.xml +++ /dev/null @@ -1,16 +0,0 @@ - - - - node1 - 9181 - - - node2 - 9181 - - - node3 - 9181 - - - diff --git a/tests/integration/test_keeper_multinode_blocade_leader/test.py b/tests/integration/test_keeper_multinode_blocade_leader/test.py deleted file mode 100644 index 3af0751b0fd..00000000000 --- a/tests/integration/test_keeper_multinode_blocade_leader/test.py +++ /dev/null @@ -1,473 +0,0 @@ -import pytest -from helpers.cluster import ClickHouseCluster -import helpers.keeper_utils as keeper_utils -import random -import string -import os -import time -from multiprocessing.dummy import Pool -from helpers.network import PartitionManager -from helpers.test_tools import assert_eq_with_retry - -cluster = ClickHouseCluster(__file__) -node1 = cluster.add_instance( - "node1", - main_configs=["configs/enable_keeper1.xml", "configs/use_keeper.xml"], - stay_alive=True, -) -node2 = cluster.add_instance( - "node2", - main_configs=["configs/enable_keeper2.xml", "configs/use_keeper.xml"], - stay_alive=True, -) -node3 = cluster.add_instance( - "node3", - main_configs=["configs/enable_keeper3.xml", "configs/use_keeper.xml"], - stay_alive=True, -) - -from kazoo.client import KazooClient, KazooState - -""" -In this test, we blockade RAFT leader and check that the whole system is -able to recover. It's not a good test because we use ClickHouse's replicated -tables to check connectivity, but they may require special operations (or a long -wait) after session expiration. We don't use kazoo, because this client pretends -to be very smart: SUSPEND sessions, try to recover them, and so on. The test -will be even less predictable than with ClickHouse tables. - -TODO find (or write) not so smart python client. -TODO remove this when jepsen tests will be written. -""" - - -@pytest.fixture(scope="module") -def started_cluster(): - try: - cluster.start() - - yield cluster - - finally: - cluster.shutdown() - - -def smaller_exception(ex): - return "\n".join(str(ex).split("\n")[0:2]) - - -def get_fake_zk(nodename, timeout=30.0): - _fake_zk_instance = KazooClient( - hosts=cluster.get_instance_ip(nodename) + ":9181", timeout=timeout - ) - _fake_zk_instance.start() - return _fake_zk_instance - - -def wait_nodes(): - keeper_utils.wait_nodes(cluster, [node1, node2, node3]) - - -# in extremely rare case it can take more than 5 minutes in debug build with sanitizer -@pytest.mark.timeout(600) -def test_blocade_leader(started_cluster): - for i in range(100): - wait_nodes() - try: - for i, node in enumerate([node1, node2, node3]): - node.query( - "CREATE DATABASE IF NOT EXISTS ordinary ENGINE=Ordinary", - settings={"allow_deprecated_database_ordinary": 1}, - ) - node.query( - "CREATE TABLE IF NOT EXISTS ordinary.t1 (value UInt64) ENGINE = ReplicatedMergeTree('/clickhouse/t1', '{}') ORDER BY tuple()".format( - i + 1 - ) - ) - break - except Exception as ex: - print("Got exception from node", smaller_exception(ex)) - time.sleep(0.1) - - node2.query( - "INSERT INTO ordinary.t1 SELECT number FROM numbers(10) SETTINGS insert_keeper_max_retries = 0" - ) - - node1.query("SYSTEM SYNC REPLICA ordinary.t1", timeout=10) - node3.query("SYSTEM SYNC REPLICA ordinary.t1", timeout=10) - - assert_eq_with_retry(node1, "SELECT COUNT() FROM ordinary.t1", "10") - assert_eq_with_retry(node2, "SELECT COUNT() FROM ordinary.t1", "10") - assert_eq_with_retry(node3, "SELECT COUNT() FROM ordinary.t1", "10") - - with PartitionManager() as pm: - pm.partition_instances(node2, node1) - pm.partition_instances(node3, node1) - - for i in range(100): - try: - restart_replica_for_sure( - node2, "ordinary.t1", "/clickhouse/t1/replicas/2" - ) - node2.query( - "INSERT INTO ordinary.t1 SELECT rand() FROM numbers(100) SETTINGS insert_keeper_max_retries = 0" - ) - break - except Exception as ex: - try: - node2.query("ATTACH TABLE ordinary.t1") - except Exception as attach_ex: - print("Got exception node2", smaller_exception(attach_ex)) - print("Got exception node2", smaller_exception(ex)) - time.sleep(0.5) - else: - for num, node in enumerate([node1, node2, node3]): - dump_zk( - node, "/clickhouse/t1", "/clickhouse/t1/replicas/{}".format(num + 1) - ) - assert False, "Cannot insert anything node2" - - for i in range(100): - try: - restart_replica_for_sure( - node3, "ordinary.t1", "/clickhouse/t1/replicas/3" - ) - node3.query( - "INSERT INTO ordinary.t1 SELECT rand() FROM numbers(100) SETTINGS insert_keeper_max_retries = 0" - ) - break - except Exception as ex: - try: - node3.query("ATTACH TABLE ordinary.t1") - except Exception as attach_ex: - print("Got exception node3", smaller_exception(attach_ex)) - print("Got exception node3", smaller_exception(ex)) - time.sleep(0.5) - else: - for num, node in enumerate([node1, node2, node3]): - dump_zk( - node, "/clickhouse/t1", "/clickhouse/t1/replicas/{}".format(num + 1) - ) - assert False, "Cannot insert anything node3" - - for n, node in enumerate([node1, node2, node3]): - for i in range(100): - try: - restart_replica_for_sure( - node, "ordinary.t1", "/clickhouse/t1/replicas/{}".format(n + 1) - ) - break - except Exception as ex: - try: - node.query("ATTACH TABLE ordinary.t1") - except Exception as attach_ex: - print( - "Got exception node{}".format(n + 1), - smaller_exception(attach_ex), - ) - - print("Got exception node{}".format(n + 1), smaller_exception(ex)) - time.sleep(0.5) - else: - assert False, "Cannot reconnect for node{}".format(n + 1) - - for i in range(100): - try: - node1.query( - "INSERT INTO ordinary.t1 SELECT rand() FROM numbers(100) SETTINGS insert_keeper_max_retries = 0" - ) - break - except Exception as ex: - print("Got exception node1", smaller_exception(ex)) - time.sleep(0.5) - else: - for num, node in enumerate([node1, node2, node3]): - dump_zk( - node, "/clickhouse/t1", "/clickhouse/t1/replicas/{}".format(num + 1) - ) - assert False, "Cannot insert anything node1" - - for n, node in enumerate([node1, node2, node3]): - for i in range(100): - try: - restart_replica_for_sure( - node, "ordinary.t1", "/clickhouse/t1/replicas/{}".format(n + 1) - ) - node.query("SYSTEM SYNC REPLICA ordinary.t1", timeout=10) - break - except Exception as ex: - try: - node.query("ATTACH TABLE ordinary.t1") - except Exception as attach_ex: - print( - "Got exception node{}".format(n + 1), - smaller_exception(attach_ex), - ) - - print("Got exception node{}".format(n + 1), smaller_exception(ex)) - time.sleep(0.5) - else: - for num, node in enumerate([node1, node2, node3]): - dump_zk( - node, "/clickhouse/t1", "/clickhouse/t1/replicas/{}".format(num + 1) - ) - assert False, "Cannot sync replica node{}".format(n + 1) - - if node1.query("SELECT COUNT() FROM ordinary.t1") != "310\n": - for num, node in enumerate([node1, node2, node3]): - dump_zk( - node, "/clickhouse/t1", "/clickhouse/t1/replicas/{}".format(num + 1) - ) - - assert_eq_with_retry(node1, "SELECT COUNT() FROM ordinary.t1", "310") - assert_eq_with_retry(node2, "SELECT COUNT() FROM ordinary.t1", "310") - assert_eq_with_retry(node3, "SELECT COUNT() FROM ordinary.t1", "310") - - -def dump_zk(node, zk_path, replica_path): - print(node.query("SELECT * FROM system.replication_queue FORMAT Vertical")) - print("Replicas") - print(node.query("SELECT * FROM system.replicas FORMAT Vertical")) - print("Replica 2 info") - print( - node.query( - "SELECT * FROM system.zookeeper WHERE path = '{}' FORMAT Vertical".format( - zk_path - ) - ) - ) - print("Queue") - print( - node.query( - "SELECT * FROM system.zookeeper WHERE path = '{}/queue' FORMAT Vertical".format( - replica_path - ) - ) - ) - print("Log") - print( - node.query( - "SELECT * FROM system.zookeeper WHERE path = '{}/log' FORMAT Vertical".format( - zk_path - ) - ) - ) - print("Parts") - print( - node.query( - "SELECT name FROM system.zookeeper WHERE path = '{}/parts' FORMAT Vertical".format( - replica_path - ) - ) - ) - - -def restart_replica_for_sure(node, table_name, zk_replica_path): - fake_zk = None - try: - node.query("DETACH TABLE {}".format(table_name)) - fake_zk = get_fake_zk(node.name) - if fake_zk.exists(zk_replica_path + "/is_active") is not None: - fake_zk.delete(zk_replica_path + "/is_active") - - node.query("ATTACH TABLE {}".format(table_name)) - except Exception as ex: - print("Exception", ex) - raise ex - finally: - if fake_zk: - fake_zk.stop() - fake_zk.close() - - -# in extremely rare case it can take more than 5 minutes in debug build with sanitizer -@pytest.mark.timeout(600) -def test_blocade_leader_twice(started_cluster): - for i in range(100): - wait_nodes() - try: - for i, node in enumerate([node1, node2, node3]): - node.query( - "CREATE DATABASE IF NOT EXISTS ordinary ENGINE=Ordinary", - settings={"allow_deprecated_database_ordinary": 1}, - ) - node.query( - "CREATE TABLE IF NOT EXISTS ordinary.t2 (value UInt64) ENGINE = ReplicatedMergeTree('/clickhouse/t2', '{}') ORDER BY tuple()".format( - i + 1 - ) - ) - break - except Exception as ex: - print("Got exception from node", smaller_exception(ex)) - time.sleep(0.1) - - node2.query( - "INSERT INTO ordinary.t2 SELECT number FROM numbers(10) SETTINGS insert_keeper_max_retries = 0" - ) - - node1.query("SYSTEM SYNC REPLICA ordinary.t2", timeout=10) - node3.query("SYSTEM SYNC REPLICA ordinary.t2", timeout=10) - - assert_eq_with_retry(node1, "SELECT COUNT() FROM ordinary.t2", "10") - assert_eq_with_retry(node2, "SELECT COUNT() FROM ordinary.t2", "10") - assert_eq_with_retry(node3, "SELECT COUNT() FROM ordinary.t2", "10") - - with PartitionManager() as pm: - pm.partition_instances(node2, node1) - pm.partition_instances(node3, node1) - - for i in range(100): - try: - restart_replica_for_sure( - node2, "ordinary.t2", "/clickhouse/t2/replicas/2" - ) - node2.query( - "INSERT INTO ordinary.t2 SELECT rand() FROM numbers(100) SETTINGS insert_keeper_max_retries = 0" - ) - break - except Exception as ex: - try: - node2.query("ATTACH TABLE ordinary.t2") - except Exception as attach_ex: - print("Got exception node2", smaller_exception(attach_ex)) - print("Got exception node2", smaller_exception(ex)) - time.sleep(0.5) - else: - for num, node in enumerate([node1, node2, node3]): - dump_zk( - node, "/clickhouse/t2", "/clickhouse/t2/replicas/{}".format(num + 1) - ) - assert False, "Cannot reconnect for node2" - - for i in range(100): - try: - restart_replica_for_sure( - node3, "ordinary.t2", "/clickhouse/t2/replicas/3" - ) - node3.query("SYSTEM SYNC REPLICA ordinary.t2", timeout=10) - node3.query( - "INSERT INTO ordinary.t2 SELECT rand() FROM numbers(100) SETTINGS insert_keeper_max_retries = 0" - ) - break - except Exception as ex: - try: - node3.query("ATTACH TABLE ordinary.t2") - except Exception as attach_ex: - print("Got exception node3", smaller_exception(attach_ex)) - print("Got exception node3", smaller_exception(ex)) - time.sleep(0.5) - else: - for num, node in enumerate([node1, node2, node3]): - dump_zk( - node, "/clickhouse/t2", "/clickhouse/t2/replicas/{}".format(num + 1) - ) - assert False, "Cannot reconnect for node3" - - node2.query("SYSTEM SYNC REPLICA ordinary.t2", timeout=10) - - assert_eq_with_retry(node2, "SELECT COUNT() FROM ordinary.t2", "210") - assert_eq_with_retry(node3, "SELECT COUNT() FROM ordinary.t2", "210") - - # Total network partition - pm.partition_instances(node3, node2) - - for i in range(10): - try: - node3.query( - "INSERT INTO ordinary.t2 SELECT rand() FROM numbers(100) SETTINGS insert_keeper_max_retries = 0" - ) - assert False, "Node3 became leader?" - except Exception as ex: - time.sleep(0.5) - - for i in range(10): - try: - node2.query( - "INSERT INTO ordinary.t2 SELECT rand() FROM numbers(100) SETTINGS insert_keeper_max_retries = 0" - ) - assert False, "Node2 became leader?" - except Exception as ex: - time.sleep(0.5) - - for n, node in enumerate([node1, node2, node3]): - for i in range(100): - try: - restart_replica_for_sure( - node, "ordinary.t2", "/clickhouse/t2/replicas/{}".format(n + 1) - ) - break - except Exception as ex: - try: - node.query("ATTACH TABLE ordinary.t2") - except Exception as attach_ex: - print( - "Got exception node{}".format(n + 1), - smaller_exception(attach_ex), - ) - - print("Got exception node{}".format(n + 1), smaller_exception(ex)) - time.sleep(0.5) - else: - for num, node in enumerate([node1, node2, node3]): - dump_zk( - node, "/clickhouse/t2", "/clickhouse/t2/replicas/{}".format(num + 1) - ) - assert False, "Cannot reconnect for node{}".format(n + 1) - - for n, node in enumerate([node1, node2, node3]): - for i in range(100): - try: - node.query( - "INSERT INTO ordinary.t2 SELECT rand() FROM numbers(100) SETTINGS insert_keeper_max_retries = 0" - ) - break - except Exception as ex: - print("Got exception node{}".format(n + 1), smaller_exception(ex)) - time.sleep(0.5) - else: - for num, node in enumerate([node1, node2, node3]): - dump_zk( - node, "/clickhouse/t2", "/clickhouse/t2/replicas/{}".format(num + 1) - ) - assert False, "Cannot reconnect for node{}".format(n + 1) - - for i in range(100): - all_done = True - for n, node in enumerate([node1, node2, node3]): - try: - restart_replica_for_sure( - node, "ordinary.t2", "/clickhouse/t2/replicas/{}".format(n + 1) - ) - node.query("SYSTEM SYNC REPLICA ordinary.t2", timeout=10) - break - except Exception as ex: - all_done = False - try: - node.query("ATTACH TABLE ordinary.t2") - except Exception as attach_ex: - print( - "Got exception node{}".format(n + 1), - smaller_exception(attach_ex), - ) - - print("Got exception node{}".format(n + 1), smaller_exception(ex)) - time.sleep(0.5) - - if all_done: - break - else: - for num, node in enumerate([node1, node2, node3]): - dump_zk( - node, "/clickhouse/t2", "/clickhouse/t2/replicas/{}".format(num + 1) - ) - assert False, "Cannot reconnect in i {} retries".format(i) - - assert_eq_with_retry(node1, "SELECT COUNT() FROM ordinary.t2", "510") - if node2.query("SELECT COUNT() FROM ordinary.t2") != "510\n": - for num, node in enumerate([node1, node2, node3]): - dump_zk( - node, "/clickhouse/t2", "/clickhouse/t2/replicas/{}".format(num + 1) - ) - - assert_eq_with_retry(node2, "SELECT COUNT() FROM ordinary.t2", "510") - assert_eq_with_retry(node3, "SELECT COUNT() FROM ordinary.t2", "510") From dcd5579851243a38ba9062636b72c2b60ebfadd1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Fri, 2 Jun 2023 16:51:04 +0200 Subject: [PATCH 0323/1072] Mention ON CLUSTER options for SYSTEM queries --- docs/en/sql-reference/statements/system.md | 54 ++++++++++++++-------- docs/ru/sql-reference/statements/system.md | 48 +++++++++++-------- docs/zh/sql-reference/statements/system.md | 40 +++++++++------- 3 files changed, 87 insertions(+), 55 deletions(-) diff --git a/docs/en/sql-reference/statements/system.md b/docs/en/sql-reference/statements/system.md index c5596b7ba5f..65a35f03fbe 100644 --- a/docs/en/sql-reference/statements/system.md +++ b/docs/en/sql-reference/statements/system.md @@ -119,19 +119,35 @@ The compiled expression cache is enabled/disabled with the query/user/profile-le Resets the [query cache](../../operations/query-cache.md). +```sql +SYSTEM DROP QUERY CACHE [ON CLUSTER cluster_name] +``` + ## FLUSH LOGS Flushes buffered log messages to system tables, e.g. system.query_log. Mainly useful for debugging since most system tables have a default flush interval of 7.5 seconds. This will also create system tables even if message queue is empty. +```sql +SYSTEM FLUSH LOGS [ON CLUSTER cluster_name] +``` + ## RELOAD CONFIG Reloads ClickHouse configuration. Used when configuration is stored in ZooKeeper. Note that `SYSTEM RELOAD CONFIG` does not reload `USER` configuration stored in ZooKeeper, it only reloads `USER` configuration that is stored in `users.xml`. To reload all `USER` config use `SYSTEM RELOAD USERS` +```sql +SYSTEM RELOAD CONFIG [ON CLUSTER cluster_name] +``` + ## RELOAD USERS Reloads all access storages, including: users.xml, local disk access storage, replicated (in ZooKeeper) access storage. +```sql +SYSTEM RELOAD USERS [ON CLUSTER cluster_name] +``` + ## SHUTDOWN Normally shuts down ClickHouse (like `service clickhouse-server stop` / `kill {$pid_clickhouse-server}`) @@ -149,7 +165,7 @@ ClickHouse can manage [distributed](../../engines/table-engines/special/distribu Disables background data distribution when inserting data into distributed tables. ``` sql -SYSTEM STOP DISTRIBUTED SENDS [db.] +SYSTEM STOP DISTRIBUTED SENDS [db.] [ON CLUSTER cluster_name] ``` ### FLUSH DISTRIBUTED @@ -157,7 +173,7 @@ SYSTEM STOP DISTRIBUTED SENDS [db.] Forces ClickHouse to send data to cluster nodes synchronously. If any nodes are unavailable, ClickHouse throws an exception and stops query execution. You can retry the query until it succeeds, which will happen when all nodes are back online. ``` sql -SYSTEM FLUSH DISTRIBUTED [db.] +SYSTEM FLUSH DISTRIBUTED [db.] [ON CLUSTER cluster_name] ``` ### START DISTRIBUTED SENDS @@ -165,7 +181,7 @@ SYSTEM FLUSH DISTRIBUTED [db.] Enables background data distribution when inserting data into distributed tables. ``` sql -SYSTEM START DISTRIBUTED SENDS [db.] +SYSTEM START DISTRIBUTED SENDS [db.] [ON CLUSTER cluster_name] ``` ## Managing MergeTree Tables @@ -177,7 +193,7 @@ ClickHouse can manage background processes in [MergeTree](../../engines/table-en Provides possibility to stop background merges for tables in the MergeTree family: ``` sql -SYSTEM STOP MERGES [ON VOLUME | [db.]merge_tree_family_table_name] +SYSTEM STOP MERGES [ON CLUSTER cluster_name] [ON VOLUME | [db.]merge_tree_family_table_name] ``` :::note @@ -189,7 +205,7 @@ SYSTEM STOP MERGES [ON VOLUME | [db.]merge_tree_family_table_name] Provides possibility to start background merges for tables in the MergeTree family: ``` sql -SYSTEM START MERGES [ON VOLUME | [db.]merge_tree_family_table_name] +SYSTEM START MERGES [ON CLUSTER cluster_name] [ON VOLUME | [db.]merge_tree_family_table_name] ``` ### STOP TTL MERGES @@ -198,7 +214,7 @@ Provides possibility to stop background delete old data according to [TTL expres Returns `Ok.` even if table does not exist or table has not MergeTree engine. Returns error when database does not exist: ``` sql -SYSTEM STOP TTL MERGES [[db.]merge_tree_family_table_name] +SYSTEM STOP TTL MERGES [ON CLUSTER cluster_name] [[db.]merge_tree_family_table_name] ``` ### START TTL MERGES @@ -207,7 +223,7 @@ Provides possibility to start background delete old data according to [TTL expre Returns `Ok.` even if table does not exist. Returns error when database does not exist: ``` sql -SYSTEM START TTL MERGES [[db.]merge_tree_family_table_name] +SYSTEM START TTL MERGES [ON CLUSTER cluster_name] [[db.]merge_tree_family_table_name] ``` ### STOP MOVES @@ -216,7 +232,7 @@ Provides possibility to stop background move data according to [TTL table expres Returns `Ok.` even if table does not exist. Returns error when database does not exist: ``` sql -SYSTEM STOP MOVES [[db.]merge_tree_family_table_name] +SYSTEM STOP MOVES [ON CLUSTER cluster_name] [[db.]merge_tree_family_table_name] ``` ### START MOVES @@ -225,7 +241,7 @@ Provides possibility to start background move data according to [TTL table expre Returns `Ok.` even if table does not exist. Returns error when database does not exist: ``` sql -SYSTEM START MOVES [[db.]merge_tree_family_table_name] +SYSTEM START MOVES [ON CLUSTER cluster_name] [[db.]merge_tree_family_table_name] ``` ### SYSTEM UNFREEZE {#query_language-system-unfreeze} @@ -241,7 +257,7 @@ SYSTEM UNFREEZE WITH NAME Wait until all asynchronously loading data parts of a table (outdated data parts) will became loaded. ``` sql -SYSTEM WAIT LOADING PARTS [db.]merge_tree_family_table_name +SYSTEM WAIT LOADING PARTS [ON CLUSTER cluster_name] [db.]merge_tree_family_table_name ``` ## Managing ReplicatedMergeTree Tables @@ -254,7 +270,7 @@ Provides possibility to stop background fetches for inserted parts for tables in Always returns `Ok.` regardless of the table engine and even if table or database does not exist. ``` sql -SYSTEM STOP FETCHES [[db.]replicated_merge_tree_family_table_name] +SYSTEM STOP FETCHES [ON CLUSTER cluster_name] [[db.]replicated_merge_tree_family_table_name] ``` ### START FETCHES @@ -263,7 +279,7 @@ Provides possibility to start background fetches for inserted parts for tables i Always returns `Ok.` regardless of the table engine and even if table or database does not exist. ``` sql -SYSTEM START FETCHES [[db.]replicated_merge_tree_family_table_name] +SYSTEM START FETCHES [ON CLUSTER cluster_name] [[db.]replicated_merge_tree_family_table_name] ``` ### STOP REPLICATED SENDS @@ -271,7 +287,7 @@ SYSTEM START FETCHES [[db.]replicated_merge_tree_family_table_name] Provides possibility to stop background sends to other replicas in cluster for new inserted parts for tables in the `ReplicatedMergeTree` family: ``` sql -SYSTEM STOP REPLICATED SENDS [[db.]replicated_merge_tree_family_table_name] +SYSTEM STOP REPLICATED SENDS [ON CLUSTER cluster_name] [[db.]replicated_merge_tree_family_table_name] ``` ### START REPLICATED SENDS @@ -279,7 +295,7 @@ SYSTEM STOP REPLICATED SENDS [[db.]replicated_merge_tree_family_table_name] Provides possibility to start background sends to other replicas in cluster for new inserted parts for tables in the `ReplicatedMergeTree` family: ``` sql -SYSTEM START REPLICATED SENDS [[db.]replicated_merge_tree_family_table_name] +SYSTEM START REPLICATED SENDS [ON CLUSTER cluster_name] [[db.]replicated_merge_tree_family_table_name] ``` ### STOP REPLICATION QUEUES @@ -287,7 +303,7 @@ SYSTEM START REPLICATED SENDS [[db.]replicated_merge_tree_family_table_name] Provides possibility to stop background fetch tasks from replication queues which stored in Zookeeper for tables in the `ReplicatedMergeTree` family. Possible background tasks types - merges, fetches, mutation, DDL statements with ON CLUSTER clause: ``` sql -SYSTEM STOP REPLICATION QUEUES [[db.]replicated_merge_tree_family_table_name] +SYSTEM STOP REPLICATION QUEUES [ON CLUSTER cluster_name] [[db.]replicated_merge_tree_family_table_name] ``` ### START REPLICATION QUEUES @@ -295,7 +311,7 @@ SYSTEM STOP REPLICATION QUEUES [[db.]replicated_merge_tree_family_table_name] Provides possibility to start background fetch tasks from replication queues which stored in Zookeeper for tables in the `ReplicatedMergeTree` family. Possible background tasks types - merges, fetches, mutation, DDL statements with ON CLUSTER clause: ``` sql -SYSTEM START REPLICATION QUEUES [[db.]replicated_merge_tree_family_table_name] +SYSTEM START REPLICATION QUEUES [ON CLUSTER cluster_name] [[db.]replicated_merge_tree_family_table_name] ``` ### SYNC REPLICA @@ -318,7 +334,7 @@ Provides possibility to reinitialize Zookeeper session's state for `ReplicatedMe Initialization of replication queue based on ZooKeeper data happens in the same way as for `ATTACH TABLE` statement. For a short time, the table will be unavailable for any operations. ``` sql -SYSTEM RESTART REPLICA [db.]replicated_merge_tree_family_table_name +SYSTEM RESTART REPLICA [ON CLUSTER cluster_name] [db.]replicated_merge_tree_family_table_name ``` ### RESTORE REPLICA @@ -384,7 +400,7 @@ Provides possibility to reinitialize Zookeeper sessions state for all `Replicate Allows to drop filesystem cache. ```sql -SYSTEM DROP FILESYSTEM CACHE +SYSTEM DROP FILESYSTEM CACHE [ON CLUSTER cluster_name] ``` ### SYNC FILE CACHE @@ -396,5 +412,5 @@ It's too heavy and has potential for misuse. Will do sync syscall. ```sql -SYSTEM SYNC FILE CACHE +SYSTEM SYNC FILE CACHE [ON CLUSTER cluster_name] ``` diff --git a/docs/ru/sql-reference/statements/system.md b/docs/ru/sql-reference/statements/system.md index 22a74648eab..ec30a031643 100644 --- a/docs/ru/sql-reference/statements/system.md +++ b/docs/ru/sql-reference/statements/system.md @@ -39,7 +39,7 @@ SELECT name, status FROM system.dictionaries; **Синтаксис** ```sql -SYSTEM RELOAD MODELS +SYSTEM RELOAD MODELS [ON CLUSTER cluster_name] ``` ## RELOAD MODEL {#query_language-system-reload-model} @@ -49,7 +49,7 @@ SYSTEM RELOAD MODELS **Синтаксис** ```sql -SYSTEM RELOAD MODEL +SYSTEM RELOAD MODEL [ON CLUSTER cluster_name] ``` ## RELOAD FUNCTIONS {#query_language-system-reload-functions} @@ -59,8 +59,8 @@ SYSTEM RELOAD MODEL **Синтаксис** ```sql -RELOAD FUNCTIONS -RELOAD FUNCTION function_name +RELOAD FUNCTIONS [ON CLUSTER cluster_name] +RELOAD FUNCTION function_name [ON CLUSTER cluster_name] ``` ## DROP DNS CACHE {#query_language-system-drop-dns-cache} @@ -106,10 +106,18 @@ Cкомпилированные выражения используются ко Записывает буферы логов в системные таблицы (например system.query_log). Позволяет не ждать 7.5 секунд при отладке. Если буфер логов пустой, то этот запрос просто создаст системные таблицы. +```sql +SYSTEM FLUSH LOGS [ON CLUSTER cluster_name] +``` + ## RELOAD CONFIG {#query_language-system-reload-config} Перечитывает конфигурацию настроек ClickHouse. Используется при хранении конфигурации в zookeeper. +```sql +SYSTEM RELOAD CONFIG [ON CLUSTER cluster_name] +``` + ## SHUTDOWN {#query_language-system-shutdown} Штатно завершает работу ClickHouse (аналог `service clickhouse-server stop` / `kill {$pid_clickhouse-server}`) @@ -127,7 +135,7 @@ ClickHouse может оперировать [распределёнными](.. Отключает фоновую отправку при вставке данных в распределённые таблицы. ``` sql -SYSTEM STOP DISTRIBUTED SENDS [db.] +SYSTEM STOP DISTRIBUTED SENDS [db.] [ON CLUSTER cluster_name] ``` ### FLUSH DISTRIBUTED {#query_language-system-flush-distributed} @@ -135,7 +143,7 @@ SYSTEM STOP DISTRIBUTED SENDS [db.] В синхронном режиме отправляет все данные на узлы кластера. Если какие-либо узлы недоступны, ClickHouse генерирует исключение и останавливает выполнение запроса. Такой запрос можно повторять до успешного завершения, что будет означать возвращение связанности с остальными узлами кластера. ``` sql -SYSTEM FLUSH DISTRIBUTED [db.] +SYSTEM FLUSH DISTRIBUTED [db.] [ON CLUSTER cluster_name] ``` ### START DISTRIBUTED SENDS {#query_language-system-start-distributed-sends} @@ -143,7 +151,7 @@ SYSTEM FLUSH DISTRIBUTED [db.] Включает фоновую отправку при вставке данных в распределенные таблицы. ``` sql -SYSTEM START DISTRIBUTED SENDS [db.] +SYSTEM START DISTRIBUTED SENDS [db.] [ON CLUSTER cluster_name] ``` ## Managing MergeTree Tables {#query-language-system-mergetree} @@ -155,7 +163,7 @@ ClickHouse может управлять фоновыми процессами Позволяет остановить фоновые мержи для таблиц семейства MergeTree: ``` sql -SYSTEM STOP MERGES [ON VOLUME | [db.]merge_tree_family_table_name] +SYSTEM STOP MERGES [ON CLUSTER cluster_name] [ON VOLUME | [db.]merge_tree_family_table_name] ``` :::note @@ -166,7 +174,7 @@ SYSTEM STOP MERGES [ON VOLUME | [db.]merge_tree_family_table_name] Включает фоновые мержи для таблиц семейства MergeTree: ``` sql -SYSTEM START MERGES [ON VOLUME | [db.]merge_tree_family_table_name] +SYSTEM START MERGES [ON CLUSTER cluster_name] [ON VOLUME | [db.]merge_tree_family_table_name] ``` ### STOP TTL MERGES {#query_language-stop-ttl-merges} @@ -175,7 +183,7 @@ SYSTEM START MERGES [ON VOLUME | [db.]merge_tree_family_table_name Возвращает `Ok.` даже если указана несуществующая таблица или таблица имеет тип отличный от MergeTree. Возвращает ошибку если указана не существующая база данных: ``` sql -SYSTEM STOP TTL MERGES [[db.]merge_tree_family_table_name] +SYSTEM STOP TTL MERGES [ON CLUSTER cluster_name] [[db.]merge_tree_family_table_name] ``` ### START TTL MERGES {#query_language-start-ttl-merges} @@ -184,7 +192,7 @@ SYSTEM STOP TTL MERGES [[db.]merge_tree_family_table_name] Возвращает `Ok.` даже если указана несуществующая таблица или таблица имеет тип отличный от MergeTree. Возвращает ошибку если указана не существующая база данных: ``` sql -SYSTEM START TTL MERGES [[db.]merge_tree_family_table_name] +SYSTEM START TTL MERGES [ON CLUSTER cluster_name] [[db.]merge_tree_family_table_name] ``` ### STOP MOVES {#query_language-stop-moves} @@ -193,7 +201,7 @@ SYSTEM START TTL MERGES [[db.]merge_tree_family_table_name] Возвращает `Ok.` даже если указана несуществующая таблица или таблица имеет тип отличный от MergeTree. Возвращает ошибку если указана не существующая база данных: ``` sql -SYSTEM STOP MOVES [[db.]merge_tree_family_table_name] +SYSTEM STOP MOVES [ON CLUSTER cluster_name] [[db.]merge_tree_family_table_name] ``` ### START MOVES {#query_language-start-moves} @@ -202,7 +210,7 @@ SYSTEM STOP MOVES [[db.]merge_tree_family_table_name] Возвращает `Ok.` даже если указана несуществующая таблица или таблица имеет тип отличный от MergeTree. Возвращает ошибку если указана не существующая база данных: ``` sql -SYSTEM START MOVES [[db.]merge_tree_family_table_name] +SYSTEM START MOVES [ON CLUSTER cluster_name] [[db.]merge_tree_family_table_name] ``` ### SYSTEM UNFREEZE {#query_language-system-unfreeze} @@ -223,7 +231,7 @@ ClickHouse может управлять фоновыми процессами Всегда возвращает `Ok.` вне зависимости от типа таблицы и даже если таблица или база данных не существет. ``` sql -SYSTEM STOP FETCHES [[db.]replicated_merge_tree_family_table_name] +SYSTEM STOP FETCHES [ON CLUSTER cluster_name] [[db.]replicated_merge_tree_family_table_name] ``` ### START FETCHES {#query_language-system-start-fetches} @@ -232,7 +240,7 @@ SYSTEM STOP FETCHES [[db.]replicated_merge_tree_family_table_name] Всегда возвращает `Ok.` вне зависимости от типа таблицы и даже если таблица или база данных не существет. ``` sql -SYSTEM START FETCHES [[db.]replicated_merge_tree_family_table_name] +SYSTEM START FETCHES [ON CLUSTER cluster_name] [[db.]replicated_merge_tree_family_table_name] ``` ### STOP REPLICATED SENDS {#query_language-system-start-replicated-sends} @@ -240,7 +248,7 @@ SYSTEM START FETCHES [[db.]replicated_merge_tree_family_table_name] Позволяет остановить фоновые процессы отсылки новых вставленных кусков данных другим репликам в кластере для таблиц семейства `ReplicatedMergeTree`: ``` sql -SYSTEM STOP REPLICATED SENDS [[db.]replicated_merge_tree_family_table_name] +SYSTEM STOP REPLICATED SENDS [ON CLUSTER cluster_name] [[db.]replicated_merge_tree_family_table_name] ``` ### START REPLICATED SENDS {#query_language-system-start-replicated-sends} @@ -248,7 +256,7 @@ SYSTEM STOP REPLICATED SENDS [[db.]replicated_merge_tree_family_table_name] Позволяет запустить фоновые процессы отсылки новых вставленных кусков данных другим репликам в кластере для таблиц семейства `ReplicatedMergeTree`: ``` sql -SYSTEM START REPLICATED SENDS [[db.]replicated_merge_tree_family_table_name] +SYSTEM START REPLICATED SENDS [ON CLUSTER cluster_name] [[db.]replicated_merge_tree_family_table_name] ``` ### STOP REPLICATION QUEUES {#query_language-system-stop-replication-queues} @@ -256,7 +264,7 @@ SYSTEM START REPLICATED SENDS [[db.]replicated_merge_tree_family_table_name] Останавливает фоновые процессы разбора заданий из очереди репликации которая хранится в Zookeeper для таблиц семейства `ReplicatedMergeTree`. Возможные типы заданий - merges, fetches, mutation, DDL запросы с ON CLUSTER: ``` sql -SYSTEM STOP REPLICATION QUEUES [[db.]replicated_merge_tree_family_table_name] +SYSTEM STOP REPLICATION QUEUES [ON CLUSTER cluster_name] [[db.]replicated_merge_tree_family_table_name] ``` ### START REPLICATION QUEUES {#query_language-system-start-replication-queues} @@ -264,7 +272,7 @@ SYSTEM STOP REPLICATION QUEUES [[db.]replicated_merge_tree_family_table_name] Запускает фоновые процессы разбора заданий из очереди репликации которая хранится в Zookeeper для таблиц семейства `ReplicatedMergeTree`. Возможные типы заданий - merges, fetches, mutation, DDL запросы с ON CLUSTER: ``` sql -SYSTEM START REPLICATION QUEUES [[db.]replicated_merge_tree_family_table_name] +SYSTEM START REPLICATION QUEUES [ON CLUSTER cluster_name] [[db.]replicated_merge_tree_family_table_name] ``` ### SYNC REPLICA {#query_language-system-sync-replica} @@ -287,7 +295,7 @@ SYSTEM SYNC REPLICA [db.]replicated_merge_tree_family_table_name [STRICT | LIGHT Инициализация очереди репликации на основе данных ZooKeeper происходит так же, как при `ATTACH TABLE`. Некоторое время таблица будет недоступна для любых операций. ``` sql -SYSTEM RESTART REPLICA [db.]replicated_merge_tree_family_table_name +SYSTEM RESTART REPLICA [ON CLUSTER cluster_name] [db.]replicated_merge_tree_family_table_name ``` ### RESTORE REPLICA {#query_language-system-restore-replica} diff --git a/docs/zh/sql-reference/statements/system.md b/docs/zh/sql-reference/statements/system.md index 8fd2dd74d26..b41b62d72c3 100644 --- a/docs/zh/sql-reference/statements/system.md +++ b/docs/zh/sql-reference/statements/system.md @@ -71,10 +71,18 @@ SYSTEM DROP REPLICA 'replica_name' FROM ZKPATH '/path/to/table/in/zk'; 将日志信息缓冲数据刷入系统表(例如system.query_log)。调试时允许等待不超过7.5秒。当信息队列为空时,会创建系统表。 +```sql +SYSTEM FLUSH LOGS [ON CLUSTER cluster_name] +``` + ## RELOAD CONFIG {#query_language-system-reload-config} 重新加载ClickHouse的配置。用于当配置信息存放在ZooKeeper时。 +```sql +SYSTEM RELOAD CONFIG [ON CLUSTER cluster_name] +``` + ## SHUTDOWN {#query_language-system-shutdown} 关闭ClickHouse服务(类似于 `service clickhouse-server stop` / `kill {$pid_clickhouse-server}`) @@ -93,7 +101,7 @@ ClickHouse可以管理 [distribute](../../engines/table-engines/special/distribu 当向分布式表插入数据时,禁用后台的分布式数据分发。 ``` sql -SYSTEM STOP DISTRIBUTED SENDS [db.] +SYSTEM STOP DISTRIBUTED SENDS [db.] [ON CLUSTER cluster_name] ``` ### FLUSH DISTRIBUTED {#query_language-system-flush-distributed} @@ -101,7 +109,7 @@ SYSTEM STOP DISTRIBUTED SENDS [db.] 强制让ClickHouse同步向集群节点同步发送数据。如果有节点失效,ClickHouse抛出异常并停止插入操作。当所有节点都恢复上线时,你可以重试之前的操作直到成功执行。 ``` sql -SYSTEM FLUSH DISTRIBUTED [db.] +SYSTEM FLUSH DISTRIBUTED [db.] [ON CLUSTER cluster_name] ``` ### START DISTRIBUTED SENDS {#query_language-system-start-distributed-sends} @@ -109,7 +117,7 @@ SYSTEM FLUSH DISTRIBUTED [db.] 当向分布式表插入数据时,允许后台的分布式数据分发。 ``` sql -SYSTEM START DISTRIBUTED SENDS [db.] +SYSTEM START DISTRIBUTED SENDS [db.] [ON CLUSTER cluster_name] ``` ## Managing MergeTree Tables {#query-language-system-mergetree} @@ -121,7 +129,7 @@ ClickHouse可以管理 [MergeTree](../../engines/table-engines/mergetree-family/ 为MergeTree系列引擎表停止后台合并操作。 ``` sql -SYSTEM STOP MERGES [[db.]merge_tree_family_table_name] +SYSTEM STOP MERGES [ON CLUSTER cluster_name] [ON VOLUME | [db.]merge_tree_family_table_name] ``` :::note @@ -133,7 +141,7 @@ SYSTEM STOP MERGES [[db.]merge_tree_family_table_name] 为MergeTree系列引擎表启动后台合并操作。 ``` sql -SYSTEM START MERGES [[db.]merge_tree_family_table_name] +SYSTEM START MERGES [ON CLUSTER cluster_name] [ON VOLUME | [db.]merge_tree_family_table_name] ``` ### STOP TTL MERGES {#query_language-stop-ttl-merges} @@ -142,7 +150,7 @@ SYSTEM START MERGES [[db.]merge_tree_family_table_name] 不管表存在与否,都返回 `OK.`。当数据库不存在时返回错误。 ``` sql -SYSTEM STOP TTL MERGES [[db.]merge_tree_family_table_name] +SYSTEM STOP TTL MERGES [ON CLUSTER cluster_name] [[db.]merge_tree_family_table_name] ``` ### START TTL MERGES {#query_language-start-ttl-merges} @@ -151,7 +159,7 @@ SYSTEM STOP TTL MERGES [[db.]merge_tree_family_table_name] ``` sql -SYSTEM START TTL MERGES [[db.]merge_tree_family_table_name] +SYSTEM START TTL MERGES [ON CLUSTER cluster_name] [[db.]merge_tree_family_table_name] ``` ### STOP MOVES {#query_language-stop-moves} @@ -160,7 +168,7 @@ SYSTEM START TTL MERGES [[db.]merge_tree_family_table_name] ``` sql -SYSTEM STOP MOVES [[db.]merge_tree_family_table_name] +SYSTEM STOP MOVES [ON CLUSTER cluster_name] [[db.]merge_tree_family_table_name] ``` ### START MOVES {#query_language-start-moves} @@ -169,7 +177,7 @@ SYSTEM STOP MOVES [[db.]merge_tree_family_table_name] ``` sql -SYSTEM STOP MOVES [[db.]merge_tree_family_table_name] +SYSTEM START MOVES [ON CLUSTER cluster_name] [[db.]merge_tree_family_table_name] ``` ### SYSTEM UNFREEZE {#query_language-system-unfreeze} @@ -190,7 +198,7 @@ SYSTEM UNFREEZE WITH NAME 不管表引擎类型如何或表/数据库是否存,都返回 `OK.`。 ``` sql -SYSTEM STOP FETCHES [[db.]replicated_merge_tree_family_table_name] +SYSTEM STOP FETCHES [ON CLUSTER cluster_name] [[db.]replicated_merge_tree_family_table_name] ``` ### START FETCHES {#query_language-system-start-fetches} @@ -199,7 +207,7 @@ SYSTEM STOP FETCHES [[db.]replicated_merge_tree_family_table_name] 不管表引擎类型如何或表/数据库是否存,都返回 `OK.`。 ``` sql -SYSTEM START FETCHES [[db.]replicated_merge_tree_family_table_name] +SYSTEM START FETCHES [ON CLUSTER cluster_name] [[db.]replicated_merge_tree_family_table_name] ``` ### STOP REPLICATED SENDS {#query_language-system-start-replicated-sends} @@ -207,7 +215,7 @@ SYSTEM START FETCHES [[db.]replicated_merge_tree_family_table_name] 停止通过后台分发 `ReplicatedMergeTree`系列引擎表中新插入的数据块到集群的其它副本节点。 ``` sql -SYSTEM STOP REPLICATED SENDS [[db.]replicated_merge_tree_family_table_name] +SYSTEM STOP REPLICATED SENDS [ON CLUSTER cluster_name] [[db.]replicated_merge_tree_family_table_name] ``` ### START REPLICATED SENDS {#query_language-system-start-replicated-sends} @@ -215,7 +223,7 @@ SYSTEM STOP REPLICATED SENDS [[db.]replicated_merge_tree_family_table_name] 启动通过后台分发 `ReplicatedMergeTree`系列引擎表中新插入的数据块到集群的其它副本节点。 ``` sql -SYSTEM START REPLICATED SENDS [[db.]replicated_merge_tree_family_table_name] +SYSTEM START REPLICATED SENDS [ON CLUSTER cluster_name] [[db.]replicated_merge_tree_family_table_name] ``` ### STOP REPLICATION QUEUES {#query_language-system-stop-replication-queues} @@ -224,7 +232,7 @@ SYSTEM START REPLICATED SENDS [[db.]replicated_merge_tree_family_table_name] 停止从Zookeeper中获取 `ReplicatedMergeTree`系列表的复制队列的后台任务。可能的后台任务类型包含:merges, fetches, mutation,带有 `ON CLUSTER`的ddl语句 ``` sql -SYSTEM STOP REPLICATION QUEUES [[db.]replicated_merge_tree_family_table_name] +SYSTEM STOP REPLICATION QUEUES [ON CLUSTER cluster_name] [[db.]replicated_merge_tree_family_table_name] ``` ### START REPLICATION QUEUES {#query_language-system-start-replication-queues} @@ -232,7 +240,7 @@ SYSTEM STOP REPLICATION QUEUES [[db.]replicated_merge_tree_family_table_name] 启动从Zookeeper中获取 `ReplicatedMergeTree`系列表的复制队列的后台任务。可能的后台任务类型包含:merges, fetches, mutation,带有 `ON CLUSTER`的ddl语句 ``` sql -SYSTEM START REPLICATION QUEUES [[db.]replicated_merge_tree_family_table_name] +SYSTEM START REPLICATION QUEUES [ON CLUSTER cluster_name] [[db.]replicated_merge_tree_family_table_name] ``` ### SYNC REPLICA {#query_language-system-sync-replica} @@ -250,7 +258,7 @@ SYSTEM SYNC REPLICA [db.]replicated_merge_tree_family_table_name [STRICT | LIGHT ``` sql -SYSTEM RESTART REPLICA [db.]replicated_merge_tree_family_table_name +SYSTEM RESTART REPLICA [ON CLUSTER cluster_name] [db.]replicated_merge_tree_family_table_name ``` ### RESTART REPLICAS {#query_language-system-restart-replicas} From 8e4a2a42fbab37b1d4511d45d6b01580172689db Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 2 Jun 2023 16:51:09 +0200 Subject: [PATCH 0324/1072] Some code --- src/Access/Common/AccessType.h | 1 + .../AzureBlobStorage/AzureObjectStorage.h | 12 +- src/Storages/StorageAzure.cpp | 518 ++++++++++++++++++ src/Storages/StorageAzure.h | 121 ++++ src/Storages/registerStorages.cpp | 12 + 5 files changed, 659 insertions(+), 5 deletions(-) create mode 100644 src/Storages/StorageAzure.cpp create mode 100644 src/Storages/StorageAzure.h diff --git a/src/Access/Common/AccessType.h b/src/Access/Common/AccessType.h index 6394c0279a7..84c99939f2d 100644 --- a/src/Access/Common/AccessType.h +++ b/src/Access/Common/AccessType.h @@ -210,6 +210,7 @@ enum class AccessType M(HDFS, "", GLOBAL, SOURCES) \ M(S3, "", GLOBAL, SOURCES) \ M(HIVE, "", GLOBAL, SOURCES) \ + M(AZURE, "", GLOBAL, SOURCES) \ M(SOURCES, "", GROUP, ALL) \ \ M(CLUSTER, "", GLOBAL, ALL) /* ON CLUSTER queries */ \ diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h index 0c2aecd5c62..1be1eb69799 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h @@ -37,11 +37,13 @@ struct AzureObjectStorageSettings { } - size_t max_single_part_upload_size; /// NOTE: on 32-bit machines it will be at most 4GB, but size_t is also used in BufferBase for offset - uint64_t min_bytes_for_seek; - size_t max_single_read_retries; - size_t max_single_download_retries; - int list_object_keys_size; + AzureObjectStorageSettings() = default; + + size_t max_single_part_upload_size = 100 * 1024 * 1024; /// NOTE: on 32-bit machines it will be at most 4GB, but size_t is also used in BufferBase for offset + uint64_t min_bytes_for_seek = 1024 * 1024; + size_t max_single_read_retries = 3; + size_t max_single_download_retries = 3; + int list_object_keys_size = 1000; }; using AzureClient = Azure::Storage::Blobs::BlobContainerClient; diff --git a/src/Storages/StorageAzure.cpp b/src/Storages/StorageAzure.cpp new file mode 100644 index 00000000000..30fd3fcbe95 --- /dev/null +++ b/src/Storages/StorageAzure.cpp @@ -0,0 +1,518 @@ +#include + + +#if USE_AZURE_BLOB_STORAGE +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +using namespace Azure::Storage::Blobs; + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int BAD_ARGUMENTS; + extern const int DATABASE_ACCESS_DENIED; +} + +bool isConnectionString(const std::string & candidate) +{ + return candidate.starts_with("DefaultEndpointsProtocol"); +} + +StorageAzure::Configuration StorageAzure::getConfiguration(ASTs & engine_args, ContextPtr local_context, bool get_format_from_file) +{ + StorageAzure::Configuration configuration; + + /// Supported signatures: + /// + /// Azure(connection_string|storage_account_url, container_name, blobpath, [account_name, account_key, format, compression]) + + if (engine_args.size() < 3 || engine_args.size() > 7) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Storage Azure requires 3 to 7 arguments: " + "Azure(connection_string|storage_account_url, container_name, blobpath, [account_name, account_key, format, compression])"); + + for (auto & engine_arg : engine_args) + engine_arg = evaluateConstantExpressionOrIdentifierAsLiteral(engine_arg, local_context); + + std::unordered_map engine_args_to_idx; + + configuration.connection_url = checkAndGetLiteralArgument(engine_args[0], "connection_string/storage_account_url"); + configuration.is_connection_string = isConnectionString(configuration.connection_url); + + configuration.container = checkAndGetLiteralArgument(engine_args[1], "container"); + configuration.blob_path = checkAndGetLiteralArgument(engine_args[2], "blobpath"); + + auto is_format_arg = [] (const std::string & s) -> bool + { + return s == "auto" || FormatFactory::instance().getAllFormats().contains(s); + }; + + if (engine_args.size() == 4) + { + auto fourth_arg = checkAndGetLiteralArgument(engine_args[3], "format/account_name"); + if (is_format_arg(fourth_arg)) + { + configuration.format = fourth_arg; + } + else + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown format or account name specified without account key"); + } + } + else if (engine_args.size() == 5) + { + auto fourth_arg = checkAndGetLiteralArgument(engine_args[3], "format/account_name"); + if (is_format_arg(fourth_arg)) + { + configuration.format = fourth_arg; + configuration.compression_method = checkAndGetLiteralArgument(engine_args[4], "compression"); + } + else + { + configuration.account_name = fourth_arg; + configuration.account_key = checkAndGetLiteralArgument(engine_args[4], "account_key"); + } + } + else if (engine_args.size() == 6) + { + auto fourth_arg = checkAndGetLiteralArgument(engine_args[3], "format/account_name"); + if (fourth_arg == "auto" || FormatFactory::instance().getAllFormats().contains(fourth_arg)) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Format and compression must be last arguments"); + } + else + { + configuration.account_name = fourth_arg; + configuration.account_key = checkAndGetLiteralArgument(engine_args[4], "account_key"); + auto sixth_arg = checkAndGetLiteralArgument(engine_args[5], "format/account_name"); + if (!is_format_arg(sixth_arg)) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown format {}", sixth_arg); + configuration.format = sixth_arg; + } + } + else if (engine_args.size() == 7) + { + auto fourth_arg = checkAndGetLiteralArgument(engine_args[3], "format/account_name"); + if (fourth_arg == "auto" || FormatFactory::instance().getAllFormats().contains(fourth_arg)) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Format and compression must be last arguments"); + } + else + { + configuration.account_name = fourth_arg; + configuration.account_key = checkAndGetLiteralArgument(engine_args[4], "account_key"); + auto sixth_arg = checkAndGetLiteralArgument(engine_args[5], "format/account_name"); + if (!is_format_arg(sixth_arg)) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown format {}", sixth_arg); + configuration.format = sixth_arg; + configuration.compression_method = checkAndGetLiteralArgument(engine_args[6], "compression"); + } + } + + configuration.blobs_paths = {configuration.blob_path}; + + if (configuration.format == "auto" && get_format_from_file) + configuration.format = FormatFactory::instance().getFormatFromFileName(configuration.blob_path, true); + + return configuration; +} + + +void registerStorageAzure(StorageFactory & factory) +{ + factory.registerStorage("Azure", [](const StorageFactory::Arguments & args) + { + auto & engine_args = args.engine_args; + if (engine_args.empty()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "External data source must have arguments"); + + auto configuration = StorageAzure::getConfiguration(engine_args, args.getLocalContext()); + auto client = StorageAzure::createClient(configuration); + // Use format settings from global server context + settings from + // the SETTINGS clause of the create query. Settings from current + // session and user are ignored. + std::optional format_settings; + if (args.storage_def->settings) + { + FormatFactorySettings user_format_settings; + + // Apply changed settings from global context, but ignore the + // unknown ones, because we only have the format settings here. + const auto & changes = args.getContext()->getSettingsRef().changes(); + for (const auto & change : changes) + { + if (user_format_settings.has(change.name)) + user_format_settings.set(change.name, change.value); + } + + // Apply changes from SETTINGS clause, with validation. + user_format_settings.applyChanges(args.storage_def->settings->changes); + format_settings = getFormatSettings(args.getContext(), user_format_settings); + } + else + { + format_settings = getFormatSettings(args.getContext()); + } + + ASTPtr partition_by; + if (args.storage_def->partition_by) + partition_by = args.storage_def->partition_by->clone(); + + return std::make_shared( + std::move(configuration), + std::make_unique("AzureStorage", std::move(client), std::make_unique()), + args.getContext(), + args.table_id, + args.columns, + args.constraints, + args.comment, + format_settings, + partition_by); + }, + { + .supports_settings = true, + .supports_sort_order = true, // for partition by + .supports_schema_inference = true, + .source_access_type = AccessType::AZURE, + }); +} + +AzureClientPtr StorageAzure::createClient(StorageAzure::Configuration configuration) +{ + AzureClientPtr result; + + if (configuration.is_connection_string) + { + result = std::make_unique(BlobContainerClient::CreateFromConnectionString(configuration.connection_url, configuration.container)); + } + else + { + if (configuration.account_name.has_value() && configuration.account_key.has_value()) + { + auto storage_shared_key_credential = std::make_shared(*configuration.account_name, *configuration.account_key); + result = std::make_unique(configuration.connection_url, storage_shared_key_credential); + } + + auto managed_identity_credential = std::make_shared(); + + result = std::make_unique(configuration.connection_url, managed_identity_credential); + } + + return result; +} + +StorageAzure::StorageAzure( + const Configuration & configuration_, + std::unique_ptr && object_storage_, + ContextPtr context_, + const StorageID & table_id_, + const ColumnsDescription & columns_, + const ConstraintsDescription & constraints_, + const String & comment, + std::optional format_settings_, + ASTPtr partition_by_) + : IStorage(table_id_) + , name("AzureBlobStorage") + , configuration(configuration_) + , object_storage(std::move(object_storage_)) + , distributed_processing(false) + , format_settings(format_settings_) + , partition_by(partition_by_) +{ + FormatFactory::instance().checkFormatName(configuration.format); + context_->getGlobalContext()->getRemoteHostFilter().checkURL(Poco::URI(configuration.getConnectionURL())); + + StorageInMemoryMetadata storage_metadata; + if (columns_.empty()) + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Schema inference is not supported yet"); + //auto columns = getTableStructureFromDataImpl(configuration, format_settings, context_); + //storage_metadata.setColumns(columns); + } + else + storage_metadata.setColumns(columns_); + + storage_metadata.setConstraints(constraints_); + storage_metadata.setComment(comment); + setInMemoryMetadata(storage_metadata); + + auto default_virtuals = NamesAndTypesList{ + {"_path", std::make_shared(std::make_shared())}, + {"_file", std::make_shared(std::make_shared())}}; + + auto columns = storage_metadata.getSampleBlock().getNamesAndTypesList(); + virtual_columns = getVirtualsForStorage(columns, default_virtuals); + for (const auto & column : virtual_columns) + virtual_block.insert({column.type->createColumn(), column.type, column.name}); +} + +void StorageAzure::truncate(const ASTPtr &, const StorageMetadataPtr &, ContextPtr, TableExclusiveLockHolder &) +{ + + if (configuration.withGlobs()) + { + throw Exception( + ErrorCodes::DATABASE_ACCESS_DENIED, + "S3 key '{}' contains globs, so the table is in readonly mode", + configuration.blob_path); + } + + StoredObjects objects; + for (const auto & key : configuration.blobs_paths) + objects.emplace_back(key); + + object_storage->removeObjects(objects); +} + +namespace +{ + +class StorageAzureSink : public SinkToStorage +{ +public: + StorageAzureSink( + const String & format, + const Block & sample_block_, + ContextPtr context, + std::optional format_settings_, + const CompressionMethod compression_method, + AzureObjectStorage * object_storage, + const String & blob_path) + : SinkToStorage(sample_block_) + , sample_block(sample_block_) + , format_settings(format_settings_) + { + StoredObject object(blob_path); + write_buf = wrapWriteBufferWithCompressionMethod(object_storage->writeObject(object, WriteMode::Rewrite), compression_method, 3); + writer = FormatFactory::instance().getOutputFormatParallelIfPossible(format, *write_buf, sample_block, context, format_settings); + } + + String getName() const override { return "StorageS3Sink"; } + + void consume(Chunk chunk) override + { + std::lock_guard lock(cancel_mutex); + if (cancelled) + return; + writer->write(getHeader().cloneWithColumns(chunk.detachColumns())); + } + + void onCancel() override + { + std::lock_guard lock(cancel_mutex); + finalize(); + cancelled = true; + } + + void onException() override + { + std::lock_guard lock(cancel_mutex); + finalize(); + } + + void onFinish() override + { + std::lock_guard lock(cancel_mutex); + finalize(); + } + +private: + void finalize() + { + if (!writer) + return; + + try + { + writer->finalize(); + writer->flush(); + write_buf->finalize(); + } + catch (...) + { + /// Stop ParallelFormattingOutputFormat correctly. + writer.reset(); + write_buf->finalize(); + throw; + } + } + + Block sample_block; + std::optional format_settings; + std::unique_ptr write_buf; + OutputFormatPtr writer; + bool cancelled = false; + std::mutex cancel_mutex; +}; + +class PartitionedStorageAzureSink : public PartitionedSink +{ +public: + PartitionedStorageAzureSink( + const ASTPtr & partition_by, + const String & format_, + const Block & sample_block_, + ContextPtr context_, + std::optional format_settings_, + const CompressionMethod compression_method_, + AzureObjectStorage * object_storage_, + const String & blob_) + : PartitionedSink(partition_by, context_, sample_block_) + , format(format_) + , sample_block(sample_block_) + , context(context_) + , compression_method(compression_method_) + , object_storage(object_storage_) + , blob(blob_) + , format_settings(format_settings_) + { + } + + SinkPtr createSinkForPartition(const String & partition_id) override + { + auto partition_key = replaceWildcards(blob, partition_id); + validateKey(partition_key); + + return std::make_shared( + format, + sample_block, + context, + format_settings, + compression_method, + object_storage, + partition_key + ); + } + +private: + const String format; + const Block sample_block; + const ContextPtr context; + const CompressionMethod compression_method; + AzureObjectStorage * object_storage; + const String blob; + const std::optional format_settings; + + ExpressionActionsPtr partition_by_expr; + + static void validateKey(const String & str) + { + validatePartitionKey(str, true); + } +}; + +} + +SinkToStoragePtr StorageAzure::write(const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, ContextPtr local_context) +{ + auto sample_block = metadata_snapshot->getSampleBlock(); + auto chosen_compression_method = chooseCompressionMethod(configuration.blobs_paths.back(), configuration.compression_method); + auto insert_query = std::dynamic_pointer_cast(query); + + auto partition_by_ast = insert_query ? (insert_query->partition_by ? insert_query->partition_by : partition_by) : nullptr; + bool is_partitioned_implementation = partition_by_ast && configuration.withWildcard(); + + if (is_partitioned_implementation) + { + return std::make_shared( + partition_by_ast, + configuration.format, + sample_block, + local_context, + format_settings, + chosen_compression_method, + object_storage.get(), + configuration.blobs_paths.back()); + } + else + { + if (configuration.withGlobs()) + throw Exception(ErrorCodes::DATABASE_ACCESS_DENIED, + "Azure key '{}' contains globs, so the table is in readonly mode", configuration.blob_path); + + bool truncate_in_insert = local_context->getSettingsRef().s3_truncate_on_insert; + + if (!truncate_in_insert && object_storage->exists(StoredObject(configuration.blob_path))) + { + if (local_context->getSettingsRef().s3_create_new_file_on_insert) + { + size_t index = configuration.blobs_paths.size(); + const auto & first_key = configuration.blobs_paths[0]; + auto pos = first_key.find_first_of('.'); + String new_key; + do + { + new_key = first_key.substr(0, pos) + "." + std::to_string(index) + (pos == std::string::npos ? "" : first_key.substr(pos)); + ++index; + } + while (object_storage->exists(StoredObject(new_key))); + + configuration.blobs_paths.push_back(new_key); + } + else + { + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "Object in bucket {} with key {} already exists. " + "If you want to overwrite it, enable setting s3_truncate_on_insert, if you " + "want to create a new file on each insert, enable setting s3_create_new_file_on_insert", + configuration.container, configuration.blobs_paths.back()); + } + } + + return std::make_shared( + configuration.format, + sample_block, + local_context, + format_settings, + chosen_compression_method, + object_storage.get(), + configuration.blobs_paths.back()); + } +} + +NamesAndTypesList StorageAzure::getVirtuals() const +{ + return virtual_columns; +} + +bool StorageAzure::supportsPartitionBy() const +{ + return true; +} + +} + +#endif diff --git a/src/Storages/StorageAzure.h b/src/Storages/StorageAzure.h new file mode 100644 index 00000000000..b99df2e89a5 --- /dev/null +++ b/src/Storages/StorageAzure.h @@ -0,0 +1,121 @@ +#pragma once + +#include "config.h" + +#if USE_AZURE_BLOB_STORAGE + +#include +#include +#include +#include +#include + +namespace DB +{ + +struct AzureSimpleAccountConfiguration +{ + std::string storage_account_url; +}; + +using AzureConnectionString = std::string; + +using AzureCredentials = std::variant; + +class StorageAzure : public IStorage +{ +public: + using AzureClient = Azure::Storage::Blobs::BlobContainerClient; + using AzureClientPtr = std::unique_ptr; + + struct Configuration : public StatelessTableEngineConfiguration + { + Configuration() = default; + + String getPath() const { return blob_path; } + + bool update(ContextPtr context); + + void connect(ContextPtr context); + + bool withGlobs() const { return blob_path.find_first_of("*?{") != std::string::npos; } + + bool withWildcard() const + { + static const String PARTITION_ID_WILDCARD = "{_partition_id}"; + return blobs_paths.back().find(PARTITION_ID_WILDCARD) != String::npos; + } + + std::string getConnectionURL() const + { + if (!is_connection_string) + return connection_url; + + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Connection string not implemented yet"); + } + + std::string connection_url; + bool is_connection_string; + + std::optional account_name; + std::optional account_key; + + std::string container; + std::string blob_path; + std::vector blobs_paths; + }; + + StorageAzure( + const Configuration & configuration_, + std::unique_ptr && object_storage_, + ContextPtr context_, + const StorageID & table_id_, + const ColumnsDescription & columns_, + const ConstraintsDescription & constraints_, + const String & comment, + std::optional format_settings_, + ASTPtr partition_by_); + + static StorageAzure::Configuration getConfiguration(ASTs & engine_args, ContextPtr local_context, bool get_format_from_file = true); + static AzureClientPtr createClient(StorageAzure::Configuration configuration); + + String getName() const override + { + return name; + } + + Pipe read( + const Names & column_names, + const StorageSnapshotPtr & storage_snapshot, + SelectQueryInfo & query_info, + ContextPtr context, + QueryProcessingStage::Enum processed_stage, + size_t max_block_size, + size_t num_streams) override; + + SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & /* metadata_snapshot */, ContextPtr context) override; + + void truncate(const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, ContextPtr local_context, TableExclusiveLockHolder &) override; + + NamesAndTypesList getVirtuals() const override; + + bool supportsPartitionBy() const override; + + static SchemaCache & getSchemaCache(const ContextPtr & ctx); + +private: + std::string name; + Configuration configuration; + std::unique_ptr object_storage; + NamesAndTypesList virtual_columns; + Block virtual_block; + + const bool distributed_processing; + std::optional format_settings; + ASTPtr partition_by; + +}; + +} + +#endif diff --git a/src/Storages/registerStorages.cpp b/src/Storages/registerStorages.cpp index 8be176a5375..03bd4dbb310 100644 --- a/src/Storages/registerStorages.cpp +++ b/src/Storages/registerStorages.cpp @@ -94,8 +94,16 @@ void registerStorageFileLog(StorageFactory & factory); void registerStorageSQLite(StorageFactory & factory); #endif + + void registerStorageKeeperMap(StorageFactory & factory); +#if USE_AZURE_BLOB_STORAGE + +void registerStorageAzure(StorageFactory & factory); + +#endif + void registerStorages() { auto & factory = StorageFactory::instance(); @@ -191,6 +199,10 @@ void registerStorages() #endif registerStorageKeeperMap(factory); + + #if USE_AZURE_BLOB_STORAGE + registerStorageAzure(factory); + #endif } } From 897325967841cbd24de32f3a136ceb26385b75b9 Mon Sep 17 00:00:00 2001 From: avogar Date: Fri, 2 Jun 2023 15:00:24 +0000 Subject: [PATCH 0325/1072] Test attach gdb in stateless tests --- docker/test/stateless/run.sh | 39 ++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/docker/test/stateless/run.sh b/docker/test/stateless/run.sh index 5d0a7b50741..dfee7d84cde 100755 --- a/docker/test/stateless/run.sh +++ b/docker/test/stateless/run.sh @@ -85,6 +85,45 @@ fi sleep 5 +# Set follow-fork-mode to parent, because we attach to clickhouse-server, not to watchdog +# and clickhouse-server can do fork-exec, for example, to run some bridge. +# Do not set nostop noprint for all signals, because some it may cause gdb to hang, +# explicitly ignore non-fatal signals that are used by server. +# Number of SIGRTMIN can be determined only in runtime. +RTMIN=$(kill -l SIGRTMIN) +echo " +set follow-fork-mode parent +handle SIGHUP nostop noprint pass +handle SIGINT nostop noprint pass +handle SIGQUIT nostop noprint pass +handle SIGPIPE nostop noprint pass +handle SIGTERM nostop noprint pass +handle SIGUSR1 nostop noprint pass +handle SIGUSR2 nostop noprint pass +handle SIG$RTMIN nostop noprint pass +info signals +continue +backtrace full +thread apply all backtrace full +info registers +disassemble /s +up +disassemble /s +up +disassemble /s +p \"done\" +detach +quit +" > script.gdb + +# FIXME Hung check may work incorrectly because of attached gdb +# 1. False positives are possible +# 2. We cannot attach another gdb to get stacktraces if some queries hung +gdb -batch -command script.gdb -p "$(cat /var/run/clickhouse-server/clickhouse-server.pid)" | ts '%Y-%m-%d %H:%M:%S' >> /test_output/gdb.log & +sleep 5 +# gdb will send SIGSTOP, spend some time loading debug info and then send SIGCONT, wait for it (up to send_timeout, 300s) +time clickhouse-client --query "SELECT 'Connected to clickhouse-server after attaching gdb'" ||: + function run_tests() { set -x From 963641b20ffd09c01107fe8b210d38218ca85161 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Fri, 2 Jun 2023 18:24:56 +0300 Subject: [PATCH 0326/1072] disable 00534_functions_bad_arguments with msan (#50481) --- tests/queries/0_stateless/00534_functions_bad_arguments10.sh | 2 +- tests/queries/0_stateless/00534_functions_bad_arguments5.sh | 2 +- tests/queries/0_stateless/00534_functions_bad_arguments6.sh | 2 +- tests/queries/0_stateless/00534_functions_bad_arguments9.sh | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/queries/0_stateless/00534_functions_bad_arguments10.sh b/tests/queries/0_stateless/00534_functions_bad_arguments10.sh index 8525b63a989..b9733f92812 100755 --- a/tests/queries/0_stateless/00534_functions_bad_arguments10.sh +++ b/tests/queries/0_stateless/00534_functions_bad_arguments10.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Tags: no-tsan, no-debug +# Tags: no-tsan, no-debug, no-msan # Tag no-tsan: Too long for TSan # shellcheck disable=SC2016 diff --git a/tests/queries/0_stateless/00534_functions_bad_arguments5.sh b/tests/queries/0_stateless/00534_functions_bad_arguments5.sh index a8b0ce77677..812ba9f97fa 100755 --- a/tests/queries/0_stateless/00534_functions_bad_arguments5.sh +++ b/tests/queries/0_stateless/00534_functions_bad_arguments5.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Tags: no-tsan, no-debug, no-fasttest +# Tags: no-tsan, no-debug, no-fasttest, no-msan # Tag no-tsan: Too long for TSan # shellcheck disable=SC2016 diff --git a/tests/queries/0_stateless/00534_functions_bad_arguments6.sh b/tests/queries/0_stateless/00534_functions_bad_arguments6.sh index b0080c3b418..6626a6dfe55 100755 --- a/tests/queries/0_stateless/00534_functions_bad_arguments6.sh +++ b/tests/queries/0_stateless/00534_functions_bad_arguments6.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Tags: no-tsan, no-debug +# Tags: no-tsan, no-debug, no-msan # Tag no-tsan: Too long for TSan # shellcheck disable=SC2016 diff --git a/tests/queries/0_stateless/00534_functions_bad_arguments9.sh b/tests/queries/0_stateless/00534_functions_bad_arguments9.sh index 2975643020b..c7659db8621 100755 --- a/tests/queries/0_stateless/00534_functions_bad_arguments9.sh +++ b/tests/queries/0_stateless/00534_functions_bad_arguments9.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Tags: no-tsan, no-debug +# Tags: no-tsan, no-debug, no-msan # Tag no-tsan: Too long for TSan # shellcheck disable=SC2016 From a0901b1d1cb938759e6bcca37d0b03df0c1929e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A1nos=20Benjamin=20Antal?= Date: Fri, 2 Jun 2023 15:33:38 +0000 Subject: [PATCH 0327/1072] Add tests --- .../02771_system_user_processes.reference | 6 ++++++ .../0_stateless/02771_system_user_processes.sh | 15 +++++++++++++++ 2 files changed, 21 insertions(+) create mode 100644 tests/queries/0_stateless/02771_system_user_processes.reference create mode 100755 tests/queries/0_stateless/02771_system_user_processes.sh diff --git a/tests/queries/0_stateless/02771_system_user_processes.reference b/tests/queries/0_stateless/02771_system_user_processes.reference new file mode 100644 index 00000000000..ab0ff41ddc5 --- /dev/null +++ b/tests/queries/0_stateless/02771_system_user_processes.reference @@ -0,0 +1,6 @@ +0 +0 +default +test_user_02771 +default true true +test_user_02771 2 2 diff --git a/tests/queries/0_stateless/02771_system_user_processes.sh b/tests/queries/0_stateless/02771_system_user_processes.sh new file mode 100755 index 00000000000..e8bf88a9fb2 --- /dev/null +++ b/tests/queries/0_stateless/02771_system_user_processes.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +$CLICKHOUSE_CLIENT -q "DROP USER IF EXISTS test_user_02771" +$CLICKHOUSE_CLIENT -q "CREATE USER test_user_02771" +$CLICKHOUSE_CLIENT -u test_user_02771 -q "SELECT * FROM system.numbers LIMIT 1" +$CLICKHOUSE_CLIENT -u test_user_02771 -q "SELECT * FROM system.numbers LIMIT 1" +$CLICKHOUSE_CLIENT -q "SELECT user FROM system.user_processes" +$CLICKHOUSE_CLIENT -q "SELECT user, toBool(ProfileEvents['SelectQuery'] > 0), toBool(ProfileEvents['Query'] > 0) FROM system.user_processes WHERE user='default'" +$CLICKHOUSE_CLIENT -q "SELECT user, ProfileEvents['SelectQuery'], ProfileEvents['Query'] FROM system.user_processes WHERE user='test_user_02771'" +$CLICKHOUSE_CLIENT -q "DROP USER test_user_02771" + From 4209ccfc088f832c49d7c530b517e73a2661504a Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Fri, 2 Jun 2023 17:42:04 +0200 Subject: [PATCH 0328/1072] fix --- .../MergeTree/ReplicatedMergeTreeSink.h | 1 + src/Storages/StorageReplicatedMergeTree.cpp | 53 +++++++++++-------- tests/config/config.d/merge_tree.xml | 2 + tests/integration/test_ttl_replicated/test.py | 6 +-- 4 files changed, 37 insertions(+), 25 deletions(-) diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeSink.h b/src/Storages/MergeTree/ReplicatedMergeTreeSink.h index 3efd364fc9c..8d9e2e14129 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeSink.h +++ b/src/Storages/MergeTree/ReplicatedMergeTreeSink.h @@ -87,6 +87,7 @@ private: size_t checkQuorumPrecondition(const ZooKeeperWithFaultInjectionPtr & zookeeper); /// Rename temporary part and commit to ZooKeeper. + /// Returns a list of conflicting async blocks and true if the whole parts was deduplicated std::pair, bool> commitPart( const ZooKeeperWithFaultInjectionPtr & zookeeper, MergeTreeData::MutableDataPartPtr & part, diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 44403fc708b..0a61369e163 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -1936,6 +1936,30 @@ MutableDataPartStoragePtr StorageReplicatedMergeTree::executeFetchShared( } } +static void paranoidCheckForCoveredPartsInZooKeeper(const ZooKeeperPtr & zookeeper, const String & replica_path, + MergeTreeDataFormatVersion format_version, const String & covering_part_name) +{ +#ifdef ABORT_ON_LOGICAL_ERROR + constexpr bool paranoid_check_for_covered_parts_default = true; +#else + constexpr bool paranoid_check_for_covered_parts_default = false; +#endif + + bool paranoid_check_for_covered_parts = Context::getGlobalContextInstance()->getConfigRef().getBool( + "replicated_merge_tree_paranoid_check_on_drop_range", paranoid_check_for_covered_parts_default); + if (paranoid_check_for_covered_parts) + { + auto drop_range_info = MergeTreePartInfo::fromPartName(covering_part_name, format_version); + Strings parts_remain = zookeeper->getChildren(replica_path + "/parts"); + for (const auto & part_name : parts_remain) + { + auto part_info = MergeTreePartInfo::fromPartName(part_name, format_version); + if (drop_range_info.contains(part_info)) + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Part {} remains in ZooKeeper after DROP_RANGE {}", part_name, covering_part_name); + } + } +} void StorageReplicatedMergeTree::executeDropRange(const LogEntry & entry) { @@ -1992,16 +2016,7 @@ void StorageReplicatedMergeTree::executeDropRange(const LogEntry & entry) /// Forcibly remove parts from ZooKeeper removePartsFromZooKeeperWithRetries(parts_to_remove); - -#ifdef ABORT_ON_LOGICAL_ERROR - Strings parts_remain = getZooKeeper()->getChildren(replica_path + "/parts"); - for (const auto & part_name : parts_remain) - { - auto part_info = MergeTreePartInfo::fromPartName(part_name, format_version); - if (drop_range_info.contains(part_info)) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Part {} remains in ZooKeeper after DROP_RANGE {}", part_name, entry.new_part_name); - } -#endif + paranoidCheckForCoveredPartsInZooKeeper(getZooKeeper(), replica_path, format_version, entry.new_part_name); if (entry.detach) LOG_DEBUG(log, "Detached {} parts inside {}.", parts_to_remove.size(), entry.new_part_name); @@ -2137,6 +2152,8 @@ bool StorageReplicatedMergeTree::executeReplaceRange(const LogEntry & entry) { LOG_INFO(log, "All parts from REPLACE PARTITION command have been already attached"); removePartsFromZooKeeperWithRetries(parts_to_remove); + if (replace) + paranoidCheckForCoveredPartsInZooKeeper(getZooKeeper(), replica_path, format_version, entry_replace.drop_range_part_name); return true; } @@ -2433,6 +2450,8 @@ bool StorageReplicatedMergeTree::executeReplaceRange(const LogEntry & entry) } removePartsFromZooKeeperWithRetries(parts_to_remove); + if (replace) + paranoidCheckForCoveredPartsInZooKeeper(getZooKeeper(), replica_path, format_version, entry_replace.drop_range_part_name); res_parts.clear(); parts_to_remove.clear(); cleanup_thread.wakeup(); @@ -7133,7 +7152,6 @@ void StorageReplicatedMergeTree::replacePartitionFrom( clearBlocksInPartition(*zookeeper, drop_range.partition_id, drop_range.max_block, drop_range.max_block); } - PartsToRemoveFromZooKeeper parts_to_remove; Coordination::Responses op_results; try @@ -7184,7 +7202,7 @@ void StorageReplicatedMergeTree::replacePartitionFrom( auto data_parts_lock = lockParts(); transaction.commit(&data_parts_lock); if (replace) - parts_to_remove = removePartsInRangeFromWorkingSetAndGetPartsToRemoveFromZooKeeper(NO_TRANSACTION_RAW, drop_range, data_parts_lock); + removePartsInRangeFromWorkingSet(NO_TRANSACTION_RAW, drop_range, data_parts_lock); } PartLog::addNewParts(getContext(), PartLog::createPartLogEntries(dst_parts, watch.elapsed(), profile_events_scope.getSnapshot())); @@ -7204,11 +7222,6 @@ void StorageReplicatedMergeTree::replacePartitionFrom( for (auto & lock : ephemeral_locks) lock.assumeUnlocked(); - /// Forcibly remove replaced parts from ZooKeeper - removePartsFromZooKeeperWithRetries(parts_to_remove); - - /// Speedup removing of replaced parts from filesystem - parts_to_remove.clear(); cleanup_thread.wakeup(); lock2.reset(); @@ -7377,7 +7390,6 @@ void StorageReplicatedMergeTree::movePartitionToTable(const StoragePtr & dest_ta clearBlocksInPartition(*zookeeper, drop_range.partition_id, drop_range.max_block, drop_range.max_block); - PartsToRemoveFromZooKeeper parts_to_remove; Coordination::Responses op_results; try @@ -7414,7 +7426,7 @@ void StorageReplicatedMergeTree::movePartitionToTable(const StoragePtr & dest_ta else zkutil::KeeperMultiException::check(code, ops, op_results); - parts_to_remove = removePartsInRangeFromWorkingSetAndGetPartsToRemoveFromZooKeeper(NO_TRANSACTION_RAW, drop_range, src_data_parts_lock); + removePartsInRangeFromWorkingSet(NO_TRANSACTION_RAW, drop_range, src_data_parts_lock); transaction.commit(&src_data_parts_lock); } @@ -7436,9 +7448,6 @@ void StorageReplicatedMergeTree::movePartitionToTable(const StoragePtr & dest_ta for (auto & lock : ephemeral_locks) lock.assumeUnlocked(); - removePartsFromZooKeeperWithRetries(parts_to_remove); - - parts_to_remove.clear(); cleanup_thread.wakeup(); lock2.reset(); diff --git a/tests/config/config.d/merge_tree.xml b/tests/config/config.d/merge_tree.xml index bf2da9b09a2..bee9812274c 100644 --- a/tests/config/config.d/merge_tree.xml +++ b/tests/config/config.d/merge_tree.xml @@ -3,4 +3,6 @@ 1 8 + + 1 diff --git a/tests/integration/test_ttl_replicated/test.py b/tests/integration/test_ttl_replicated/test.py index a3e7d6e4b8b..39d66d857ff 100644 --- a/tests/integration/test_ttl_replicated/test.py +++ b/tests/integration/test_ttl_replicated/test.py @@ -517,7 +517,7 @@ def test_ttl_compatibility(started_cluster, node_left, node_right, num_run): ENGINE = ReplicatedMergeTree('/clickhouse/tables/test/test_ttl_delete_{suff}', '{replica}') ORDER BY id PARTITION BY toDayOfMonth(date) TTL date + INTERVAL 3 SECOND - SETTINGS max_number_of_merges_with_ttl_in_pool=100, max_replicated_merges_with_ttl_in_queue=100 + SETTINGS max_number_of_merges_with_ttl_in_pool=100, max_replicated_merges_with_ttl_in_queue=100, remove_empty_parts=0 """.format( suff=num_run, replica=node.name ) @@ -529,7 +529,7 @@ def test_ttl_compatibility(started_cluster, node_left, node_right, num_run): ENGINE = ReplicatedMergeTree('/clickhouse/tables/test/test_ttl_group_by_{suff}', '{replica}') ORDER BY id PARTITION BY toDayOfMonth(date) TTL date + INTERVAL 3 SECOND GROUP BY id SET val = sum(val) - SETTINGS max_number_of_merges_with_ttl_in_pool=100, max_replicated_merges_with_ttl_in_queue=100 + SETTINGS max_number_of_merges_with_ttl_in_pool=100, max_replicated_merges_with_ttl_in_queue=100, remove_empty_parts=0 """.format( suff=num_run, replica=node.name ) @@ -541,7 +541,7 @@ def test_ttl_compatibility(started_cluster, node_left, node_right, num_run): ENGINE = ReplicatedMergeTree('/clickhouse/tables/test/test_ttl_where_{suff}', '{replica}') ORDER BY id PARTITION BY toDayOfMonth(date) TTL date + INTERVAL 3 SECOND DELETE WHERE id % 2 = 1 - SETTINGS max_number_of_merges_with_ttl_in_pool=100, max_replicated_merges_with_ttl_in_queue=100 + SETTINGS max_number_of_merges_with_ttl_in_pool=100, max_replicated_merges_with_ttl_in_queue=100, remove_empty_parts=0 """.format( suff=num_run, replica=node.name ) From 179b6aca6aec1962fe3690d037aa0dae27531f5d Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Fri, 2 Jun 2023 17:52:57 +0200 Subject: [PATCH 0329/1072] fix --- src/Storages/StorageReplicatedMergeTree.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 0a61369e163..5b7616d5f28 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -2846,6 +2846,8 @@ void StorageReplicatedMergeTree::cloneReplica(const String & source_replica, Coo } LOG_DEBUG(log, "Copied {} queue entries, {} entries ignored", total_entries_to_copy, source_queue.size() - total_entries_to_copy); + LOG_TRACE(log, "Parts in ZooKeeper after mimic: {}", fmt::join(zookeeper->getChildren(replica_path + "/parts"), ", ")); + LOG_TRACE(log, "Enqueued fetches after mimic: {}", fmt::join(created_get_parts, ", ")); } From 163b2f32da72a66d44967439446331b9943361f3 Mon Sep 17 00:00:00 2001 From: Sergei Trifonov Date: Fri, 2 Jun 2023 18:13:46 +0200 Subject: [PATCH 0330/1072] Update src/Common/AsynchronousMetrics.cpp --- src/Common/AsynchronousMetrics.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Common/AsynchronousMetrics.cpp b/src/Common/AsynchronousMetrics.cpp index cf8d451385b..8cd33521cbb 100644 --- a/src/Common/AsynchronousMetrics.cpp +++ b/src/Common/AsynchronousMetrics.cpp @@ -948,7 +948,7 @@ void AsynchronousMetrics::update(TimePoint update_time) std::string line; readText(line, *cgroupcpu_max); - auto space = line.find_first_of(" "); + auto space = line.find(" "); if (line.rfind("max", space) == std::string::npos) { From 9d0a63bd9263ba9dc9f9181d2fdd9898d269530f Mon Sep 17 00:00:00 2001 From: Sergei Trifonov Date: Fri, 2 Jun 2023 18:15:20 +0200 Subject: [PATCH 0331/1072] Update src/Common/AsynchronousMetrics.cpp --- src/Common/AsynchronousMetrics.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Common/AsynchronousMetrics.cpp b/src/Common/AsynchronousMetrics.cpp index 8cd33521cbb..6821647a180 100644 --- a/src/Common/AsynchronousMetrics.cpp +++ b/src/Common/AsynchronousMetrics.cpp @@ -948,7 +948,7 @@ void AsynchronousMetrics::update(TimePoint update_time) std::string line; readText(line, *cgroupcpu_max); - auto space = line.find(" "); + auto space = line.find(' '); if (line.rfind("max", space) == std::string::npos) { From da4d55cdaf4e25e16ddbf9028e6c8f5d336c60f6 Mon Sep 17 00:00:00 2001 From: Valentin Alexeev Date: Fri, 2 Jun 2023 14:02:26 +0200 Subject: [PATCH 0332/1072] Additional error information when JSON is too large If a parser fails on a large JSON, then output the last position processed to allow review. --- src/Formats/JSONUtils.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Formats/JSONUtils.cpp b/src/Formats/JSONUtils.cpp index 284cffdb9d7..7b7c073b6b2 100644 --- a/src/Formats/JSONUtils.cpp +++ b/src/Formats/JSONUtils.cpp @@ -45,9 +45,9 @@ namespace JSONUtils const auto current_object_size = memory.size() + static_cast(pos - in.position()); if (min_bytes != 0 && current_object_size > 10 * min_bytes) throw ParsingException(ErrorCodes::INCORRECT_DATA, - "Size of JSON object is extremely large. Expected not greater than {} bytes, but current is {} bytes per row. " + "Size of JSON object at position {} is extremely large. Expected not greater than {} bytes, but current is {} bytes per row. " "Increase the value setting 'min_chunk_bytes_for_parallel_parsing' or check your data manually, " - "most likely JSON is malformed", min_bytes, current_object_size); + "most likely JSON is malformed", pos, min_bytes, current_object_size); if (quotes) { From 71c5b1d9c67b8a8fca252aa56932b13deeaa12f3 Mon Sep 17 00:00:00 2001 From: DanRoscigno Date: Fri, 2 Jun 2023 12:42:35 -0400 Subject: [PATCH 0333/1072] add svg function docs --- docs/en/sql-reference/functions/geo/svg.md | 52 ++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 docs/en/sql-reference/functions/geo/svg.md diff --git a/docs/en/sql-reference/functions/geo/svg.md b/docs/en/sql-reference/functions/geo/svg.md new file mode 100644 index 00000000000..9081ac71338 --- /dev/null +++ b/docs/en/sql-reference/functions/geo/svg.md @@ -0,0 +1,52 @@ +--- +slug: /en/sql-reference/functions/geo/svg +sidebar_label: SVG +title: "Functions for Generating SVG images from Geo data" +--- + +## Syntax + +``` sql +SVG(geometry,[style]) +``` + +### Parameters + +- `geometry` — Geo data +- `style` — Optional style name + +### Returned value + +- The SVG representation of the geometry: + - SVG circle + - SVG polygon + - SVG path + +Type: String + +## Examples + +### Circle +```sql +SELECT SVG((0., 0.)) +``` +```response + +``` + +### Polygon +```sql +SELECT SVG([(0., 0.), (10, 0), (10, 10), (0, 10)]) +``` +```response + +``` + +### Path +```sql +SELECT SVG([[(0., 0.), (10, 0), (10, 10), (0, 10)], [(4., 4.), (5, 4), (5, 5), (4, 5)]]) +``` +```response + +``` + From 516cda94eeb6c822b12697fd32921cc79ea97c15 Mon Sep 17 00:00:00 2001 From: Valentin Alexeev Date: Fri, 2 Jun 2023 17:14:21 +0200 Subject: [PATCH 0334/1072] Use in.count() instead of pos --- src/Formats/JSONUtils.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Formats/JSONUtils.cpp b/src/Formats/JSONUtils.cpp index 7b7c073b6b2..0aac72c68fe 100644 --- a/src/Formats/JSONUtils.cpp +++ b/src/Formats/JSONUtils.cpp @@ -47,7 +47,7 @@ namespace JSONUtils throw ParsingException(ErrorCodes::INCORRECT_DATA, "Size of JSON object at position {} is extremely large. Expected not greater than {} bytes, but current is {} bytes per row. " "Increase the value setting 'min_chunk_bytes_for_parallel_parsing' or check your data manually, " - "most likely JSON is malformed", pos, min_bytes, current_object_size); + "most likely JSON is malformed", in.count(), min_bytes, current_object_size); if (quotes) { From 9b8975194821fe44018ed5bcbc9d5ae088b970f5 Mon Sep 17 00:00:00 2001 From: DanRoscigno Date: Fri, 2 Jun 2023 12:52:06 -0400 Subject: [PATCH 0335/1072] add docs for parallel view processing --- docs/en/operations/settings/settings.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 6c951739d41..5730503a670 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -4229,6 +4229,12 @@ Default value: `2000` If it's enabled, in hedged requests we can start new connection until receiving first data packet even if we have already made some progress (but progress haven't updated for `receive_data_timeout` timeout), otherwise we disable changing replica after the first time we made progress. +## parallel_view_processing + +Enables pushing to attached views concurrently instead of sequentially. + +Default value: `false`. + ## partial_result_on_first_cancel {#partial_result_on_first_cancel} When set to `true` and the user wants to interrupt a query (for example using `Ctrl+C` on the client), then the query continues execution only on data that was already read from the table. Afterwards, it will return a partial result of the query for the part of the table that was read. To fully stop the execution of a query without a partial result, the user should send 2 cancel requests. From d28b4181e94c5602b5512af8ed541dcc2a1a55f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A1nos=20Benjamin=20Antal?= Date: Fri, 2 Jun 2023 16:57:36 +0000 Subject: [PATCH 0336/1072] Add `SHOW USER PROCESSES` query --- src/Interpreters/InterpreterFactory.cpp | 6 ++++ .../InterpreterShowUserProcessesQuery.cpp | 18 +++++++++++ .../InterpreterShowUserProcessesQuery.h | 30 +++++++++++++++++ src/Parsers/ASTShowUserProcessesQuery.h | 17 ++++++++++ src/Parsers/ParserQueryWithOutput.cpp | 5 ++- src/Parsers/ParserShowUserProcessesQuery.h | 32 +++++++++++++++++++ 6 files changed, 107 insertions(+), 1 deletion(-) create mode 100644 src/Interpreters/InterpreterShowUserProcessesQuery.cpp create mode 100644 src/Interpreters/InterpreterShowUserProcessesQuery.h create mode 100644 src/Parsers/ASTShowUserProcessesQuery.h create mode 100644 src/Parsers/ParserShowUserProcessesQuery.h diff --git a/src/Interpreters/InterpreterFactory.cpp b/src/Interpreters/InterpreterFactory.cpp index 9cd1f2a251c..c31e3801478 100644 --- a/src/Interpreters/InterpreterFactory.cpp +++ b/src/Interpreters/InterpreterFactory.cpp @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -81,6 +82,7 @@ #include #include #include +#include #include #include #include @@ -266,6 +268,10 @@ std::unique_ptr InterpreterFactory::get(ASTPtr & query, ContextMut { return std::make_unique(query, context); } + else if (query->as()) + { + return std::make_unique(query, context); + } else if (query->as()) { return std::make_unique(query, context); diff --git a/src/Interpreters/InterpreterShowUserProcessesQuery.cpp b/src/Interpreters/InterpreterShowUserProcessesQuery.cpp new file mode 100644 index 00000000000..51287a7ad5b --- /dev/null +++ b/src/Interpreters/InterpreterShowUserProcessesQuery.cpp @@ -0,0 +1,18 @@ +#include + +#include +#include +#include + +#include + + +namespace DB +{ + +BlockIO InterpreterShowUserProcessesQuery::execute() +{ + return executeQuery("SELECT * FROM system.user_processes ORDER BY user DESC", getContext(), true); +} + +} diff --git a/src/Interpreters/InterpreterShowUserProcessesQuery.h b/src/Interpreters/InterpreterShowUserProcessesQuery.h new file mode 100644 index 00000000000..a1c385dc82f --- /dev/null +++ b/src/Interpreters/InterpreterShowUserProcessesQuery.h @@ -0,0 +1,30 @@ +#pragma once + +#include +#include + + +namespace DB +{ + +/** Return list of currently executing queries. +TODO(antaljanosbenjamin) + */ +class InterpreterShowUserProcessesQuery : public IInterpreter, WithMutableContext +{ +public: + InterpreterShowUserProcessesQuery(const ASTPtr & query_ptr_, ContextMutablePtr context_) + : WithMutableContext(context_), query_ptr(query_ptr_) {} + + BlockIO execute() override; + + /// We ignore the quota and limits here because execute() will rewrite a show query as a SELECT query and then + /// the SELECT query will checks the quota and limits. + bool ignoreQuota() const override { return true; } + bool ignoreLimits() const override { return true; } + +private: + ASTPtr query_ptr; +}; + +} diff --git a/src/Parsers/ASTShowUserProcessesQuery.h b/src/Parsers/ASTShowUserProcessesQuery.h new file mode 100644 index 00000000000..cd522c152b6 --- /dev/null +++ b/src/Parsers/ASTShowUserProcessesQuery.h @@ -0,0 +1,17 @@ +#pragma once + +#include + + +namespace DB +{ + +struct ASTShowUserProcessesIDAndQueryNames +{ + static constexpr auto ID = "ShowUserProcesses"; + static constexpr auto Query = "SHOW USER PROCESSES"; +}; + +using ASTShowUserProcessesQuery = ASTQueryWithOutputImpl; + +} diff --git a/src/Parsers/ParserQueryWithOutput.cpp b/src/Parsers/ParserQueryWithOutput.cpp index 6796f4528c4..d5293e5f709 100644 --- a/src/Parsers/ParserQueryWithOutput.cpp +++ b/src/Parsers/ParserQueryWithOutput.cpp @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -61,6 +62,7 @@ bool ParserQueryWithOutput::parseImpl(Pos & pos, ASTPtr & node, Expected & expec ParserShowGrantsQuery show_grants_p; ParserShowPrivilegesQuery show_privileges_p; ParserExplainQuery explain_p(end, allow_settings_after_format_in_insert); + ParserShowUserProcessesQuery show_user_processes_p; ASTPtr query; @@ -88,7 +90,8 @@ bool ParserQueryWithOutput::parseImpl(Pos & pos, ASTPtr & node, Expected & expec || show_access_p.parse(pos, query, expected) || show_access_entities_p.parse(pos, query, expected) || show_grants_p.parse(pos, query, expected) - || show_privileges_p.parse(pos, query, expected); + || show_privileges_p.parse(pos, query, expected) + || show_user_processes_p.parse(pos, query, expected); if (!parsed) return false; diff --git a/src/Parsers/ParserShowUserProcessesQuery.h b/src/Parsers/ParserShowUserProcessesQuery.h new file mode 100644 index 00000000000..be484e74d5d --- /dev/null +++ b/src/Parsers/ParserShowUserProcessesQuery.h @@ -0,0 +1,32 @@ +#pragma once + +#include +#include +#include +#include + + +namespace DB +{ + +/** Query SHOW USER PROCESSES + */ +class ParserShowUserProcessesQuery : public IParserBase +{ +protected: + const char * getName() const override { return "SHOW USER PROCESSES query"; } + + bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override + { + auto query = std::make_shared(); + + if (!ParserKeyword("SHOW USER PROCESSES").ignore(pos, expected)) + return false; + + node = query; + + return true; + } +}; + +} From 96fe4b5107611a627b7981fdac2afe9304660e48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A1nos=20Benjamin=20Antal?= Date: Fri, 2 Jun 2023 16:57:46 +0000 Subject: [PATCH 0337/1072] Add tests --- .../02771_system_user_processes.reference | 5 ++--- .../02771_system_user_processes.sh | 19 +++++++++++-------- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/tests/queries/0_stateless/02771_system_user_processes.reference b/tests/queries/0_stateless/02771_system_user_processes.reference index ab0ff41ddc5..8c8ca8abb52 100644 --- a/tests/queries/0_stateless/02771_system_user_processes.reference +++ b/tests/queries/0_stateless/02771_system_user_processes.reference @@ -1,6 +1,5 @@ +SHOW USER PROCESSES query succeeded! 0 0 -default -test_user_02771 default true true -test_user_02771 2 2 +2 2 diff --git a/tests/queries/0_stateless/02771_system_user_processes.sh b/tests/queries/0_stateless/02771_system_user_processes.sh index e8bf88a9fb2..910af4be9e2 100755 --- a/tests/queries/0_stateless/02771_system_user_processes.sh +++ b/tests/queries/0_stateless/02771_system_user_processes.sh @@ -4,12 +4,15 @@ CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . "$CUR_DIR"/../shell_config.sh -$CLICKHOUSE_CLIENT -q "DROP USER IF EXISTS test_user_02771" -$CLICKHOUSE_CLIENT -q "CREATE USER test_user_02771" -$CLICKHOUSE_CLIENT -u test_user_02771 -q "SELECT * FROM system.numbers LIMIT 1" -$CLICKHOUSE_CLIENT -u test_user_02771 -q "SELECT * FROM system.numbers LIMIT 1" -$CLICKHOUSE_CLIENT -q "SELECT user FROM system.user_processes" -$CLICKHOUSE_CLIENT -q "SELECT user, toBool(ProfileEvents['SelectQuery'] > 0), toBool(ProfileEvents['Query'] > 0) FROM system.user_processes WHERE user='default'" -$CLICKHOUSE_CLIENT -q "SELECT user, ProfileEvents['SelectQuery'], ProfileEvents['Query'] FROM system.user_processes WHERE user='test_user_02771'" -$CLICKHOUSE_CLIENT -q "DROP USER test_user_02771" +USER_POSTFIX=`random_str 10` +USER="test_user_02771_$USER_POSTFIX" + +$CLICKHOUSE_CLIENT -q "SHOW USER PROCESSES" &>"${CLICKHOUSE_TMP}/test_output" && echo "SHOW USER PROCESSES query succeeded!" || cat "${CLICKHOUSE_TMP}/test_output" +$CLICKHOUSE_CLIENT -q "DROP USER IF EXISTS $USER" +$CLICKHOUSE_CLIENT -q "CREATE USER $USER" +$CLICKHOUSE_CLIENT -u "$USER" -q "SELECT * FROM system.numbers LIMIT 1" +$CLICKHOUSE_CLIENT -u "$USER" -q "SELECT * FROM system.numbers LIMIT 1" +$CLICKHOUSE_CLIENT -q "SELECT user, toBool(ProfileEvents['SelectQuery'] > 0), toBool(ProfileEvents['Query'] > 0) FROM system.user_processes WHERE user='default'" +$CLICKHOUSE_CLIENT -q "SELECT ProfileEvents['SelectQuery'], ProfileEvents['Query'] FROM system.user_processes WHERE user='$USER'" +$CLICKHOUSE_CLIENT -q "DROP USER $USER" From 0d98a46326ca671cb0ff0540972c6eba5280d565 Mon Sep 17 00:00:00 2001 From: pufit Date: Fri, 2 Jun 2023 13:02:18 -0400 Subject: [PATCH 0338/1072] Fix KeyError in cherry-pick --- tests/ci/cherry_pick.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/ci/cherry_pick.py b/tests/ci/cherry_pick.py index d36315151aa..07cdcc76c3a 100644 --- a/tests/ci/cherry_pick.py +++ b/tests/ci/cherry_pick.py @@ -71,11 +71,11 @@ This pull-request will be merged automatically as it reaches the mergeable state ### If the PR was closed and then reopened If it stuck, check {pr_url} for `{backport_created_label}` and delete it if \ -necessary. Manually merging will do nothing, since `{label_backports_created}` \ +necessary. Manually merging will do nothing, since `{backport_created_label}` \ prevents the original PR {pr_url} from being processed. If you want to recreate the PR: delete the `{label_cherrypick}` label and delete this branch. -You may also need to delete the `{label_backports_created}` label from the original PR. +You may also need to delete the `{backport_created_label}` label from the original PR. """ BACKPORT_DESCRIPTION = """This pull-request is a last step of an automated \ backporting. From 423afec70542c266187d49cf571d5f6bb4324977 Mon Sep 17 00:00:00 2001 From: tpanetti Date: Fri, 2 Jun 2023 10:05:38 -0700 Subject: [PATCH 0339/1072] Change case and function name for MySQL Compatible types This changes the function name for MySQL Compatible types from getMySQLName to getSQLCompatibleName and changes the casing of the types to upper --- src/DataTypes/DataTypeAggregateFunction.h | 2 +- src/DataTypes/DataTypeArray.h | 4 +- src/DataTypes/DataTypeDate.h | 2 +- src/DataTypes/DataTypeDate32.h | 2 +- src/DataTypes/DataTypeDateTime.h | 2 +- src/DataTypes/DataTypeDateTime64.h | 2 +- src/DataTypes/DataTypeEnum.cpp | 2 +- src/DataTypes/DataTypeEnum.h | 2 +- src/DataTypes/DataTypeFixedString.h | 2 +- src/DataTypes/DataTypeFunction.h | 2 +- src/DataTypes/DataTypeIPv4andIPv6.h | 4 +- src/DataTypes/DataTypeInterval.h | 2 +- src/DataTypes/DataTypeLowCardinality.cpp | 2 +- src/DataTypes/DataTypeLowCardinality.h | 2 +- src/DataTypes/DataTypeMap.h | 2 +- src/DataTypes/DataTypeNothing.h | 2 +- src/DataTypes/DataTypeNullable.h | 2 +- src/DataTypes/DataTypeNumberBase.cpp | 28 +- src/DataTypes/DataTypeNumberBase.h | 2 +- src/DataTypes/DataTypeObject.h | 2 +- src/DataTypes/DataTypeSet.h | 2 +- src/DataTypes/DataTypeString.h | 2 +- src/DataTypes/DataTypeTuple.h | 2 +- src/DataTypes/DataTypeUUID.h | 2 +- src/DataTypes/DataTypesDecimal.h | 4 +- src/DataTypes/IDataType.h | 4 +- ...show_columns_mysql_compatibility.reference | 424 +++++++++--------- .../02775_show_columns_mysql_compatibility.sh | 6 +- 28 files changed, 260 insertions(+), 256 deletions(-) diff --git a/src/DataTypes/DataTypeAggregateFunction.h b/src/DataTypes/DataTypeAggregateFunction.h index 697be13652c..13ca3508580 100644 --- a/src/DataTypes/DataTypeAggregateFunction.h +++ b/src/DataTypes/DataTypeAggregateFunction.h @@ -45,7 +45,7 @@ public: String doGetName() const override; String getNameWithoutVersion() const; const char * getFamilyName() const override { return "AggregateFunction"; } - const char * getMySQLName() const override { return "text"; } + const char * getSQLCompatibleName() const override { return "TEXT"; } TypeIndex getTypeId() const override { return TypeIndex::AggregateFunction; } Array getParameters() const { return parameters; } diff --git a/src/DataTypes/DataTypeArray.h b/src/DataTypes/DataTypeArray.h index b031f411975..528062b60be 100644 --- a/src/DataTypes/DataTypeArray.h +++ b/src/DataTypes/DataTypeArray.h @@ -30,9 +30,9 @@ public: { return "Array"; } - const char * getMySQLName() const override + const char * getSQLCompatibleName() const override { - return "text"; + return "TEXT"; } bool canBeInsideNullable() const override diff --git a/src/DataTypes/DataTypeDate.h b/src/DataTypes/DataTypeDate.h index 33bcb6123ff..7b622ae04a3 100644 --- a/src/DataTypes/DataTypeDate.h +++ b/src/DataTypes/DataTypeDate.h @@ -13,7 +13,7 @@ public: TypeIndex getTypeId() const override { return TypeIndex::Date; } const char * getFamilyName() const override { return family_name; } - const char * getMySQLName() const override { return "date"; } + const char * getSQLCompatibleName() const override { return "DATE"; } bool canBeUsedAsVersion() const override { return true; } bool canBeInsideNullable() const override { return true; } diff --git a/src/DataTypes/DataTypeDate32.h b/src/DataTypes/DataTypeDate32.h index 56315f46e8c..65b0ec7407e 100644 --- a/src/DataTypes/DataTypeDate32.h +++ b/src/DataTypes/DataTypeDate32.h @@ -13,7 +13,7 @@ public: TypeIndex getTypeId() const override { return TypeIndex::Date32; } const char * getFamilyName() const override { return family_name; } - const char * getMySQLName() const override { return "date"; } + const char * getSQLCompatibleName() const override { return "DATE"; } Field getDefault() const override { diff --git a/src/DataTypes/DataTypeDateTime.h b/src/DataTypes/DataTypeDateTime.h index c868f92c311..2facc758f90 100644 --- a/src/DataTypes/DataTypeDateTime.h +++ b/src/DataTypes/DataTypeDateTime.h @@ -36,7 +36,7 @@ public: static constexpr auto family_name = "DateTime"; const char * getFamilyName() const override { return family_name; } - const char * getMySQLName() const override { return "datetime"; } + const char * getSQLCompatibleName() const override { return "DATETIME"; } String doGetName() const override; TypeIndex getTypeId() const override { return TypeIndex::DateTime; } diff --git a/src/DataTypes/DataTypeDateTime64.h b/src/DataTypes/DataTypeDateTime64.h index 8d317bb9430..b836b84918f 100644 --- a/src/DataTypes/DataTypeDateTime64.h +++ b/src/DataTypes/DataTypeDateTime64.h @@ -28,7 +28,7 @@ public: DataTypeDateTime64(UInt32 scale_, const TimezoneMixin & time_zone_info); const char * getFamilyName() const override { return family_name; } - const char * getMySQLName() const override { return "datetime"; } + const char * getSQLCompatibleName() const override { return "DATETIME"; } std::string doGetName() const override; TypeIndex getTypeId() const override { return type_id; } diff --git a/src/DataTypes/DataTypeEnum.cpp b/src/DataTypes/DataTypeEnum.cpp index bfed4d4d5a2..24a3976179d 100644 --- a/src/DataTypes/DataTypeEnum.cpp +++ b/src/DataTypes/DataTypeEnum.cpp @@ -41,7 +41,7 @@ std::string DataTypeEnum::generateMySQLName(const Values & values) { WriteBufferFromOwnString out; - writeString("enum", out); + writeString("ENUM", out); writeChar('(', out); auto first = true; diff --git a/src/DataTypes/DataTypeEnum.h b/src/DataTypes/DataTypeEnum.h index c6e523adf96..2cdaa2db06c 100644 --- a/src/DataTypes/DataTypeEnum.h +++ b/src/DataTypes/DataTypeEnum.h @@ -54,7 +54,7 @@ public: std::string doGetName() const override { return type_name; } const char * getFamilyName() const override; - const char * getMySQLName() const override { return my_sql_type_name.c_str(); } + const char * getSQLCompatibleName() const override { return my_sql_type_name.c_str(); } TypeIndex getTypeId() const override { return type_id; } diff --git a/src/DataTypes/DataTypeFixedString.h b/src/DataTypes/DataTypeFixedString.h index eb09914ec9c..2900efd5a34 100644 --- a/src/DataTypes/DataTypeFixedString.h +++ b/src/DataTypes/DataTypeFixedString.h @@ -42,7 +42,7 @@ public: TypeIndex getTypeId() const override { return type_id; } const char * getFamilyName() const override { return "FixedString"; } - const char * getMySQLName() const override { return "text"; } + const char * getSQLCompatibleName() const override { return "TEXT"; } size_t getN() const { diff --git a/src/DataTypes/DataTypeFunction.h b/src/DataTypes/DataTypeFunction.h index f3423796126..df59f7738b2 100644 --- a/src/DataTypes/DataTypeFunction.h +++ b/src/DataTypes/DataTypeFunction.h @@ -24,7 +24,7 @@ public: std::string doGetName() const override; const char * getFamilyName() const override { return "Function"; } - const char * getMySQLName() const override { return "text"; } + const char * getSQLCompatibleName() const override { return "TEXT"; } TypeIndex getTypeId() const override { return TypeIndex::Function; } const DataTypes & getArgumentTypes() const diff --git a/src/DataTypes/DataTypeIPv4andIPv6.h b/src/DataTypes/DataTypeIPv4andIPv6.h index 8f7fe79793b..be0ebb90f3c 100644 --- a/src/DataTypes/DataTypeIPv4andIPv6.h +++ b/src/DataTypes/DataTypeIPv4andIPv6.h @@ -19,7 +19,7 @@ public: static constexpr auto type_id = TypeToTypeIndex; const char * getFamilyName() const override { return TypeName.data(); } - const char * getMySQLName() const override { return "text"; } + const char * getSQLCompatibleName() const override { return "TEXT"; } TypeIndex getTypeId() const override { return type_id; } @@ -61,7 +61,7 @@ public: static constexpr auto type_id = TypeToTypeIndex; const char * getFamilyName() const override { return TypeName.data(); } - const char * getMySQLName() const override { return "text"; } + const char * getSQLCompatibleName() const override { return "TEXT"; } TypeIndex getTypeId() const override { return type_id; } diff --git a/src/DataTypes/DataTypeInterval.h b/src/DataTypes/DataTypeInterval.h index 69a56e8aadd..ee2157431dd 100644 --- a/src/DataTypes/DataTypeInterval.h +++ b/src/DataTypes/DataTypeInterval.h @@ -26,7 +26,7 @@ public: std::string doGetName() const override { return fmt::format("Interval{}", kind.toString()); } const char * getFamilyName() const override { return "Interval"; } - const char * getMySQLName() const override { return "text"; } + const char * getSQLCompatibleName() const override { return "TEXT"; } TypeIndex getTypeId() const override { return TypeIndex::Interval; } bool equals(const IDataType & rhs) const override; diff --git a/src/DataTypes/DataTypeLowCardinality.cpp b/src/DataTypes/DataTypeLowCardinality.cpp index b1c32317015..e59613e6974 100644 --- a/src/DataTypes/DataTypeLowCardinality.cpp +++ b/src/DataTypes/DataTypeLowCardinality.cpp @@ -29,7 +29,7 @@ namespace ErrorCodes DataTypeLowCardinality::DataTypeLowCardinality(DataTypePtr dictionary_type_) : dictionary_type(std::move(dictionary_type_)), - mysql_name(dictionary_type->getMySQLName()) + mysql_name(dictionary_type->getSQLCompatibleName()) { auto inner_type = dictionary_type; if (dictionary_type->isNullable()) diff --git a/src/DataTypes/DataTypeLowCardinality.h b/src/DataTypes/DataTypeLowCardinality.h index bcc39f58ff7..4dee8565568 100644 --- a/src/DataTypes/DataTypeLowCardinality.h +++ b/src/DataTypes/DataTypeLowCardinality.h @@ -24,7 +24,7 @@ public: return "LowCardinality(" + dictionary_type->getName() + ")"; } const char * getFamilyName() const override { return "LowCardinality"; } - const char * getMySQLName() const override { return mysql_name.c_str(); } + const char * getSQLCompatibleName() const override { return mysql_name.c_str(); } TypeIndex getTypeId() const override { return TypeIndex::LowCardinality; } diff --git a/src/DataTypes/DataTypeMap.h b/src/DataTypes/DataTypeMap.h index 526dc321f44..299119f1759 100644 --- a/src/DataTypes/DataTypeMap.h +++ b/src/DataTypes/DataTypeMap.h @@ -30,7 +30,7 @@ public: TypeIndex getTypeId() const override { return TypeIndex::Map; } std::string doGetName() const override; const char * getFamilyName() const override { return "Map"; } - const char * getMySQLName() const override { return "json"; } + const char * getSQLCompatibleName() const override { return "JSON"; } bool canBeInsideNullable() const override { return false; } diff --git a/src/DataTypes/DataTypeNothing.h b/src/DataTypes/DataTypeNothing.h index fdef6026603..b35ced5dcb3 100644 --- a/src/DataTypes/DataTypeNothing.h +++ b/src/DataTypes/DataTypeNothing.h @@ -16,7 +16,7 @@ public: static constexpr bool is_parametric = false; const char * getFamilyName() const override { return "Nothing"; } - const char * getMySQLName() const override { return "text"; } + const char * getSQLCompatibleName() const override { return "TEXT"; } TypeIndex getTypeId() const override { return TypeIndex::Nothing; } diff --git a/src/DataTypes/DataTypeNullable.h b/src/DataTypes/DataTypeNullable.h index 64b201d32b2..b5fe1bb2dd9 100644 --- a/src/DataTypes/DataTypeNullable.h +++ b/src/DataTypes/DataTypeNullable.h @@ -16,7 +16,7 @@ public: explicit DataTypeNullable(const DataTypePtr & nested_data_type_); std::string doGetName() const override { return "Nullable(" + nested_data_type->getName() + ")"; } const char * getFamilyName() const override { return "Nullable"; } - const char * getMySQLName() const override { return nested_data_type->getMySQLName(); } + const char * getSQLCompatibleName() const override { return nested_data_type->getSQLCompatibleName(); } TypeIndex getTypeId() const override { return TypeIndex::Nullable; } MutableColumnPtr createColumn() const override; diff --git a/src/DataTypes/DataTypeNumberBase.cpp b/src/DataTypes/DataTypeNumberBase.cpp index 7d200de7996..db654448e83 100644 --- a/src/DataTypes/DataTypeNumberBase.cpp +++ b/src/DataTypes/DataTypeNumberBase.cpp @@ -32,20 +32,20 @@ bool DataTypeNumberBase::isValueRepresentedByUnsignedInteger() const template const std::map DataTypeNumberBase::mysqlTypeMap = { - {"UInt8", "tinyint unsigned"}, - {"UInt16", "smallint unsigned"}, - {"UInt32", "mediumint unsigned"}, - {"UInt64", "bigint unsigned"}, - {"UInt128", "text"}, - {"UInt256", "text"}, - {"Int8", "tinyint"}, - {"Int16", "smallint"}, - {"Int32", "int"}, - {"Int64", "bigint"}, - {"Int128", "text"}, - {"Int256", "text"}, - {"Float32", "float"}, - {"Float64", "double"}, + {"UInt8", "TINYINT UNSIGNED"}, + {"UInt16", "SMALLINT UNSIGNED"}, + {"UInt32", "MEDIUMINT UNSIGNEd"}, + {"UInt64", "BIGINT UNSIGNED"}, + {"UInt128", "TEXT"}, + {"UInt256", "TEXT"}, + {"Int8", "TINYINT"}, + {"Int16", "SMALLINT"}, + {"Int32", "INT"}, + {"Int64", "BIGINT"}, + {"Int128", "TEXT"}, + {"Int256", "TEXT"}, + {"Float32", "FLOAT"}, + {"Float64", "DOUBLE"}, }; /// Explicit template instantiations - to avoid code bloat in headers. diff --git a/src/DataTypes/DataTypeNumberBase.h b/src/DataTypes/DataTypeNumberBase.h index b5c963cf245..1a855a974f0 100644 --- a/src/DataTypes/DataTypeNumberBase.h +++ b/src/DataTypes/DataTypeNumberBase.h @@ -27,7 +27,7 @@ public: using ColumnType = ColumnVector; const char * getFamilyName() const override { return TypeName.data(); } - const char * getMySQLName() const override { return mysqlTypeMap.at(TypeName.data()).c_str(); } + const char * getSQLCompatibleName() const override { return mysqlTypeMap.at(TypeName.data()).c_str(); } TypeIndex getTypeId() const override { return TypeToTypeIndex; } Field getDefault() const override; diff --git a/src/DataTypes/DataTypeObject.h b/src/DataTypes/DataTypeObject.h index 8a2c36abcd7..618c7389758 100644 --- a/src/DataTypes/DataTypeObject.h +++ b/src/DataTypes/DataTypeObject.h @@ -23,7 +23,7 @@ public: DataTypeObject(const String & schema_format_, bool is_nullable_); const char * getFamilyName() const override { return "Object"; } - const char * getMySQLName() const override { return "json"; } + const char * getSQLCompatibleName() const override { return "JSON"; } String doGetName() const override; TypeIndex getTypeId() const override { return TypeIndex::Object; } diff --git a/src/DataTypes/DataTypeSet.h b/src/DataTypes/DataTypeSet.h index bdad638b5d5..916b4f071a5 100644 --- a/src/DataTypes/DataTypeSet.h +++ b/src/DataTypes/DataTypeSet.h @@ -15,7 +15,7 @@ class DataTypeSet final : public IDataTypeDummy public: static constexpr bool is_parametric = true; const char * getFamilyName() const override { return "Set"; } - const char * getMySQLName() const override { return "text"; } + const char * getSQLCompatibleName() const override { return "TEXT"; } TypeIndex getTypeId() const override { return TypeIndex::Set; } bool equals(const IDataType & rhs) const override { return typeid(rhs) == typeid(*this); } diff --git a/src/DataTypes/DataTypeString.h b/src/DataTypes/DataTypeString.h index bddfb4ae287..338b3846266 100644 --- a/src/DataTypes/DataTypeString.h +++ b/src/DataTypes/DataTypeString.h @@ -22,7 +22,7 @@ public: } // FIXME: string can contain arbitrary bytes, not only UTF-8 sequences - const char * getMySQLName() const override { return "blob"; } + const char * getSQLCompatibleName() const override { return "BLOB"; } TypeIndex getTypeId() const override { return type_id; } diff --git a/src/DataTypes/DataTypeTuple.h b/src/DataTypes/DataTypeTuple.h index d264cc97f60..93fa87b1332 100644 --- a/src/DataTypes/DataTypeTuple.h +++ b/src/DataTypes/DataTypeTuple.h @@ -33,7 +33,7 @@ public: TypeIndex getTypeId() const override { return TypeIndex::Tuple; } std::string doGetName() const override; const char * getFamilyName() const override { return "Tuple"; } - const char * getMySQLName() const override { return "json"; } + const char * getSQLCompatibleName() const override { return "JSON"; } bool canBeInsideNullable() const override { return false; } bool supportsSparseSerialization() const override { return true; } diff --git a/src/DataTypes/DataTypeUUID.h b/src/DataTypes/DataTypeUUID.h index 4d54db42b45..bbf35074df3 100644 --- a/src/DataTypes/DataTypeUUID.h +++ b/src/DataTypes/DataTypeUUID.h @@ -18,7 +18,7 @@ public: static constexpr auto type_id = TypeIndex::UUID; const char * getFamilyName() const override { return "UUID"; } - const char * getMySQLName() const override { return "char"; } + const char * getSQLCompatibleName() const override { return "CHAR"; } TypeIndex getTypeId() const override { return type_id; } diff --git a/src/DataTypes/DataTypesDecimal.h b/src/DataTypes/DataTypesDecimal.h index 5c9405cb060..6f3bf582aeb 100644 --- a/src/DataTypes/DataTypesDecimal.h +++ b/src/DataTypes/DataTypesDecimal.h @@ -37,10 +37,10 @@ public: using Base::Base; static constexpr auto family_name = "Decimal"; - static constexpr auto mysql_name = "decimal"; + static constexpr auto mysql_name = "DECIMAL"; const char * getFamilyName() const override { return family_name; } - const char * getMySQLName() const override { return mysql_name; } + const char * getSQLCompatibleName() const override { return mysql_name; } std::string doGetName() const override; TypeIndex getTypeId() const override { return TypeToTypeIndex; } diff --git a/src/DataTypes/IDataType.h b/src/DataTypes/IDataType.h index 2bed18897ce..93fdbab05ef 100644 --- a/src/DataTypes/IDataType.h +++ b/src/DataTypes/IDataType.h @@ -83,7 +83,7 @@ public: /// Name of data type family (example: FixedString, Array). virtual const char * getFamilyName() const = 0; - virtual const char * getMySQLName() const = 0; + virtual const char * getSQLCompatibleName() const = 0; /// Data type id. It's used for runtime type checks. virtual TypeIndex getTypeId() const = 0; @@ -135,7 +135,7 @@ public: protected: virtual String doGetName() const { return getFamilyName(); } - virtual String doGetMySQLName() const { return getMySQLName(); } + virtual String doGetMySQLName() const { return getSQLCompatibleName(); } virtual SerializationPtr doGetDefaultSerialization() const = 0; public: diff --git a/tests/queries/0_stateless/02775_show_columns_mysql_compatibility.reference b/tests/queries/0_stateless/02775_show_columns_mysql_compatibility.reference index 96e542611c6..1742cd9c90c 100644 --- a/tests/queries/0_stateless/02775_show_columns_mysql_compatibility.reference +++ b/tests/queries/0_stateless/02775_show_columns_mysql_compatibility.reference @@ -4,226 +4,226 @@ Create pseudo-random database name Create tab duplicate table Run MySQL test field type null key default extra -aggregate_function text 0 NULL -array_value text 0 NULL -boolean_value tinyint unsigned 0 NULL -date32_value date 0 NULL -date_value date 0 NULL -datetime64_value datetime 0 NULL -datetime_value datetime 0 NULL -decimal_value decimal 0 NULL -enum_value enum('apple', 'banana', 'orange') 0 NULL -fixed_string_value text 0 NULL -float32 float 0 NULL -float64 double 0 NULL -int32 int 0 NULL -ipv4_value text 0 NULL -ipv6_value text 0 NULL -json_value json 0 NULL -low_cardinality blob 0 NULL -low_cardinality_date datetime 0 NULL -map_value json 0 NULL -nested.nested_int text 0 NULL -nested.nested_string text 0 NULL -nullable_value int 0 NULL -string_value blob 0 NULL -tuple_value json 0 NULL -uint64 bigint unsigned 0 PRI SOR NULL -uuid_value char 0 NULL +aggregate_function TEXT 0 NULL +array_value TEXT 0 NULL +boolean_value TINYINT UNSIGNED 0 NULL +date32_value DATE 0 NULL +date_value DATE 0 NULL +datetime64_value DATETIME 0 NULL +datetime_value DATETIME 0 NULL +decimal_value DECIMAL 0 NULL +enum_value ENUM('apple', 'banana', 'orange') 0 NULL +fixed_string_value TEXT 0 NULL +float32 FLOAT 0 NULL +float64 DOUBLE 0 NULL +int32 INT 0 NULL +ipv4_value TEXT 0 NULL +ipv6_value TEXT 0 NULL +json_value JSON 0 NULL +low_cardinality BLOB 0 NULL +low_cardinality_date DATETIME 0 NULL +map_value JSON 0 NULL +nested.nested_int TEXT 0 NULL +nested.nested_string TEXT 0 NULL +nullable_value INT 0 NULL +string_value BLOB 0 NULL +tuple_value JSON 0 NULL +uint64 BIGINT UNSIGNED 0 PRI SOR NULL +uuid_value CHAR 0 NULL field type null key default extra -aggregate_function text 0 NULL -array_value text 0 NULL -boolean_value tinyint unsigned 0 NULL -date32_value date 0 NULL -date_value date 0 NULL -datetime64_value datetime 0 NULL -datetime_value datetime 0 NULL -decimal_value decimal 0 NULL -enum_value enum('apple', 'banana', 'orange') 0 NULL -fixed_string_value text 0 NULL -float32 float 0 NULL -float64 double 0 NULL -int32 int 0 NULL -ipv4_value text 0 NULL -ipv6_value text 0 NULL -json_value json 0 NULL -low_cardinality blob 0 NULL -low_cardinality_date datetime 0 NULL -map_value json 0 NULL -nested.nested_int text 0 NULL -nested.nested_string text 0 NULL -nullable_value int 0 NULL -string_value blob 0 NULL -tuple_value json 0 NULL -uint64 bigint unsigned 0 PRI SOR NULL -uuid_value char 0 NULL +aggregate_function TEXT 0 NULL +array_value TEXT 0 NULL +boolean_value TINYINT UNSIGNED 0 NULL +date32_value DATE 0 NULL +date_value DATE 0 NULL +datetime64_value DATETIME 0 NULL +datetime_value DATETIME 0 NULL +decimal_value DECIMAL 0 NULL +enum_value ENUM('apple', 'banana', 'orange') 0 NULL +fixed_string_value TEXT 0 NULL +float32 FLOAT 0 NULL +float64 DOUBLE 0 NULL +int32 INT 0 NULL +ipv4_value TEXT 0 NULL +ipv6_value TEXT 0 NULL +json_value JSON 0 NULL +low_cardinality BLOB 0 NULL +low_cardinality_date DATETIME 0 NULL +map_value JSON 0 NULL +nested.nested_int TEXT 0 NULL +nested.nested_string TEXT 0 NULL +nullable_value INT 0 NULL +string_value BLOB 0 NULL +tuple_value JSON 0 NULL +uint64 BIGINT UNSIGNED 0 PRI SOR NULL +uuid_value CHAR 0 NULL field type null key default extra collation comment privileges -aggregate_function text 0 NULL NULL -array_value text 0 NULL NULL -boolean_value tinyint unsigned 0 NULL NULL -date32_value date 0 NULL NULL -date_value date 0 NULL NULL -datetime64_value datetime 0 NULL NULL -datetime_value datetime 0 NULL NULL -decimal_value decimal 0 NULL NULL -enum_value enum('apple', 'banana', 'orange') 0 NULL NULL -fixed_string_value text 0 NULL NULL -float32 float 0 NULL NULL -float64 double 0 NULL NULL -int32 int 0 NULL NULL -ipv4_value text 0 NULL NULL -ipv6_value text 0 NULL NULL -json_value json 0 NULL NULL -low_cardinality blob 0 NULL NULL -low_cardinality_date datetime 0 NULL NULL -map_value json 0 NULL NULL -nested.nested_int text 0 NULL NULL -nested.nested_string text 0 NULL NULL -nullable_value int 0 NULL NULL -string_value blob 0 NULL NULL -tuple_value json 0 NULL NULL -uint64 bigint unsigned 0 PRI SOR NULL NULL -uuid_value char 0 NULL NULL +aggregate_function TEXT 0 NULL NULL +array_value TEXT 0 NULL NULL +boolean_value TINYINT UNSIGNED 0 NULL NULL +date32_value DATE 0 NULL NULL +date_value DATE 0 NULL NULL +datetime64_value DATETIME 0 NULL NULL +datetime_value DATETIME 0 NULL NULL +decimal_value DECIMAL 0 NULL NULL +enum_value ENUM('apple', 'banana', 'orange') 0 NULL NULL +fixed_string_value TEXT 0 NULL NULL +float32 FLOAT 0 NULL NULL +float64 DOUBLE 0 NULL NULL +int32 INT 0 NULL NULL +ipv4_value TEXT 0 NULL NULL +ipv6_value TEXT 0 NULL NULL +json_value JSON 0 NULL NULL +low_cardinality BLOB 0 NULL NULL +low_cardinality_date DATETIME 0 NULL NULL +map_value JSON 0 NULL NULL +nested.nested_int TEXT 0 NULL NULL +nested.nested_string TEXT 0 NULL NULL +nullable_value INT 0 NULL NULL +string_value BLOB 0 NULL NULL +tuple_value JSON 0 NULL NULL +uint64 BIGINT UNSIGNED 0 PRI SOR NULL NULL +uuid_value CHAR 0 NULL NULL field type null key default extra -int32 int 0 NULL -nested.nested_int text 0 NULL -uint64 bigint unsigned 0 PRI SOR NULL +int32 INT 0 NULL +nested.nested_int TEXT 0 NULL +uint64 BIGINT UNSIGNED 0 PRI SOR NULL field type null key default extra -aggregate_function text 0 NULL -array_value text 0 NULL -boolean_value tinyint unsigned 0 NULL -date32_value date 0 NULL -date_value date 0 NULL -datetime64_value datetime 0 NULL -datetime_value datetime 0 NULL -decimal_value decimal 0 NULL -enum_value enum('apple', 'banana', 'orange') 0 NULL -fixed_string_value text 0 NULL -float32 float 0 NULL -float64 double 0 NULL -ipv4_value text 0 NULL -ipv6_value text 0 NULL -json_value json 0 NULL -low_cardinality blob 0 NULL -low_cardinality_date datetime 0 NULL -map_value json 0 NULL -nested.nested_string text 0 NULL -nullable_value int 0 NULL -string_value blob 0 NULL -tuple_value json 0 NULL -uuid_value char 0 NULL +aggregate_function TEXT 0 NULL +array_value TEXT 0 NULL +boolean_value TINYINT UNSIGNED 0 NULL +date32_value DATE 0 NULL +date_value DATE 0 NULL +datetime64_value DATETIME 0 NULL +datetime_value DATETIME 0 NULL +decimal_value DECIMAL 0 NULL +enum_value ENUM('apple', 'banana', 'orange') 0 NULL +fixed_string_value TEXT 0 NULL +float32 FLOAT 0 NULL +float64 DOUBLE 0 NULL +ipv4_value TEXT 0 NULL +ipv6_value TEXT 0 NULL +json_value JSON 0 NULL +low_cardinality BLOB 0 NULL +low_cardinality_date DATETIME 0 NULL +map_value JSON 0 NULL +nested.nested_string TEXT 0 NULL +nullable_value INT 0 NULL +string_value BLOB 0 NULL +tuple_value JSON 0 NULL +uuid_value CHAR 0 NULL field type null key default extra -int32 int 0 NULL -nested.nested_int text 0 NULL -uint64 bigint unsigned 0 PRI SOR NULL +int32 INT 0 NULL +nested.nested_int TEXT 0 NULL +uint64 BIGINT UNSIGNED 0 PRI SOR NULL field type null key default extra -aggregate_function text 0 NULL -array_value text 0 NULL -boolean_value tinyint unsigned 0 NULL -date32_value date 0 NULL -date_value date 0 NULL -datetime64_value datetime 0 NULL -datetime_value datetime 0 NULL -decimal_value decimal 0 NULL -enum_value enum('apple', 'banana', 'orange') 0 NULL -fixed_string_value text 0 NULL -float32 float 0 NULL -float64 double 0 NULL -ipv4_value text 0 NULL -ipv6_value text 0 NULL -json_value json 0 NULL -low_cardinality blob 0 NULL -low_cardinality_date datetime 0 NULL -map_value json 0 NULL -nested.nested_string text 0 NULL -nullable_value int 0 NULL -string_value blob 0 NULL -tuple_value json 0 NULL -uuid_value char 0 NULL +aggregate_function TEXT 0 NULL +array_value TEXT 0 NULL +boolean_value TINYINT UNSIGNED 0 NULL +date32_value DATE 0 NULL +date_value DATE 0 NULL +datetime64_value DATETIME 0 NULL +datetime_value DATETIME 0 NULL +decimal_value DECIMAL 0 NULL +enum_value ENUM('apple', 'banana', 'orange') 0 NULL +fixed_string_value TEXT 0 NULL +float32 FLOAT 0 NULL +float64 DOUBLE 0 NULL +ipv4_value TEXT 0 NULL +ipv6_value TEXT 0 NULL +json_value JSON 0 NULL +low_cardinality BLOB 0 NULL +low_cardinality_date DATETIME 0 NULL +map_value JSON 0 NULL +nested.nested_string TEXT 0 NULL +nullable_value INT 0 NULL +string_value BLOB 0 NULL +tuple_value JSON 0 NULL +uuid_value CHAR 0 NULL field type null key default extra -int32 int 0 NULL -nested.nested_int text 0 NULL -uint64 bigint unsigned 0 PRI SOR NULL +int32 INT 0 NULL +nested.nested_int TEXT 0 NULL +uint64 BIGINT UNSIGNED 0 PRI SOR NULL field type null key default extra -aggregate_function text 0 NULL +aggregate_function TEXT 0 NULL field type null key default extra -aggregate_function text 0 NULL -array_value text 0 NULL -boolean_value tinyint unsigned 0 NULL -date32_value date 0 NULL -date_value date 0 NULL -datetime64_value datetime 0 NULL -datetime_value datetime 0 NULL -decimal_value decimal 0 NULL -enum_value enum('apple', 'banana', 'orange') 0 NULL -fixed_string_value text 0 NULL -float32 float 0 NULL -float64 double 0 NULL -int32 int 0 NULL -ipv4_value text 0 NULL -ipv6_value text 0 NULL -json_value json 0 NULL -low_cardinality blob 0 NULL -low_cardinality_date datetime 0 NULL -map_value json 0 NULL -nested.nested_int text 0 NULL -nested.nested_string text 0 NULL -nullable_value int 0 NULL -string_value blob 0 NULL -tuple_value json 0 NULL -uint64 bigint unsigned 0 PRI SOR NULL -uuid_value char 0 NULL +aggregate_function TEXT 0 NULL +array_value TEXT 0 NULL +boolean_value TINYINT UNSIGNED 0 NULL +date32_value DATE 0 NULL +date_value DATE 0 NULL +datetime64_value DATETIME 0 NULL +datetime_value DATETIME 0 NULL +decimal_value DECIMAL 0 NULL +enum_value ENUM('apple', 'banana', 'orange') 0 NULL +fixed_string_value TEXT 0 NULL +float32 FLOAT 0 NULL +float64 DOUBLE 0 NULL +int32 INT 0 NULL +ipv4_value TEXT 0 NULL +ipv6_value TEXT 0 NULL +json_value JSON 0 NULL +low_cardinality BLOB 0 NULL +low_cardinality_date DATETIME 0 NULL +map_value JSON 0 NULL +nested.nested_int TEXT 0 NULL +nested.nested_string TEXT 0 NULL +nullable_value INT 0 NULL +string_value BLOB 0 NULL +tuple_value JSON 0 NULL +uint64 BIGINT UNSIGNED 0 PRI SOR NULL +uuid_value CHAR 0 NULL field type null key default extra -aggregate_function text 0 NULL -array_value text 0 NULL -boolean_value tinyint unsigned 0 NULL -date32_value date 0 NULL -date_value date 0 NULL -datetime64_value datetime 0 NULL -datetime_value datetime 0 NULL -decimal_value decimal 0 NULL -enum_value enum('apple', 'banana', 'orange') 0 NULL -fixed_string_value text 0 NULL -float32 float 0 NULL -float64 double 0 NULL -int32 int 0 NULL -ipv4_value text 0 NULL -ipv6_value text 0 NULL -json_value json 0 NULL -low_cardinality blob 0 NULL -low_cardinality_date datetime 0 NULL -map_value json 0 NULL -nested.nested_int text 0 NULL -nested.nested_string text 0 NULL -nullable_value int 0 NULL -string_value blob 0 NULL -tuple_value json 0 NULL -uint64 bigint unsigned 0 PRI SOR NULL -uuid_value char 0 NULL +aggregate_function TEXT 0 NULL +array_value TEXT 0 NULL +boolean_value TINYINT UNSIGNED 0 NULL +date32_value DATE 0 NULL +date_value DATE 0 NULL +datetime64_value DATETIME 0 NULL +datetime_value DATETIME 0 NULL +decimal_value DECIMAL 0 NULL +enum_value ENUM('apple', 'banana', 'orange') 0 NULL +fixed_string_value TEXT 0 NULL +float32 FLOAT 0 NULL +float64 DOUBLE 0 NULL +int32 INT 0 NULL +ipv4_value TEXT 0 NULL +ipv6_value TEXT 0 NULL +json_value JSON 0 NULL +low_cardinality BLOB 0 NULL +low_cardinality_date DATETIME 0 NULL +map_value JSON 0 NULL +nested.nested_int TEXT 0 NULL +nested.nested_string TEXT 0 NULL +nullable_value INT 0 NULL +string_value BLOB 0 NULL +tuple_value JSON 0 NULL +uint64 BIGINT UNSIGNED 0 PRI SOR NULL +uuid_value CHAR 0 NULL field type null key default extra -aggregate_function text 0 NULL -array_value text 0 NULL -boolean_value tinyint unsigned 0 NULL -date32_value date 0 NULL -date_value date 0 NULL -datetime64_value datetime 0 NULL -datetime_value datetime 0 NULL -decimal_value decimal 0 NULL -enum_value enum('apple', 'banana', 'orange') 0 NULL -fixed_string_value text 0 NULL -float32 float 0 NULL -float64 double 0 NULL -int32 int 0 NULL -ipv4_value text 0 NULL -ipv6_value text 0 NULL -json_value json 0 NULL -low_cardinality blob 0 NULL -low_cardinality_date datetime 0 NULL -map_value json 0 NULL -nested.nested_int text 0 NULL -nested.nested_string text 0 NULL -nullable_value int 0 NULL -string_value blob 0 NULL -tuple_value json 0 NULL -uint64 bigint unsigned 0 PRI SOR NULL -uuid_value char 0 NULL +aggregate_function TEXT 0 NULL +array_value TEXT 0 NULL +boolean_value TINYINT UNSIGNED 0 NULL +date32_value DATE 0 NULL +date_value DATE 0 NULL +datetime64_value DATETIME 0 NULL +datetime_value DATETIME 0 NULL +decimal_value DECIMAL 0 NULL +enum_value ENUM('apple', 'banana', 'orange') 0 NULL +fixed_string_value TEXT 0 NULL +float32 FLOAT 0 NULL +float64 DOUBLE 0 NULL +int32 INT 0 NULL +ipv4_value TEXT 0 NULL +ipv6_value TEXT 0 NULL +json_value JSON 0 NULL +low_cardinality BLOB 0 NULL +low_cardinality_date DATETIME 0 NULL +map_value JSON 0 NULL +nested.nested_int TEXT 0 NULL +nested.nested_string TEXT 0 NULL +nullable_value INT 0 NULL +string_value BLOB 0 NULL +tuple_value JSON 0 NULL +uint64 BIGINT UNSIGNED 0 PRI SOR NULL +uuid_value CHAR 0 NULL diff --git a/tests/queries/0_stateless/02775_show_columns_mysql_compatibility.sh b/tests/queries/0_stateless/02775_show_columns_mysql_compatibility.sh index a446c6e817e..fd1ad92f060 100755 --- a/tests/queries/0_stateless/02775_show_columns_mysql_compatibility.sh +++ b/tests/queries/0_stateless/02775_show_columns_mysql_compatibility.sh @@ -1,4 +1,8 @@ -#!/bin/bash +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh # This script tests the MySQL compatibility of the SHOW COLUMNS command in ClickHouse USER="default" From 09b5b0c3f7a1265e5b21f2a818ec05c9afdc48e4 Mon Sep 17 00:00:00 2001 From: DanRoscigno Date: Fri, 2 Jun 2023 13:31:02 -0400 Subject: [PATCH 0340/1072] add word --- utils/check-style/aspell-ignore/en/aspell-dict.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt index 2feb7981fcc..0455556ae96 100644 --- a/utils/check-style/aspell-ignore/en/aspell-dict.txt +++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt @@ -1453,6 +1453,7 @@ gRPC gccMurmurHash gcem generateRandom +generateRandomStructure generateULID generateUUIDv geoDistance From 85ded501d798a067fd9d3b1bdd0e2d6d8cbcc14b Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Fri, 2 Jun 2023 20:02:14 +0200 Subject: [PATCH 0341/1072] Update aspell-dict.txt --- utils/check-style/aspell-ignore/en/aspell-dict.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt index 2feb7981fcc..0455556ae96 100644 --- a/utils/check-style/aspell-ignore/en/aspell-dict.txt +++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt @@ -1453,6 +1453,7 @@ gRPC gccMurmurHash gcem generateRandom +generateRandomStructure generateULID generateUUIDv geoDistance From a3ed86a52879367308f4425bef5617f98486a1bb Mon Sep 17 00:00:00 2001 From: pufit Date: Fri, 2 Jun 2023 14:36:44 -0400 Subject: [PATCH 0342/1072] Documentation --- .../utilities/clickhouse-keeper-client.md | 53 +++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 docs/en/operations/utilities/clickhouse-keeper-client.md diff --git a/docs/en/operations/utilities/clickhouse-keeper-client.md b/docs/en/operations/utilities/clickhouse-keeper-client.md new file mode 100644 index 00000000000..77f816fe428 --- /dev/null +++ b/docs/en/operations/utilities/clickhouse-keeper-client.md @@ -0,0 +1,53 @@ +--- +slug: /en/operations/utilities/clickhouse-keeper-client +sidebar_label: clickhouse-keeper-client +--- + +# clickhouse-keeper-client + +A client application to interact with clickhouse-keeper by its native protocol. + +## Keys {#clickhouse-keeper-client} + +- `-q QUERY`, `--query=QUERY` — Query to execute. If this parameter is not passed, `clickhouse-keeper-client` will start in interactive mode. +- `-h HOST`, `--host=HOST` — Server host. Default value: `localhost`. +- `-p N`, `--port=N` — Server port. Default value: 2181 +- `--connection-timeout=TIMEOUT` — Set connection timeout in seconds. Default value: 10s. +- `--session-timeout=TIMEOUT` — Set session timeout in seconds. Default value: 10s. +- `--operation-timeout=TIMEOUT` — Set operation timeout in seconds. Default value: 10s. +- `--history-file=FILE_PATH` — Set path of history file. Default value: `~/.keeper-client-history`. +- `--help` — Shows the help message. + +## Example {#clickhouse-keeper-client-example} + +```bash +./clickhouse-keeper-client -h localhost:2181 --connection-timeout 30 --session-timeout 30 --operation-timeout 30 +Connected to ZooKeeper at [::1]:2181 with session_id 137 +/ :) ls +keeper foo bar +/ :) cd keeper +/keeper :) ls +api_version +/keeper :) cd api_version +/keeper/api_version :) ls + +/keeper/api_version :) cd xyz +Path /keeper/api_version/xyz does not exists +/keeper/api_version :) cd ../../ +/ :) ls +keeper foo bar +/ :) get keeper/api_version +2 +``` + +## Commands {#clickhouse-keeper-client-commands} + +- `ls [path]` -- Lists the nodes for the given path (default: cwd) +- `cd [path]` -- Change the working path (default `.`) +- `set [version]` -- Updates the node's value. Only update if version matches (default: -1) +- `create ` -- Creates new node +- `get ` -- Returns the node's value +- `remove ` -- Remove the node +- `rmr ` -- Recursively deletes path. Confirmation required +- `flwc ` -- Executes four-letter-word command +- `help` -- Prints this message From 991d1b97fc1b0959d1cb1659ee46a893b693716f Mon Sep 17 00:00:00 2001 From: Sema Checherinda Date: Fri, 2 Jun 2023 20:48:31 +0200 Subject: [PATCH 0343/1072] less traces in logs --- src/IO/WriteBufferFromS3.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/IO/WriteBufferFromS3.cpp b/src/IO/WriteBufferFromS3.cpp index 462cf2674c3..210cea02a36 100644 --- a/src/IO/WriteBufferFromS3.cpp +++ b/src/IO/WriteBufferFromS3.cpp @@ -245,10 +245,8 @@ WriteBufferFromS3::~WriteBufferFromS3() LOG_INFO(log, "WriteBufferFromS3 is not finalized in destructor. " "It could be if an exception occurs. File is not written to S3. " - "{}. " - "Stack trace: {}", - getLogDetails(), - StackTrace().toString()); + "{}.", + getLogDetails()); } task_tracker->safeWaitAll(); From 50654435dc1cf6ac826d08d28adf2e669250d5ec Mon Sep 17 00:00:00 2001 From: ltrk2 <107155950+ltrk2@users.noreply.github.com> Date: Fri, 2 Jun 2023 19:36:37 +0000 Subject: [PATCH 0344/1072] Implement endianness-independent serialization for UUID --- .../Serializations/SerializationUUID.cpp | 16 ++--- src/IO/ReadHelpers.cpp | 72 +++++++++---------- src/IO/ReadHelpers.h | 15 ++-- src/IO/WriteHelpers.cpp | 38 +++++++--- src/IO/WriteHelpers.h | 7 +- .../Formats/Impl/AvroRowInputFormat.cpp | 3 +- .../Formats/Impl/AvroRowOutputFormat.cpp | 5 +- 7 files changed, 75 insertions(+), 81 deletions(-) diff --git a/src/DataTypes/Serializations/SerializationUUID.cpp b/src/DataTypes/Serializations/SerializationUUID.cpp index ee1327ef094..13313111b2b 100644 --- a/src/DataTypes/Serializations/SerializationUUID.cpp +++ b/src/DataTypes/Serializations/SerializationUUID.cpp @@ -51,19 +51,11 @@ void SerializationUUID::deserializeTextQuoted(IColumn & column, ReadBuffer & ist { assertChar('\'', istr); char * next_pos = find_first_symbols<'\\', '\''>(istr.position(), istr.buffer().end()); - size_t len = next_pos - istr.position(); - if ((len == 32) && (istr.position()[32] == '\'')) + const auto len = next_pos - istr.position(); + if ((len == 32 || len == 36) && istr.position()[len] == '\'') { - parseUUIDWithoutSeparator( - reinterpret_cast(istr.position()), std::reverse_iterator(reinterpret_cast(&uuid) + 16)); - istr.ignore(33); - fast = true; - } - else if ((len == 36) && (istr.position()[36] == '\'')) - { - parseUUID( - reinterpret_cast(istr.position()), std::reverse_iterator(reinterpret_cast(&uuid) + 16)); - istr.ignore(37); + uuid = parseUUID(std::span(reinterpret_cast(istr.position()), len)); + istr.ignore(len + 1); fast = true; } else diff --git a/src/IO/ReadHelpers.cpp b/src/IO/ReadHelpers.cpp index 99d25ee6613..a85a057f2b3 100644 --- a/src/IO/ReadHelpers.cpp +++ b/src/IO/ReadHelpers.cpp @@ -46,48 +46,40 @@ inline void parseHex(IteratorSrc src, IteratorDst dst) dst[dst_pos] = unhex2(reinterpret_cast(&src[src_pos])); } -void parseUUID(const UInt8 * src36, UInt8 * dst16) +UUID parseUUID(std::span src) { - /// If string is not like UUID - implementation specific behaviour. + UUID uuid; + const auto * src_ptr = src.data(); + auto * dst = reinterpret_cast(&uuid); + if (const auto size = src.size(); size == 36) + { +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + parseHex<4>(src_ptr, dst); + parseHex<2>(src_ptr + 9, dst + 4); + parseHex<2>(src_ptr + 14, dst + 6); + parseHex<2>(src_ptr + 19, dst + 8); + parseHex<6>(src_ptr + 24, dst + 10); +#else + const std::reverse_iterator dst_it(dst + sizeof(UUID)); + /// FIXME This code looks like trash. + parseHex<4>(src_ptr, dst + 8); + parseHex<2>(src_ptr + 9, dst + 12); + parseHex<2>(src_ptr + 14, dst + 14); + parseHex<2>(src_ptr + 19, dst); + parseHex<6>(src_ptr + 24, dst + 2); +#endif + } + else if (size == 32) + { +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + parseHex<16>(src_ptr, dst); +#else + parseHex<8>(src_ptr, dst + 8); + parseHex<8>(src_ptr + 16, dst); +#endif + } - parseHex<4>(&src36[0], &dst16[0]); - parseHex<2>(&src36[9], &dst16[4]); - parseHex<2>(&src36[14], &dst16[6]); - parseHex<2>(&src36[19], &dst16[8]); - parseHex<6>(&src36[24], &dst16[10]); -} - -void parseUUIDWithoutSeparator(const UInt8 * src36, UInt8 * dst16) -{ - /// If string is not like UUID - implementation specific behaviour. - - parseHex<16>(&src36[0], &dst16[0]); -} - -/** Function used when byte ordering is important when parsing uuid - * ex: When we create an UUID type - */ -void parseUUID(const UInt8 * src36, std::reverse_iterator dst16) -{ - /// If string is not like UUID - implementation specific behaviour. - - /// FIXME This code looks like trash. - parseHex<4>(&src36[0], dst16 + 8); - parseHex<2>(&src36[9], dst16 + 12); - parseHex<2>(&src36[14], dst16 + 14); - parseHex<2>(&src36[19], dst16); - parseHex<6>(&src36[24], dst16 + 2); -} - -/** Function used when byte ordering is important when parsing uuid - * ex: When we create an UUID type - */ -void parseUUIDWithoutSeparator(const UInt8 * src36, std::reverse_iterator dst16) -{ - /// If string is not like UUID - implementation specific behaviour. - - parseHex<8>(&src36[0], dst16 + 8); - parseHex<8>(&src36[16], dst16); + return uuid; } void NO_INLINE throwAtAssertionFailed(const char * s, ReadBuffer & buf) diff --git a/src/IO/ReadHelpers.h b/src/IO/ReadHelpers.h index 32338552b66..7e293944d19 100644 --- a/src/IO/ReadHelpers.h +++ b/src/IO/ReadHelpers.h @@ -8,6 +8,7 @@ #include #include #include +#include #include @@ -623,12 +624,6 @@ struct NullOutput void push_back(char) {} /// NOLINT }; -void parseUUID(const UInt8 * src36, UInt8 * dst16); -void parseUUIDWithoutSeparator(const UInt8 * src36, UInt8 * dst16); -void parseUUID(const UInt8 * src36, std::reverse_iterator dst16); -void parseUUIDWithoutSeparator(const UInt8 * src36, std::reverse_iterator dst16); - - template ReturnType readDateTextFallback(LocalDate & date, ReadBuffer & buf); @@ -770,6 +765,9 @@ inline bool tryReadDateText(ExtendedDayNum & date, ReadBuffer & buf) return readDateTextImpl(date, buf); } +/// If string is not like UUID - implementation specific behaviour. +UUID parseUUID(std::span src); + template inline ReturnType readUUIDTextImpl(UUID & uuid, ReadBuffer & buf) { @@ -797,12 +795,9 @@ inline ReturnType readUUIDTextImpl(UUID & uuid, ReadBuffer & buf) return ReturnType(false); } } - - parseUUID(reinterpret_cast(s), std::reverse_iterator(reinterpret_cast(&uuid) + 16)); } - else - parseUUIDWithoutSeparator(reinterpret_cast(s), std::reverse_iterator(reinterpret_cast(&uuid) + 16)); + uuid = parseUUID({reinterpret_cast(s), size}); return ReturnType(true); } else diff --git a/src/IO/WriteHelpers.cpp b/src/IO/WriteHelpers.cpp index a0eceddc6f6..6023d4c9d5b 100644 --- a/src/IO/WriteHelpers.cpp +++ b/src/IO/WriteHelpers.cpp @@ -23,17 +23,35 @@ void formatHex(IteratorSrc src, IteratorDst dst, size_t num_bytes) /** Function used when byte ordering is important when parsing uuid * ex: When we create an UUID type */ -void formatUUID(std::reverse_iterator src16, UInt8 * dst36) +std::array formatUUID(const UUID & uuid) { - formatHex(src16 + 8, &dst36[0], 4); - dst36[8] = '-'; - formatHex(src16 + 12, &dst36[9], 2); - dst36[13] = '-'; - formatHex(src16 + 14, &dst36[14], 2); - dst36[18] = '-'; - formatHex(src16, &dst36[19], 2); - dst36[23] = '-'; - formatHex(src16 + 2, &dst36[24], 6); + std::array dst; + const auto * src_ptr = reinterpret_cast(&uuid); + auto * dst_ptr = dst.data(); +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + formatHex(src_ptr, dst_ptr, 4); + dst[8] = '-'; + formatHex(src_ptr + 4, dst_ptr + 9, 2); + dst[13] = '-'; + formatHex(src_ptr + 6, dst_ptr + 14, 2); + dst[18] = '-'; + formatHex(src_ptr + 8, dst_ptr + 19, 2); + dst[23] = '-'; + formatHex(src_ptr + 10, dst_ptr + 24, 6); +#else + const std::reverse_iterator src_it(src_ptr + 16); + formatHex(src_it + 8, dst_ptr, 4); + dst[8] = '-'; + formatHex(src_it + 12, dst_ptr + 9, 2); + dst[13] = '-'; + formatHex(src_it + 14, dst_ptr + 14, 2); + dst[18] = '-'; + formatHex(src_it, dst_ptr + 19, 2); + dst[23] = '-'; + formatHex(src_it + 2, dst_ptr + 24, 6); +#endif + + return dst; } void writeIPv4Text(const IPv4 & ip, WriteBuffer & buf) diff --git a/src/IO/WriteHelpers.h b/src/IO/WriteHelpers.h index cdbc952690c..923684c4249 100644 --- a/src/IO/WriteHelpers.h +++ b/src/IO/WriteHelpers.h @@ -625,13 +625,12 @@ inline void writeXMLStringForTextElement(std::string_view s, WriteBuffer & buf) writeXMLStringForTextElement(s.data(), s.data() + s.size(), buf); } -void formatUUID(std::reverse_iterator src16, UInt8 * dst36); +std::array formatUUID(const UUID & uuid); inline void writeUUIDText(const UUID & uuid, WriteBuffer & buf) { - char s[36]; - formatUUID(std::reverse_iterator(reinterpret_cast(&uuid) + 16), reinterpret_cast(s)); - buf.write(s, sizeof(s)); + const auto text = formatUUID(uuid); + buf.write(text.data(), text.size()); } void writeIPv4Text(const IPv4 & ip, WriteBuffer & buf); diff --git a/src/Processors/Formats/Impl/AvroRowInputFormat.cpp b/src/Processors/Formats/Impl/AvroRowInputFormat.cpp index c2602a4d1d5..974b198a483 100644 --- a/src/Processors/Formats/Impl/AvroRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/AvroRowInputFormat.cpp @@ -256,8 +256,7 @@ AvroDeserializer::DeserializeFn AvroDeserializer::createDeserializeFn(const avro if (tmp.length() != 36) throw ParsingException(ErrorCodes::CANNOT_PARSE_UUID, "Cannot parse uuid {}", tmp); - UUID uuid; - parseUUID(reinterpret_cast(tmp.data()), std::reverse_iterator(reinterpret_cast(&uuid) + 16)); + const auto uuid = parseUUID({reinterpret_cast(tmp.data()), tmp.length()}); assert_cast(column).insertValue(uuid); return true; }; diff --git a/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp b/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp index c743b2c1766..2b163164d56 100644 --- a/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp +++ b/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp @@ -329,9 +329,8 @@ AvroSerializer::SchemaWithSerializeFn AvroSerializer::createSchemaWithSerializeF return {schema, [](const IColumn & column, size_t row_num, avro::Encoder & encoder) { const auto & uuid = assert_cast(column).getElement(row_num); - std::array s; - formatUUID(std::reverse_iterator(reinterpret_cast(&uuid) + 16), s.data()); - encoder.encodeBytes(reinterpret_cast(s.data()), s.size()); + const auto text = formatUUID(uuid); + encoder.encodeBytes(reinterpret_cast(text.data()), text.size()); }}; } case TypeIndex::Array: From 87eaaa0f7bf43a7145c24e726af8b3b912f38eea Mon Sep 17 00:00:00 2001 From: DanRoscigno Date: Fri, 2 Jun 2023 16:30:18 -0400 Subject: [PATCH 0345/1072] address review comments --- .../table-engines/integrations/embedded-rocksdb.md | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/docs/en/engines/table-engines/integrations/embedded-rocksdb.md b/docs/en/engines/table-engines/integrations/embedded-rocksdb.md index dab741a9f63..6664b6a4613 100644 --- a/docs/en/engines/table-engines/integrations/embedded-rocksdb.md +++ b/docs/en/engines/table-engines/integrations/embedded-rocksdb.md @@ -127,15 +127,17 @@ A special `direct` join with EmbeddedRocksDB tables is supported. This direct join avoids forming a hash table in memory and accesses the data directly from the EmbeddedRocksDB. +With large joins you may see much lower memory usage with direct joins +because the hash table is not created. + To enable direct joins: ```sql -SET join_algorithm = 'direct' +SET join_algorithm = 'direct, hash' ``` :::tip -When the `join_algorithm` is set to `direct`, direct joins will be used -when possible. However, direct joins are not used for RIGHT or FULL JOINs. -ClickHouse will choose another join algorithm when direct joins are not possible. +When the `join_algorithm` is set to `direct, hash`, direct joins will be used +when possible, and hash otherwise. ::: #### Example @@ -205,3 +207,6 @@ ORDER BY key ASC └─────┴─────────┴────────┴────────┘ ``` +### More information on Joins +- [`join_algorithm` setting](/docs/en/operations/settings/settings.md#settings-join_algorithm) +- [JOIN clause](/docs/en/sql-reference/statements/select/join.md) From 4506299d73a3fbf8fc9446b3eed05fe4d5553c23 Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Tue, 30 May 2023 20:53:45 +0200 Subject: [PATCH 0346/1072] impl --- docker/test/performance-comparison/report.py | 4 +++- tests/ci/performance_comparison_check.py | 8 +++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/docker/test/performance-comparison/report.py b/docker/test/performance-comparison/report.py index 214f2d550b4..a1f2eb9d9ec 100755 --- a/docker/test/performance-comparison/report.py +++ b/docker/test/performance-comparison/report.py @@ -626,7 +626,9 @@ if args.report == "main": message_array.append(str(faster_queries) + " faster") if slower_queries: - if slower_queries > 3: + # This threshold should be synchronized with the value in https://github.com/ClickHouse/ClickHouse/blob/master/tests/ci/performance_comparison_check.py#L225 + # False positives rate should be < 1%: https://shorturl.at/CDEK8 + if slower_queries > 5: status = "failure" message_array.append(str(slower_queries) + " slower") diff --git a/tests/ci/performance_comparison_check.py b/tests/ci/performance_comparison_check.py index bf5704f31bd..1baf547816f 100644 --- a/tests/ci/performance_comparison_check.py +++ b/tests/ci/performance_comparison_check.py @@ -219,6 +219,12 @@ if __name__ == "__main__": except Exception: traceback.print_exc() + def too_many_slow(msg): + match = re.search("(|.* )(\d+) slower.*", msg) + # This threshold should be synchronized with the value in https://github.com/ClickHouse/ClickHouse/blob/master/docker/test/performance-comparison/report.py#L629 + threshold = 5 + return int(match.group(2).strip()) > threshold if match else False + # Try to fetch status from the report. status = "" message = "" @@ -236,7 +242,7 @@ if __name__ == "__main__": # TODO: Remove me, always green mode for the first time, unless errors status = "success" - if "errors" in message.lower(): + if "errors" in message.lower() or too_many_slow(message.lower()): status = "failure" # TODO: Remove until here except Exception: From b091d85bb1b38bd2bcccf8ed1c1588a56e06b7a7 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 2 Jun 2023 23:30:47 +0200 Subject: [PATCH 0347/1072] Remove strange object storage methods --- .../AzureBlobStorage/AzureObjectStorage.cpp | 70 ++++++++++++------- .../AzureBlobStorage/AzureObjectStorage.h | 4 +- .../Cached/CachedObjectStorage.cpp | 4 +- .../Cached/CachedObjectStorage.h | 2 +- .../DiskObjectStorageMetadata.cpp | 8 +-- .../DiskObjectStorageMetadata.h | 4 +- ...jectStorageRemoteMetadataRestoreHelper.cpp | 12 ++-- src/Disks/ObjectStorages/IObjectStorage.cpp | 26 +++++-- src/Disks/ObjectStorages/IObjectStorage.h | 68 +++++++----------- .../MetadataStorageFromDisk.cpp | 4 +- .../MetadataStorageFromPlainObjectStorage.cpp | 52 +++++++------- .../ObjectStorages/S3/S3ObjectStorage.cpp | 68 +++++------------- src/Disks/ObjectStorages/S3/S3ObjectStorage.h | 7 +- 13 files changed, 156 insertions(+), 173 deletions(-) diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp index c4dd0161c70..0044f465081 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp @@ -67,6 +67,49 @@ bool AzureObjectStorage::exists(const StoredObject & object) const return false; } +void AzureObjectStorage::listObjects(const std::string & path, RelativePathsWithMetadata & children, int max_keys) const +{ + auto client_ptr = client.get(); + + /// What a shame, no Exists method... + Azure::Storage::Blobs::ListBlobsOptions options; + options.Prefix = path; + if (max_keys) + options.PageSizeHint = max_keys; + else + options.PageSizeHint = settings.get()->list_object_keys_size; + Azure::Storage::Blobs::ListBlobsPagedResponse blob_list_response; + + while (true) + { + blob_list_response = client_ptr->ListBlobs(options); + auto blobs_list = blob_list_response.Blobs; + + for (const auto & blob : blobs_list) + { + children.emplace_back( + blob.Name, + ObjectMetadata{ + static_cast(blob.BlobSize), + blob.Details.LastModified.time_since_epoch().count(), + {}}); + } + + if (max_keys) + { + int keys_left = max_keys - static_cast(children.size()); + if (keys_left <= 0) + break; + options.PageSizeHint = keys_left; + } + + if (blob_list_response.HasPage()) + options.ContinuationToken = blob_list_response.NextPageToken; + else + break; + } +} + std::unique_ptr AzureObjectStorage::readObject( /// NOLINT const StoredObject & object, const ReadSettings & read_settings, @@ -146,33 +189,6 @@ std::unique_ptr AzureObjectStorage::writeObject( /// NO patchSettings(write_settings)); } -void AzureObjectStorage::findAllFiles(const std::string & path, RelativePathsWithSize & children, int max_keys) const -{ - auto client_ptr = client.get(); - - Azure::Storage::Blobs::ListBlobsOptions blobs_list_options; - blobs_list_options.Prefix = path; - if (max_keys) - blobs_list_options.PageSizeHint = max_keys; - else - blobs_list_options.PageSizeHint = settings.get()->list_object_keys_size; - - auto blobs_list_response = client_ptr->ListBlobs(blobs_list_options); - for (;;) - { - auto blobs_list = blobs_list_response.Blobs; - - for (const auto & blob : blobs_list) - children.emplace_back(blob.Name, blob.BlobSize); - - if (max_keys && children.size() >= static_cast(max_keys)) - break; - if (!blobs_list_response.HasPage()) - break; - blobs_list_response.MoveToNextPage(); - } -} - /// Remove file. Throws exception if file doesn't exists or it's a directory. void AzureObjectStorage::removeObject(const StoredObject & object) { diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h index 0c2aecd5c62..a36a03bcda4 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h @@ -58,6 +58,8 @@ public: AzureClientPtr && client_, SettingsPtr && settings_); + void listObjects(const std::string & path, RelativePathsWithMetadata & children, int max_keys) const override; + DataSourceDescription getDataSourceDescription() const override { return data_source_description; } std::string getName() const override { return "AzureObjectStorage"; } @@ -84,8 +86,6 @@ public: size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, const WriteSettings & write_settings = {}) override; - void findAllFiles(const std::string & path, RelativePathsWithSize & children, int max_keys) const override; - /// Remove file. Throws exception if file doesn't exists or it's a directory. void removeObject(const StoredObject & object) override; diff --git a/src/Disks/ObjectStorages/Cached/CachedObjectStorage.cpp b/src/Disks/ObjectStorages/Cached/CachedObjectStorage.cpp index acf9430e85c..1d24d9d5411 100644 --- a/src/Disks/ObjectStorages/Cached/CachedObjectStorage.cpp +++ b/src/Disks/ObjectStorages/Cached/CachedObjectStorage.cpp @@ -201,9 +201,9 @@ std::unique_ptr CachedObjectStorage::cloneObjectStorage( return object_storage->cloneObjectStorage(new_namespace, config, config_prefix, context); } -void CachedObjectStorage::findAllFiles(const std::string & path, RelativePathsWithSize & children, int max_keys) const +void CachedObjectStorage::listObjects(const std::string & path, RelativePathsWithMetadata & children, int max_keys) const { - object_storage->findAllFiles(path, children, max_keys); + object_storage->listObjects(path, children, max_keys); } ObjectMetadata CachedObjectStorage::getObjectMetadata(const std::string & path) const diff --git a/src/Disks/ObjectStorages/Cached/CachedObjectStorage.h b/src/Disks/ObjectStorages/Cached/CachedObjectStorage.h index f8e346e1aed..b5186d39c32 100644 --- a/src/Disks/ObjectStorages/Cached/CachedObjectStorage.h +++ b/src/Disks/ObjectStorages/Cached/CachedObjectStorage.h @@ -71,7 +71,7 @@ public: const std::string & config_prefix, ContextPtr context) override; - void findAllFiles(const std::string & path, RelativePathsWithSize & children, int max_keys) const override; + void listObjects(const std::string & path, RelativePathsWithMetadata & children, int max_keys) const override; ObjectMetadata getObjectMetadata(const std::string & path) const override; diff --git a/src/Disks/ObjectStorages/DiskObjectStorageMetadata.cpp b/src/Disks/ObjectStorages/DiskObjectStorageMetadata.cpp index c3284b635da..19d5a8e3567 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorageMetadata.cpp +++ b/src/Disks/ObjectStorages/DiskObjectStorageMetadata.cpp @@ -54,7 +54,7 @@ void DiskObjectStorageMetadata::deserialize(ReadBuffer & buf) assertChar('\n', buf); storage_objects[i].relative_path = object_relative_path; - storage_objects[i].bytes_size = object_size; + storage_objects[i].metadata.size_bytes = object_size; } readIntText(ref_count, buf); @@ -93,9 +93,9 @@ void DiskObjectStorageMetadata::serialize(WriteBuffer & buf, bool sync) const writeIntText(total_size, buf); writeChar('\n', buf); - for (const auto & [object_relative_path, object_size] : storage_objects) + for (const auto & [object_relative_path, object_metadata] : storage_objects) { - writeIntText(object_size, buf); + writeIntText(object_metadata.size_bytes, buf); writeChar('\t', buf); writeEscapedString(object_relative_path, buf); writeChar('\n', buf); @@ -139,7 +139,7 @@ DiskObjectStorageMetadata::DiskObjectStorageMetadata( void DiskObjectStorageMetadata::addObject(const String & path, size_t size) { total_size += size; - storage_objects.emplace_back(path, size); + storage_objects.emplace_back(path, ObjectMetadata{size, {}, {}}); } diff --git a/src/Disks/ObjectStorages/DiskObjectStorageMetadata.h b/src/Disks/ObjectStorages/DiskObjectStorageMetadata.h index a2d0653e4aa..6dced85d0b1 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorageMetadata.h +++ b/src/Disks/ObjectStorages/DiskObjectStorageMetadata.h @@ -21,7 +21,7 @@ private: const std::string & common_metadata_path; /// Relative paths of blobs. - RelativePathsWithSize storage_objects; + RelativePathsWithMetadata storage_objects; const std::string object_storage_root_path; @@ -63,7 +63,7 @@ public: return object_storage_root_path; } - RelativePathsWithSize getBlobsRelativePaths() const + RelativePathsWithMetadata getBlobsRelativePaths() const { return storage_objects; } diff --git a/src/Disks/ObjectStorages/DiskObjectStorageRemoteMetadataRestoreHelper.cpp b/src/Disks/ObjectStorages/DiskObjectStorageRemoteMetadataRestoreHelper.cpp index 4cca89b9a4f..74d1698bf01 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorageRemoteMetadataRestoreHelper.cpp +++ b/src/Disks/ObjectStorages/DiskObjectStorageRemoteMetadataRestoreHelper.cpp @@ -356,7 +356,7 @@ void DiskObjectStorageRemoteMetadataRestoreHelper::restoreFiles(IObjectStorage * LOG_INFO(disk->log, "Starting restore files for disk {}", disk->name); std::vector> results; - auto restore_files = [this, &source_object_storage, &restore_information, &results](const RelativePathsWithSize & objects) + auto restore_files = [this, &source_object_storage, &restore_information, &results](const RelativePathsWithMetadata & objects) { std::vector keys_names; for (const auto & object : objects) @@ -389,8 +389,8 @@ void DiskObjectStorageRemoteMetadataRestoreHelper::restoreFiles(IObjectStorage * return true; }; - RelativePathsWithSize children; - source_object_storage->findAllFiles(restore_information.source_path, children, /* max_keys= */ 0); + RelativePathsWithMetadata children; + source_object_storage->listObjects(restore_information.source_path, children, /* max_keys= */ 0); restore_files(children); @@ -472,7 +472,7 @@ void DiskObjectStorageRemoteMetadataRestoreHelper::restoreFileOperations(IObject || disk->object_storage_root_path != restore_information.source_path; std::set renames; - auto restore_file_operations = [this, &source_object_storage, &restore_information, &renames, &send_metadata](const RelativePathsWithSize & objects) + auto restore_file_operations = [this, &source_object_storage, &restore_information, &renames, &send_metadata](const RelativePathsWithMetadata & objects) { const String rename = "rename"; const String hardlink = "hardlink"; @@ -539,8 +539,8 @@ void DiskObjectStorageRemoteMetadataRestoreHelper::restoreFileOperations(IObject return true; }; - RelativePathsWithSize children; - source_object_storage->findAllFiles(restore_information.source_path + "operations/", children, /* max_keys= */ 0); + RelativePathsWithMetadata children; + source_object_storage->listObjects(restore_information.source_path + "operations/", children, /* max_keys= */ 0); restore_file_operations(children); if (restore_information.detached) diff --git a/src/Disks/ObjectStorages/IObjectStorage.cpp b/src/Disks/ObjectStorages/IObjectStorage.cpp index a810db0cdf8..a5903f9d429 100644 --- a/src/Disks/ObjectStorages/IObjectStorage.cpp +++ b/src/Disks/ObjectStorages/IObjectStorage.cpp @@ -16,15 +16,29 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } -void IObjectStorage::findAllFiles(const std::string &, RelativePathsWithSize &, int) const +bool IObjectStorage::existsOrHasAnyChild(const std::string & path) const { - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "findAllFiles() is not supported"); + RelativePathsWithMetadata files; + listObjects(path, files, 1); + return !files.empty(); } -void IObjectStorage::getDirectoryContents(const std::string &, - RelativePathsWithSize &, - std::vector &) const + +void IObjectStorage::listObjects(const std::string &, RelativePathsWithMetadata &, int) const { - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "getDirectoryContents() is not supported"); + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "listObjects() is not supported"); +} + + +std::optional IObjectStorage::tryGetObjectMetadata(const std::string & path) const +{ + try + { + return getObjectMetadata(path); + } + catch (...) + { + return {}; + } } ThreadPool & IObjectStorage::getThreadPoolWriter() diff --git a/src/Disks/ObjectStorages/IObjectStorage.h b/src/Disks/ObjectStorages/IObjectStorage.h index 8babb2fbf1a..3a0bf1834a1 100644 --- a/src/Disks/ObjectStorages/IObjectStorage.h +++ b/src/Disks/ObjectStorages/IObjectStorage.h @@ -30,19 +30,6 @@ class WriteBufferFromFileBase; using ObjectAttributes = std::map; -struct RelativePathWithSize -{ - String relative_path; - size_t bytes_size; - - RelativePathWithSize() = default; - - RelativePathWithSize(const String & relative_path_, size_t bytes_size_) - : relative_path(relative_path_), bytes_size(bytes_size_) {} -}; -using RelativePathsWithSize = std::vector; - - struct ObjectMetadata { uint64_t size_bytes; @@ -50,6 +37,22 @@ struct ObjectMetadata std::optional attributes; }; +struct RelativePathWithMetadata +{ + String relative_path; + ObjectMetadata metadata{}; + + RelativePathWithMetadata() = default; + + RelativePathWithMetadata(const String & relative_path_, const ObjectMetadata & metadata_) + : relative_path(relative_path_), metadata(metadata_) + {} +}; + +using RelativePathsWithMetadata = std::vector; + + + /// Base class for all object storages which implement some subset of ordinary filesystem operations. /// /// Examples of object storages are S3, Azure Blob Storage, HDFS. @@ -65,36 +68,17 @@ public: /// Object exists or not virtual bool exists(const StoredObject & object) const = 0; - /// List all objects with specific prefix. - /// - /// For example if you do this over filesystem, you should skip folders and - /// return files only, so something like on local filesystem: - /// - /// find . -type f - /// - /// @param children - out files (relative paths) with their sizes. - /// @param max_keys - return not more then max_keys children - /// NOTE: max_keys is not the same as list_object_keys_size (disk property) - /// - if max_keys is set not more then max_keys keys should be returned - /// - however list_object_keys_size determine the size of the batch and should return all keys - /// - /// NOTE: It makes sense only for real object storages (S3, Azure), since - /// it is used only for one of the following: - /// - send_metadata (to restore metadata) - /// - see DiskObjectStorage::restoreMetadataIfNeeded() - /// - MetadataStorageFromPlainObjectStorage - only for s3_plain disk - virtual void findAllFiles(const std::string & path, RelativePathsWithSize & children, int max_keys) const; + /// Object exists or any child on the specified path exists. + /// We have this method because object storages are flat for example + /// /a/b/c/d may exist but /a/b/c may not. So this method will return true for + /// /, /a, /a/b, /a/b/c, /a/b/c/d while exists will return true only for /a/b/c/d + virtual bool existsOrHasAnyChild(const std::string & path) const; - /// Analog of directory content for object storage (object storage does not - /// have "directory" definition, but it can be emulated with usage of - /// "delimiter"), so this is analog of: - /// - /// find . -maxdepth 1 $path - /// - /// Return files in @files and directories in @directories - virtual void getDirectoryContents(const std::string & path, - RelativePathsWithSize & files, - std::vector & directories) const; + virtual void listObjects(const std::string & path, RelativePathsWithMetadata & children, int max_keys) const; + + /// Get object metadata if supported. It should be possible to receive + /// at least size of object + virtual std::optional tryGetObjectMetadata(const std::string & path) const; /// Get object metadata if supported. It should be possible to receive /// at least size of object diff --git a/src/Disks/ObjectStorages/MetadataStorageFromDisk.cpp b/src/Disks/ObjectStorages/MetadataStorageFromDisk.cpp index 6adf24b5bda..9461a82845f 100644 --- a/src/Disks/ObjectStorages/MetadataStorageFromDisk.cpp +++ b/src/Disks/ObjectStorages/MetadataStorageFromDisk.cpp @@ -142,10 +142,10 @@ StoredObjects MetadataStorageFromDisk::getStorageObjects(const std::string & pat object_storage_paths.reserve(object_storage_relative_paths.size()); /// Relative paths -> absolute. - for (auto & [object_relative_path, size] : object_storage_relative_paths) + for (auto & [object_relative_path, object_meta] : object_storage_relative_paths) { auto object_path = fs::path(metadata->getBlobsCommonPrefix()) / object_relative_path; - StoredObject object{ object_path, size, path }; + StoredObject object{ object_path, object_meta.size_bytes, path }; object_storage_paths.push_back(object); } diff --git a/src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.cpp b/src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.cpp index a680a344746..2459fa38da3 100644 --- a/src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.cpp +++ b/src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.cpp @@ -39,11 +39,10 @@ std::filesystem::path MetadataStorageFromPlainObjectStorage::getAbsolutePath(con bool MetadataStorageFromPlainObjectStorage::exists(const std::string & path) const { - RelativePathsWithSize children; /// NOTE: exists() cannot be used here since it works only for existing /// key, and does not work for some intermediate path. - object_storage->findAllFiles(getAbsolutePath(path), children, 1); - return !children.empty(); + std::string abs_path = getAbsolutePath(path); + return object_storage->existsOrHasAnyChild(abs_path); } bool MetadataStorageFromPlainObjectStorage::isFile(const std::string & path) const @@ -55,44 +54,47 @@ bool MetadataStorageFromPlainObjectStorage::isFile(const std::string & path) con bool MetadataStorageFromPlainObjectStorage::isDirectory(const std::string & path) const { std::string directory = getAbsolutePath(path); - trimRight(directory); - directory += "/"; + if (!directory.ends_with('/')) + directory += '/'; - /// NOTE: This check is far from ideal, since it work only if the directory - /// really has files, and has excessive API calls - RelativePathsWithSize files; - std::vector directories; - object_storage->getDirectoryContents(directory, files, directories); - return !files.empty() || !directories.empty(); + RelativePathsWithMetadata files; + object_storage->listObjects(directory, files, 1); + return !files.empty(); } uint64_t MetadataStorageFromPlainObjectStorage::getFileSize(const String & path) const { - RelativePathsWithSize children; - object_storage->findAllFiles(getAbsolutePath(path), children, 1); - if (children.empty()) - return 0; - if (children.size() != 1) - throw Exception(ErrorCodes::LOGICAL_ERROR, "findAllFiles() return multiple paths ({}) for {}", children.size(), path); - return children.front().bytes_size; + RelativePathsWithMetadata children; + auto metadata = object_storage->tryGetObjectMetadata(getAbsolutePath(path)); + if (metadata) + return metadata->size_bytes; + return 0; } std::vector MetadataStorageFromPlainObjectStorage::listDirectory(const std::string & path) const { - RelativePathsWithSize files; - std::vector directories; - object_storage->getDirectoryContents(getAbsolutePath(path), files, directories); + RelativePathsWithMetadata files; + std::string abs_path = getAbsolutePath(path); + if (!abs_path.ends_with('/')) + abs_path += '/'; + + object_storage->listObjects(abs_path, files, 0); std::vector result; for (const auto & path_size : files) + { result.push_back(path_size.relative_path); - for (const auto & directory : directories) - result.push_back(directory); + } + for (auto & row : result) { - chassert(row.starts_with(object_storage_root_path)); - row.erase(0, object_storage_root_path.size()); + chassert(row.starts_with(abs_path)); + row.erase(0, abs_path.size()); + auto slash_pos = row.find_first_of('/'); + if (slash_pos != std::string::npos) + row.erase(slash_pos, row.size() - slash_pos); } + return result; } diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp index 79e9e1141bb..fc7ca4c35b5 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp @@ -183,7 +183,7 @@ std::unique_ptr S3ObjectStorage::writeObject( /// NOLIN disk_write_settings); } -void S3ObjectStorage::findAllFiles(const std::string & path, RelativePathsWithSize & children, int max_keys) const +void S3ObjectStorage::listObjects(const std::string & path, RelativePathsWithMetadata & children, int max_keys) const { auto settings_ptr = s3_settings.get(); auto client_ptr = client.get(); @@ -211,7 +211,7 @@ void S3ObjectStorage::findAllFiles(const std::string & path, RelativePathsWithSi break; for (const auto & object : objects) - children.emplace_back(object.GetKey(), object.GetSize()); + children.emplace_back(object.GetKey(), ObjectMetadata{static_cast(object.GetSize()), object.GetLastModified().Millis() / 1000, {}}); if (max_keys) { @@ -225,54 +225,6 @@ void S3ObjectStorage::findAllFiles(const std::string & path, RelativePathsWithSi } while (outcome.GetResult().GetIsTruncated()); } -void S3ObjectStorage::getDirectoryContents(const std::string & path, - RelativePathsWithSize & files, - std::vector & directories) const -{ - auto settings_ptr = s3_settings.get(); - auto client_ptr = client.get(); - - S3::ListObjectsV2Request request; - request.SetBucket(bucket); - /// NOTE: if you do "ls /foo" instead of "ls /foo/" over S3 with this API - /// it will return only "/foo" itself without any underlying nodes. - if (path.ends_with("/")) - request.SetPrefix(path); - else - request.SetPrefix(path + "/"); - request.SetMaxKeys(settings_ptr->list_object_keys_size); - request.SetDelimiter("/"); - - Aws::S3::Model::ListObjectsV2Outcome outcome; - do - { - ProfileEvents::increment(ProfileEvents::S3ListObjects); - ProfileEvents::increment(ProfileEvents::DiskS3ListObjects); - outcome = client_ptr->ListObjectsV2(request); - throwIfError(outcome); - - auto result = outcome.GetResult(); - auto result_objects = result.GetContents(); - auto result_common_prefixes = result.GetCommonPrefixes(); - - if (result_objects.empty() && result_common_prefixes.empty()) - break; - - for (const auto & object : result_objects) - files.emplace_back(object.GetKey(), object.GetSize()); - - for (const auto & common_prefix : result_common_prefixes) - { - std::string directory = common_prefix.GetPrefix(); - /// Make it compatible with std::filesystem::path::filename() - trimRight(directory, '/'); - directories.emplace_back(directory); - } - - request.SetContinuationToken(outcome.GetResult().GetNextContinuationToken()); - } while (outcome.GetResult().GetIsTruncated()); -} - void S3ObjectStorage::removeObjectImpl(const StoredObject & object, bool if_exists) { auto client_ptr = client.get(); @@ -359,6 +311,22 @@ void S3ObjectStorage::removeObjectsIfExist(const StoredObjects & objects) removeObjectsImpl(objects, true); } +std::optional S3ObjectStorage::tryGetObjectMetadata(const std::string & path) const +{ + auto settings_ptr = s3_settings.get(); + auto object_info = S3::getObjectInfo(*client.get(), bucket, path, {}, settings_ptr->request_settings, /* with_metadata= */ true, /* for_disk_s3= */ true, /* throw_on_error= */ false); + + if (object_info.size == 0 && object_info.last_modification_time == 0 && object_info.metadata.empty()) + return {}; + + ObjectMetadata result; + result.size_bytes = object_info.size; + result.last_modified = object_info.last_modification_time; + result.attributes = object_info.metadata; + + return result; +} + ObjectMetadata S3ObjectStorage::getObjectMetadata(const std::string & path) const { auto settings_ptr = s3_settings.get(); diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.h b/src/Disks/ObjectStorages/S3/S3ObjectStorage.h index 70ed899586e..b0eb01aec0d 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.h +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.h @@ -100,10 +100,7 @@ public: size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, const WriteSettings & write_settings = {}) override; - void findAllFiles(const std::string & path, RelativePathsWithSize & children, int max_keys) const override; - void getDirectoryContents(const std::string & path, - RelativePathsWithSize & files, - std::vector & directories) const override; + void listObjects(const std::string & path, RelativePathsWithMetadata & children, int max_keys) const override; /// Uses `DeleteObjectRequest`. void removeObject(const StoredObject & object) override; @@ -121,6 +118,8 @@ public: ObjectMetadata getObjectMetadata(const std::string & path) const override; + std::optional tryGetObjectMetadata(const std::string & path) const override; + void copyObject( /// NOLINT const StoredObject & object_from, const StoredObject & object_to, From c73c836f9dfc74dbf5e9d213843c03a97e2a5735 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 2 Jun 2023 23:33:04 +0200 Subject: [PATCH 0348/1072] Better --- .../ObjectStorages/MetadataStorageFromPlainObjectStorage.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.cpp b/src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.cpp index 2459fa38da3..1131ec313e4 100644 --- a/src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.cpp +++ b/src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.cpp @@ -86,6 +86,7 @@ std::vector MetadataStorageFromPlainObjectStorage::listDirectory(co result.push_back(path_size.relative_path); } + std::unordered_set duplicates_filter; for (auto & row : result) { chassert(row.starts_with(abs_path)); @@ -93,9 +94,10 @@ std::vector MetadataStorageFromPlainObjectStorage::listDirectory(co auto slash_pos = row.find_first_of('/'); if (slash_pos != std::string::npos) row.erase(slash_pos, row.size() - slash_pos); + duplicates_filter.insert(row); } - return result; + return std::vector(duplicates_filter.begin(), duplicates_filter.end()); } DirectoryIteratorPtr MetadataStorageFromPlainObjectStorage::iterateDirectory(const std::string & path) const From 530f743ed062157cb0fc74d3bbc0bf51b186c0b5 Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Fri, 2 Jun 2023 23:41:25 +0200 Subject: [PATCH 0349/1072] Fix Object data type for StorageDistributed --- src/Storages/StorageDistributed.cpp | 3 ++- src/Storages/StorageDummy.cpp | 3 ++- src/Storages/StorageDummy.h | 9 ++++++++- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/src/Storages/StorageDistributed.cpp b/src/Storages/StorageDistributed.cpp index 43b1333413e..b9625ce2ab7 100644 --- a/src/Storages/StorageDistributed.cpp +++ b/src/Storages/StorageDistributed.cpp @@ -948,7 +948,8 @@ QueryTreeNodePtr buildQueryTreeDistributed(SelectQueryInfo & query_info, auto resolved_remote_storage_id = remote_storage_id; if (remote_storage_id.hasDatabase()) resolved_remote_storage_id = query_context->resolveStorageID(remote_storage_id); - auto storage = std::make_shared(resolved_remote_storage_id, distributed_storage_snapshot->metadata->getColumns()); + + auto storage = std::make_shared(resolved_remote_storage_id, distributed_storage_snapshot->metadata->getColumns(), distributed_storage_snapshot->object_columns); auto table_node = std::make_shared(std::move(storage), query_context); if (table_expression_modifiers) diff --git a/src/Storages/StorageDummy.cpp b/src/Storages/StorageDummy.cpp index e5f3b0b4d8e..4f2fb3883bf 100644 --- a/src/Storages/StorageDummy.cpp +++ b/src/Storages/StorageDummy.cpp @@ -9,8 +9,9 @@ namespace DB { -StorageDummy::StorageDummy(const StorageID & table_id_, const ColumnsDescription & columns_) +StorageDummy::StorageDummy(const StorageID & table_id_, const ColumnsDescription & columns_, ColumnsDescription object_columns_) : IStorage(table_id_) + , object_columns(std::move(object_columns_)) { StorageInMemoryMetadata storage_metadata; storage_metadata.setColumns(columns_); diff --git a/src/Storages/StorageDummy.h b/src/Storages/StorageDummy.h index a7beef9d531..2f9a8beb4d0 100644 --- a/src/Storages/StorageDummy.h +++ b/src/Storages/StorageDummy.h @@ -11,7 +11,7 @@ namespace DB class StorageDummy : public IStorage { public: - StorageDummy(const StorageID & table_id_, const ColumnsDescription & columns_); + StorageDummy(const StorageID & table_id_, const ColumnsDescription & columns_, ColumnsDescription object_columns_ = {}); std::string getName() const override { return "StorageDummy"; } @@ -22,6 +22,11 @@ public: bool supportsDynamicSubcolumns() const override { return true; } bool canMoveConditionsToPrewhere() const override { return false; } + StorageSnapshotPtr getStorageSnapshot(const StorageMetadataPtr & metadata_snapshot, ContextPtr /*query_context*/) const override + { + return std::make_shared(*this, metadata_snapshot, object_columns); + } + QueryProcessingStage::Enum getQueryProcessingStage( ContextPtr local_context, QueryProcessingStage::Enum to_stage, @@ -37,6 +42,8 @@ public: QueryProcessingStage::Enum processed_stage, size_t max_block_size, size_t num_streams) override; +private: + const ColumnsDescription object_columns; }; class ReadFromDummy : public SourceStepWithFilter From e548dce123debf4864348d606629f90844b5e5f8 Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Sat, 3 Jun 2023 00:08:47 +0200 Subject: [PATCH 0350/1072] fix --- tests/ci/performance_comparison_check.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ci/performance_comparison_check.py b/tests/ci/performance_comparison_check.py index 1baf547816f..41ace95c350 100644 --- a/tests/ci/performance_comparison_check.py +++ b/tests/ci/performance_comparison_check.py @@ -220,7 +220,7 @@ if __name__ == "__main__": traceback.print_exc() def too_many_slow(msg): - match = re.search("(|.* )(\d+) slower.*", msg) + match = re.search(r"(|.* )(\d+) slower.*", msg) # This threshold should be synchronized with the value in https://github.com/ClickHouse/ClickHouse/blob/master/docker/test/performance-comparison/report.py#L629 threshold = 5 return int(match.group(2).strip()) > threshold if match else False From 30be723a9aca0a585e92dbaee71d15905d6dc490 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 3 Jun 2023 05:11:02 +0200 Subject: [PATCH 0351/1072] Fix bad code around metadata in RocksDB --- src/Common/ProfileEvents.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp index 9f4fc2d135b..2e0c4b82bba 100644 --- a/src/Common/ProfileEvents.cpp +++ b/src/Common/ProfileEvents.cpp @@ -429,10 +429,10 @@ The server successfully detected this situation and will download merged part fr M(AggregationPreallocatedElementsInHashTables, "How many elements were preallocated in hash tables for aggregation.") \ M(AggregationHashTablesInitializedAsTwoLevel, "How many hash tables were inited as two-level for aggregation.") \ \ - M(MergeTreeMetadataCacheGet, "Number of rocksdb reads(used for merge tree metadata cache)") \ - M(MergeTreeMetadataCachePut, "Number of rocksdb puts(used for merge tree metadata cache)") \ - M(MergeTreeMetadataCacheDelete, "Number of rocksdb deletes(used for merge tree metadata cache)") \ - M(MergeTreeMetadataCacheSeek, "Number of rocksdb seeks(used for merge tree metadata cache)") \ + M(MergeTreeMetadataCacheGet, "Number of rocksdb reads (used for merge tree metadata cache)") \ + M(MergeTreeMetadataCachePut, "Number of rocksdb puts (used for merge tree metadata cache)") \ + M(MergeTreeMetadataCacheDelete, "Number of rocksdb deletes (used for merge tree metadata cache)") \ + M(MergeTreeMetadataCacheSeek, "Number of rocksdb seeks (used for merge tree metadata cache)") \ M(MergeTreeMetadataCacheHit, "Number of times the read of meta file was done from MergeTree metadata cache") \ M(MergeTreeMetadataCacheMiss, "Number of times the read of meta file was not done from MergeTree metadata cache") \ \ From ccba3500dd92eaecd38a56d92b09336af26f371c Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 3 Jun 2023 07:13:26 +0300 Subject: [PATCH 0352/1072] Update StoragePostgreSQL.cpp --- src/Storages/StoragePostgreSQL.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Storages/StoragePostgreSQL.cpp b/src/Storages/StoragePostgreSQL.cpp index ab6660abe00..431285da60d 100644 --- a/src/Storages/StoragePostgreSQL.cpp +++ b/src/Storages/StoragePostgreSQL.cpp @@ -247,6 +247,7 @@ public: if (nested_type->isNullable()) nested_type = static_cast(nested_type.get())->getNestedType(); + /// UUIDs inside arrays are expected to be unquoted in PostgreSQL. const bool quoted = !isUUID(nested_type); writeChar('{', ostr); From 7a7e03a2ffbd879afd5971de6de13c7919a89157 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Sat, 3 Jun 2023 14:16:59 +0300 Subject: [PATCH 0353/1072] Function if constant folding --- src/Functions/if.cpp | 25 +++++++++++++++++++ .../25337_if_constant_folding.reference | 5 ++++ .../0_stateless/25337_if_constant_folding.sql | 1 + 3 files changed, 31 insertions(+) create mode 100644 tests/queries/0_stateless/25337_if_constant_folding.reference create mode 100644 tests/queries/0_stateless/25337_if_constant_folding.sql diff --git a/src/Functions/if.cpp b/src/Functions/if.cpp index 93bdf406f9d..d00e83c4eb7 100644 --- a/src/Functions/if.cpp +++ b/src/Functions/if.cpp @@ -1116,6 +1116,31 @@ public: return res; } + + ColumnPtr getConstantResultForNonConstArguments(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type) const override + { + const ColumnWithTypeAndName & arg_cond = arguments[0]; + if (!arg_cond.column || !isColumnConst(*arg_cond.column)) { + return {}; + } + + const ColumnConst * cond_const_col = checkAndGetColumnConst>(arg_cond.column.get()); + bool condition_value = cond_const_col->getValue(); + + const ColumnWithTypeAndName & arg_then = arguments[1]; + const ColumnWithTypeAndName & arg_else = arguments[2]; + const ColumnWithTypeAndName & potential_const_column = condition_value ? arg_then : arg_else; + + if (!potential_const_column.column || !isColumnConst(*potential_const_column.column)) + return {}; + + auto result = castColumn(potential_const_column, result_type); + if (!isColumnConst(*result)) { + return {}; + } + + return result; + } }; } diff --git a/tests/queries/0_stateless/25337_if_constant_folding.reference b/tests/queries/0_stateless/25337_if_constant_folding.reference new file mode 100644 index 00000000000..9dfcf39f5a7 --- /dev/null +++ b/tests/queries/0_stateless/25337_if_constant_folding.reference @@ -0,0 +1,5 @@ +0 +1 +2 +3 +4 diff --git a/tests/queries/0_stateless/25337_if_constant_folding.sql b/tests/queries/0_stateless/25337_if_constant_folding.sql new file mode 100644 index 00000000000..1610465021b --- /dev/null +++ b/tests/queries/0_stateless/25337_if_constant_folding.sql @@ -0,0 +1 @@ +SELECT cast(number, if(1 = 1, 'UInt64', toString(number))) FROM numbers(5); From 894457d6e1cf7f005e1eb3cae57dded0dd8c4699 Mon Sep 17 00:00:00 2001 From: alesapin Date: Sat, 3 Jun 2023 14:36:52 +0200 Subject: [PATCH 0354/1072] Style fxi --- src/Disks/ObjectStorages/IObjectStorage.h | 1 - .../ObjectStorages/MetadataStorageFromPlainObjectStorage.cpp | 5 ----- 2 files changed, 6 deletions(-) diff --git a/src/Disks/ObjectStorages/IObjectStorage.h b/src/Disks/ObjectStorages/IObjectStorage.h index 3a0bf1834a1..28de80a88cd 100644 --- a/src/Disks/ObjectStorages/IObjectStorage.h +++ b/src/Disks/ObjectStorages/IObjectStorage.h @@ -52,7 +52,6 @@ struct RelativePathWithMetadata using RelativePathsWithMetadata = std::vector; - /// Base class for all object storages which implement some subset of ordinary filesystem operations. /// /// Examples of object storages are S3, Azure Blob Storage, HDFS. diff --git a/src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.cpp b/src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.cpp index 1131ec313e4..c119e9f3adc 100644 --- a/src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.cpp +++ b/src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.cpp @@ -10,11 +10,6 @@ namespace DB { -namespace ErrorCodes -{ - extern const int LOGICAL_ERROR; -} - MetadataStorageFromPlainObjectStorage::MetadataStorageFromPlainObjectStorage( ObjectStoragePtr object_storage_, const std::string & object_storage_root_path_) From 13a122697139f80f34ca006f691fd1f4f20e8528 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Thu, 25 May 2023 16:09:05 +0200 Subject: [PATCH 0355/1072] Fix nested distributed SELECT in WITH clause For the CSE (common scalar expression, form of CTE) form of WITH it will set max_result_rows to 1, since there should not be more rows, but this will be applied for the DESC TABLE as well (service distributed query that required to obtain the structure). Note, that it is a problem only for nested distributed queries because getSubqueryContext() from InterpreterSelectQuery.cpp resets those limits as well, but this does not helps, for the nested DESC since it is executed on the remote node that has max_result_rows=1. Signed-off-by: Azat Khuzhin --- src/Storages/getStructureOfRemoteTable.cpp | 9 +++++++++ .../0_stateless/02768_cse_nested_distributed.reference | 3 +++ .../queries/0_stateless/02768_cse_nested_distributed.sql | 5 +++++ 3 files changed, 17 insertions(+) create mode 100644 tests/queries/0_stateless/02768_cse_nested_distributed.reference create mode 100644 tests/queries/0_stateless/02768_cse_nested_distributed.sql diff --git a/src/Storages/getStructureOfRemoteTable.cpp b/src/Storages/getStructureOfRemoteTable.cpp index b2737249166..e5fc01be9f4 100644 --- a/src/Storages/getStructureOfRemoteTable.cpp +++ b/src/Storages/getStructureOfRemoteTable.cpp @@ -60,6 +60,15 @@ ColumnsDescription getStructureOfRemoteTableInShard( ColumnsDescription res; auto new_context = ClusterProxy::updateSettingsForCluster(cluster, context, context->getSettingsRef(), table_id); + /// Ignore limit for result number of rows (that could be set during handling CSE/CTE), + /// since this is a service query and should not lead to query failure. + { + Settings new_settings = new_context->getSettings(); + new_settings.max_result_rows = 0; + new_settings.max_result_bytes = 0; + new_context->setSettings(new_settings); + } + /// Expect only needed columns from the result of DESC TABLE. NOTE 'comment' column is ignored for compatibility reasons. Block sample_block { diff --git a/tests/queries/0_stateless/02768_cse_nested_distributed.reference b/tests/queries/0_stateless/02768_cse_nested_distributed.reference new file mode 100644 index 00000000000..e8183f05f5d --- /dev/null +++ b/tests/queries/0_stateless/02768_cse_nested_distributed.reference @@ -0,0 +1,3 @@ +1 +1 +1 diff --git a/tests/queries/0_stateless/02768_cse_nested_distributed.sql b/tests/queries/0_stateless/02768_cse_nested_distributed.sql new file mode 100644 index 00000000000..90e526c0d01 --- /dev/null +++ b/tests/queries/0_stateless/02768_cse_nested_distributed.sql @@ -0,0 +1,5 @@ +with (select count() > 0 from remote('127.2', system.settings)) as s select s; +-- nested +with (select count() > 0 from remote('127.2', remote('127.2', system.settings))) as s select s; +-- nested via view() +with (select count() > 0 from remote('127.2', view(select count() from remote('127.2', system.settings)))) as s select s; From e28dfb7ea851844c943a4dbab33dcf6d2f468f4e Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Sat, 3 Jun 2023 21:44:31 +0300 Subject: [PATCH 0356/1072] Updated tests --- tests/queries/0_stateless/00835_if_generic_case.reference | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/00835_if_generic_case.reference b/tests/queries/0_stateless/00835_if_generic_case.reference index 84c036b17ca..8f9c3f6ef13 100644 --- a/tests/queries/0_stateless/00835_if_generic_case.reference +++ b/tests/queries/0_stateless/00835_if_generic_case.reference @@ -3,7 +3,7 @@ 2000-01-01 00:00:00 2000-01-02 2000-01-02 00:00:00 2000-01-01 00:00:00 2000-01-02 2000-01-02 00:00:00 2000-01-01 00:00:00 2000-01-02 2000-01-02 00:00:00 -2000-01-01 00:00:00 2000-01-02 2000-01-02 +2000-01-01 00:00:00 2000-01-02 2000-01-02 00:00:00 2000-01-01 00:00:00 2000-01-02 2000-01-02 2000-01-01 00:00:00 2000-01-02 2000-01-02 2000-01-01 00:00:00 2000-01-02 2000-01-01 00:00:00 From e61131c4bb7fda6a6883c4b3946e6d5862547728 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Sat, 3 Jun 2023 21:45:53 +0300 Subject: [PATCH 0357/1072] QueryNode small fix --- src/Analyzer/QueryNode.cpp | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/src/Analyzer/QueryNode.cpp b/src/Analyzer/QueryNode.cpp index 4c10d76690a..51e3dac781d 100644 --- a/src/Analyzer/QueryNode.cpp +++ b/src/Analyzer/QueryNode.cpp @@ -202,15 +202,16 @@ bool QueryNode::isEqualImpl(const IQueryTreeNode & rhs) const return is_subquery == rhs_typed.is_subquery && is_cte == rhs_typed.is_cte && - cte_name == rhs_typed.cte_name && - projection_columns == rhs_typed.projection_columns && is_distinct == rhs_typed.is_distinct && is_limit_with_ties == rhs_typed.is_limit_with_ties && is_group_by_with_totals == rhs_typed.is_group_by_with_totals && is_group_by_with_rollup == rhs_typed.is_group_by_with_rollup && is_group_by_with_cube == rhs_typed.is_group_by_with_cube && is_group_by_with_grouping_sets == rhs_typed.is_group_by_with_grouping_sets && - is_group_by_all == rhs_typed.is_group_by_all; + is_group_by_all == rhs_typed.is_group_by_all && + cte_name == rhs_typed.cte_name && + projection_columns == rhs_typed.projection_columns && + settings_changes == rhs_typed.settings_changes; } void QueryNode::updateTreeHashImpl(HashState & state) const @@ -239,6 +240,18 @@ void QueryNode::updateTreeHashImpl(HashState & state) const state.update(is_group_by_with_cube); state.update(is_group_by_with_grouping_sets); state.update(is_group_by_all); + + state.update(settings_changes.size()); + + for (const auto & setting_change : settings_changes) + { + state.update(setting_change.name.size()); + state.update(setting_change.name); + + auto setting_change_value_dump = setting_change.value.dump(); + state.update(setting_change_value_dump.size()); + state.update(setting_change_value_dump); + } } QueryTreeNodePtr QueryNode::cloneImpl() const @@ -256,6 +269,7 @@ QueryTreeNodePtr QueryNode::cloneImpl() const result_query_node->is_group_by_all = is_group_by_all; result_query_node->cte_name = cte_name; result_query_node->projection_columns = projection_columns; + result_query_node->settings_changes = settings_changes; return result_query_node; } From 0f37be549236f4b891ec324abd73adfdcc68f0b0 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Thu, 25 May 2023 17:20:07 +0200 Subject: [PATCH 0358/1072] Fix description for BrokenDistributedFilesToInsert Signed-off-by: Azat Khuzhin --- src/Common/CurrentMetrics.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Common/CurrentMetrics.cpp b/src/Common/CurrentMetrics.cpp index 82d68ca8185..63a23c30f84 100644 --- a/src/Common/CurrentMetrics.cpp +++ b/src/Common/CurrentMetrics.cpp @@ -146,7 +146,7 @@ M(OutdatedPartsLoadingThreads, "Number of threads in the threadpool for loading Outdated data parts.") \ M(OutdatedPartsLoadingThreadsActive, "Number of active threads in the threadpool for loading Outdated data parts.") \ M(DistributedFilesToInsert, "Number of pending files to process for asynchronous insertion into Distributed tables. Number of files for every shard is summed.") \ - M(BrokenDistributedFilesToInsert, "Number of files for asynchronous insertion into Distributed tables that has been marked as broken. This metric will starts from 0 on start. Number of files for every shard is summed.") \ + M(BrokenDistributedFilesToInsert, "Number of files for asynchronous insertion into Distributed tables that has been marked as broken. Number of files for every shard is summed.") \ M(TablesToDropQueueSize, "Number of dropped tables, that are waiting for background data removal.") \ M(MaxDDLEntryID, "Max processed DDL entry of DDLWorker.") \ M(MaxPushedDDLEntryID, "Max DDL entry of DDLWorker that pushed to zookeeper.") \ From 69aec7af9bfa475bbc6581f4c582eff6db04d3de Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Thu, 25 May 2023 17:23:39 +0200 Subject: [PATCH 0359/1072] Add new metrics BrokenDistributedBytesToInsert/DistributedBytesToInsert Useful to see at the server status overall. Signed-off-by: Azat Khuzhin --- src/Common/CurrentMetrics.cpp | 2 ++ .../Distributed/DistributedAsyncInsertDirectoryQueue.cpp | 9 +++++++++ .../Distributed/DistributedAsyncInsertDirectoryQueue.h | 2 ++ 3 files changed, 13 insertions(+) diff --git a/src/Common/CurrentMetrics.cpp b/src/Common/CurrentMetrics.cpp index 63a23c30f84..edfbbe17600 100644 --- a/src/Common/CurrentMetrics.cpp +++ b/src/Common/CurrentMetrics.cpp @@ -145,6 +145,8 @@ M(ParquetDecoderThreadsActive, "Number of threads in the ParquetBlockInputFormat thread pool.") \ M(OutdatedPartsLoadingThreads, "Number of threads in the threadpool for loading Outdated data parts.") \ M(OutdatedPartsLoadingThreadsActive, "Number of active threads in the threadpool for loading Outdated data parts.") \ + M(DistributedBytesToInsert, "Number of pending bytes to process for asynchronous insertion into Distributed tables. Number of bytes for every shard is summed.") \ + M(BrokenDistributedBytesToInsert, "Number of bytes for asynchronous insertion into Distributed tables that has been marked as broken. Number of bytes for every shard is summed.") \ M(DistributedFilesToInsert, "Number of pending files to process for asynchronous insertion into Distributed tables. Number of files for every shard is summed.") \ M(BrokenDistributedFilesToInsert, "Number of files for asynchronous insertion into Distributed tables that has been marked as broken. Number of files for every shard is summed.") \ M(TablesToDropQueueSize, "Number of dropped tables, that are waiting for background data removal.") \ diff --git a/src/Storages/Distributed/DistributedAsyncInsertDirectoryQueue.cpp b/src/Storages/Distributed/DistributedAsyncInsertDirectoryQueue.cpp index 9a9a6651bc4..d8d9a0c9d1e 100644 --- a/src/Storages/Distributed/DistributedAsyncInsertDirectoryQueue.cpp +++ b/src/Storages/Distributed/DistributedAsyncInsertDirectoryQueue.cpp @@ -35,6 +35,8 @@ namespace CurrentMetrics extern const Metric DistributedSend; extern const Metric DistributedFilesToInsert; extern const Metric BrokenDistributedFilesToInsert; + extern const Metric DistributedBytesToInsert; + extern const Metric BrokenDistributedBytesToInsert; } namespace fs = std::filesystem; @@ -138,7 +140,9 @@ DistributedAsyncInsertDirectoryQueue::DistributedAsyncInsertDirectoryQueue( , max_sleep_time(storage.getDistributedSettingsRef().monitor_max_sleep_time_ms.totalMilliseconds()) , log(&Poco::Logger::get(getLoggerName())) , monitor_blocker(monitor_blocker_) + , metric_pending_bytes(CurrentMetrics::DistributedBytesToInsert, 0) , metric_pending_files(CurrentMetrics::DistributedFilesToInsert, 0) + , metric_broken_bytes(CurrentMetrics::BrokenDistributedBytesToInsert, 0) , metric_broken_files(CurrentMetrics::BrokenDistributedFilesToInsert, 0) { fs::create_directory(broken_path); @@ -357,6 +361,7 @@ void DistributedAsyncInsertDirectoryQueue::initializeFilesFromDisk() LOG_TRACE(log, "Files set to {}", pending_files.size()); LOG_TRACE(log, "Bytes set to {}", bytes_count); + metric_pending_bytes.changeTo(bytes_count); metric_pending_files.changeTo(pending_files.size()); status.files_count = pending_files.size(); status.bytes_count = bytes_count; @@ -380,6 +385,7 @@ void DistributedAsyncInsertDirectoryQueue::initializeFilesFromDisk() LOG_TRACE(log, "Broken bytes set to {}", broken_bytes_count); metric_broken_files.changeTo(broken_files); + metric_broken_bytes.changeTo(broken_bytes_count); status.broken_files_count = broken_files; status.broken_bytes_count = broken_bytes_count; } @@ -520,6 +526,7 @@ bool DistributedAsyncInsertDirectoryQueue::addFileAndSchedule(const std::string { std::lock_guard lock(status_mutex); metric_pending_files.add(); + metric_pending_bytes.add(file_size); status.bytes_count += file_size; ++status.files_count; } @@ -679,6 +686,7 @@ void DistributedAsyncInsertDirectoryQueue::markAsBroken(const std::string & file status.broken_bytes_count += file_size; metric_broken_files.add(); + metric_broken_bytes.add(file_size); } fs::rename(file_path, broken_file_path); @@ -692,6 +700,7 @@ void DistributedAsyncInsertDirectoryQueue::markAsSend(const std::string & file_p { std::lock_guard status_lock(status_mutex); metric_pending_files.sub(); + metric_pending_bytes.sub(file_size); --status.files_count; status.bytes_count -= file_size; } diff --git a/src/Storages/Distributed/DistributedAsyncInsertDirectoryQueue.h b/src/Storages/Distributed/DistributedAsyncInsertDirectoryQueue.h index de8bb813824..9a8a235e265 100644 --- a/src/Storages/Distributed/DistributedAsyncInsertDirectoryQueue.h +++ b/src/Storages/Distributed/DistributedAsyncInsertDirectoryQueue.h @@ -149,7 +149,9 @@ private: BackgroundSchedulePoolTaskHolder task_handle; + CurrentMetrics::Increment metric_pending_bytes; CurrentMetrics::Increment metric_pending_files; + CurrentMetrics::Increment metric_broken_bytes; CurrentMetrics::Increment metric_broken_files; }; From 009fe3d25e8a755e86d45084f2af5784f4463523 Mon Sep 17 00:00:00 2001 From: kssenii Date: Sat, 3 Jun 2023 21:32:29 +0200 Subject: [PATCH 0360/1072] Add profile events for eviction --- src/Common/ProfileEvents.cpp | 3 +++ src/Interpreters/Cache/FileCache.cpp | 14 +++++++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp index 9f4fc2d135b..8146a5017ad 100644 --- a/src/Common/ProfileEvents.cpp +++ b/src/Common/ProfileEvents.cpp @@ -381,6 +381,9 @@ The server successfully detected this situation and will download merged part fr M(CachedWriteBufferCacheWriteBytes, "Bytes written from source (remote fs, etc) to filesystem cache") \ M(CachedWriteBufferCacheWriteMicroseconds, "Time spent writing data into filesystem cache") \ \ + M(FilesystemCacheEvictedBytes, "Number of bytes evicted from filesystem cache") \ + M(FilesystemCacheEvictedFileSegments, "Number of file segments evicted from filesystem cache") \ + \ M(RemoteFSSeeks, "Total number of seeks for async buffer") \ M(RemoteFSPrefetches, "Number of prefetches made with asynchronous reading from remote filesystem") \ M(RemoteFSCancelledPrefetches, "Number of cancelled prefecthes (because of seek)") \ diff --git a/src/Interpreters/Cache/FileCache.cpp b/src/Interpreters/Cache/FileCache.cpp index ba160a31b73..79a9765108f 100644 --- a/src/Interpreters/Cache/FileCache.cpp +++ b/src/Interpreters/Cache/FileCache.cpp @@ -17,6 +17,12 @@ namespace fs = std::filesystem; +namespace ProfileEvents +{ + extern const Event FilesystemCacheEvictedBytes; + extern const Event FilesystemCacheEvictedFileSegments; +} + namespace { @@ -643,7 +649,9 @@ bool FileCache::tryReserve(FileSegment & file_segment, const size_t size) return PriorityIterationResult::CONTINUE; } - /// TODO: we can resize if partially downloaded instead. + ProfileEvents::increment(ProfileEvents::FilesystemCacheEvictedFileSegments); + ProfileEvents::increment(ProfileEvents::FilesystemCacheEvictedBytes, segment->range().size()); + locked_key.removeFileSegment(segment->offset(), segment->lock()); return PriorityIterationResult::REMOVE_AND_CONTINUE; } @@ -721,6 +729,10 @@ bool FileCache::tryReserve(FileSegment & file_segment, const size_t size) chassert(candidate->releasable()); const auto * segment = candidate->file_segment.get(); + + ProfileEvents::increment(ProfileEvents::FilesystemCacheEvictedFileSegments); + ProfileEvents::increment(ProfileEvents::FilesystemCacheEvictedBytes, segment->range().size()); + locked_key->removeFileSegment(segment->offset(), segment->lock()); segment->getQueueIterator()->remove(cache_lock); From a0df8566051a4c0c163dae14b5e41b7d83b7540c Mon Sep 17 00:00:00 2001 From: alesapin Date: Sat, 3 Jun 2023 23:11:03 +0200 Subject: [PATCH 0361/1072] Able to insert --- .../AzureBlobStorage/AzureBlobStorageAuth.cpp | 17 +++++++++----- src/Storages/StorageAzure.cpp | 4 ++-- src/Storages/StorageAzure.h | 22 +++++++++++++------ 3 files changed, 28 insertions(+), 15 deletions(-) diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp b/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp index 1e06490b5bc..1b62b5fdb05 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp @@ -57,14 +57,22 @@ void validateContainerName(const String & container_name) AzureBlobStorageEndpoint processAzureBlobStorageEndpoint(const Poco::Util::AbstractConfiguration & config, const String & config_prefix) { - String storage_account_url = config.getString(config_prefix + ".storage_account_url"); - validateStorageAccountUrl(storage_account_url); + std::string storage_url; + if (config.has(config_prefix + ".storage_account_url")) + { + storage_url = config.getString(config_prefix + ".storage_account_url"); + validateStorageAccountUrl(storage_url); + } + else + { + storage_url = config.getString(config_prefix + ".connection_string"); + } String container_name = config.getString(config_prefix + ".container_name", "default-container"); validateContainerName(container_name); std::optional container_already_exists {}; if (config.has(config_prefix + ".container_already_exists")) container_already_exists = {config.getBool(config_prefix + ".container_already_exists")}; - return {storage_account_url, container_name, container_already_exists}; + return {storage_url, container_name, container_already_exists}; } @@ -136,10 +144,7 @@ std::unique_ptr getAzureBlobContainerClient( /// If container_already_exists is not set (in config), ignore already exists error. /// (Conflict - The specified container already exists) if (!endpoint.container_already_exists.has_value() && e.StatusCode == Azure::Core::Http::HttpStatusCode::Conflict) - { - tryLogCurrentException("Container already exists, returning the existing container"); return getAzureBlobStorageClientWithAuth(final_url, container_name, config, config_prefix); - } throw; } } diff --git a/src/Storages/StorageAzure.cpp b/src/Storages/StorageAzure.cpp index 30fd3fcbe95..fd250a128c9 100644 --- a/src/Storages/StorageAzure.cpp +++ b/src/Storages/StorageAzure.cpp @@ -236,7 +236,7 @@ AzureClientPtr StorageAzure::createClient(StorageAzure::Configuration configurat StorageAzure::StorageAzure( const Configuration & configuration_, std::unique_ptr && object_storage_, - ContextPtr context_, + ContextPtr, const StorageID & table_id_, const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, @@ -252,7 +252,7 @@ StorageAzure::StorageAzure( , partition_by(partition_by_) { FormatFactory::instance().checkFormatName(configuration.format); - context_->getGlobalContext()->getRemoteHostFilter().checkURL(Poco::URI(configuration.getConnectionURL())); + //context_->getGlobalContext()->getRemoteHostFilter().checkURL(Poco::URI(configuration.getConnectionURL())); StorageInMemoryMetadata storage_metadata; if (columns_.empty()) diff --git a/src/Storages/StorageAzure.h b/src/Storages/StorageAzure.h index b99df2e89a5..6bf18d91265 100644 --- a/src/Storages/StorageAzure.h +++ b/src/Storages/StorageAzure.h @@ -18,6 +18,11 @@ struct AzureSimpleAccountConfiguration std::string storage_account_url; }; +namespace ErrorCodes +{ + extern const int NOT_IMPLEMENTED; +} + using AzureConnectionString = std::string; using AzureCredentials = std::variant; @@ -85,13 +90,16 @@ public: } Pipe read( - const Names & column_names, - const StorageSnapshotPtr & storage_snapshot, - SelectQueryInfo & query_info, - ContextPtr context, - QueryProcessingStage::Enum processed_stage, - size_t max_block_size, - size_t num_streams) override; + const Names &, + const StorageSnapshotPtr &, + SelectQueryInfo &, + ContextPtr, + QueryProcessingStage::Enum, + size_t, + size_t) override + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Read not implemented"); + } SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & /* metadata_snapshot */, ContextPtr context) override; From 57070227a8468f68f9caede43b4d93ad286a2d00 Mon Sep 17 00:00:00 2001 From: alesapin Date: Sat, 3 Jun 2023 23:44:19 +0200 Subject: [PATCH 0362/1072] Fxi --- .../ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp | 4 +++- src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp index 0044f465081..0358b4e915a 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp @@ -91,7 +91,9 @@ void AzureObjectStorage::listObjects(const std::string & path, RelativePathsWith blob.Name, ObjectMetadata{ static_cast(blob.BlobSize), - blob.Details.LastModified.time_since_epoch().count(), + Poco::Timestamp::fromEpochTime( + std::chrono::duration_cast( + blob.Details.LastModified.time_since_epoch()).count()), {}}); } diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp index fc7ca4c35b5..6e63efcc1e3 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp @@ -211,7 +211,7 @@ void S3ObjectStorage::listObjects(const std::string & path, RelativePathsWithMet break; for (const auto & object : objects) - children.emplace_back(object.GetKey(), ObjectMetadata{static_cast(object.GetSize()), object.GetLastModified().Millis() / 1000, {}}); + children.emplace_back(object.GetKey(), ObjectMetadata{static_cast(object.GetSize()), Poco::Timestamp::fromEpochTime(object.GetLastModified().Seconds()), {}}); if (max_keys) { From 6f01e2ad9fd25d3d09828d7fb8847d7e730ccb3f Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 4 Jun 2023 02:49:45 +0200 Subject: [PATCH 0363/1072] Fix incorrect constant folding --- src/Parsers/ExpressionElementParsers.cpp | 6 ++- ..._formatting_and_constant_folding.reference | 4 ++ ...istent_formatting_and_constant_folding.sql | 41 +++++++++++++++++++ 3 files changed, 50 insertions(+), 1 deletion(-) create mode 100644 tests/queries/0_stateless/02782_inconsistent_formatting_and_constant_folding.reference create mode 100644 tests/queries/0_stateless/02782_inconsistent_formatting_and_constant_folding.sql diff --git a/src/Parsers/ExpressionElementParsers.cpp b/src/Parsers/ExpressionElementParsers.cpp index 28cef51e571..3a7e8790bb4 100644 --- a/src/Parsers/ExpressionElementParsers.cpp +++ b/src/Parsers/ExpressionElementParsers.cpp @@ -829,7 +829,11 @@ static bool parseNumber(char * buffer, size_t size, bool negative, int base, Fie if (pos_integer == buffer + size && errno != ERANGE && (!negative || uint_value <= (1ULL << 63))) { - if (negative) + /// -0 should be still parsed as UInt instead of Int, + /// because otherwise it is not preserved during formatting-parsing roundtrip + /// (the signedness is lost during formatting) + + if (negative && uint_value != 0) res = static_cast(-uint_value); else res = uint_value; diff --git a/tests/queries/0_stateless/02782_inconsistent_formatting_and_constant_folding.reference b/tests/queries/0_stateless/02782_inconsistent_formatting_and_constant_folding.reference new file mode 100644 index 00000000000..39d23aa2298 --- /dev/null +++ b/tests/queries/0_stateless/02782_inconsistent_formatting_and_constant_folding.reference @@ -0,0 +1,4 @@ +0 UInt8 -1 Int8 -0 Float64 +0 +0 +0 diff --git a/tests/queries/0_stateless/02782_inconsistent_formatting_and_constant_folding.sql b/tests/queries/0_stateless/02782_inconsistent_formatting_and_constant_folding.sql new file mode 100644 index 00000000000..31278862ab8 --- /dev/null +++ b/tests/queries/0_stateless/02782_inconsistent_formatting_and_constant_folding.sql @@ -0,0 +1,41 @@ +SELECT -0, toTypeName(-0), -1, toTypeName(-1), -0., toTypeName(-0.); + +DROP TABLE IF EXISTS t4; +DROP TABLE IF EXISTS t7; + +create table t4 (c26 String) engine = Log; +create view t7 as select max(ref_3.c26) as c_2_c46_1 from t4 as ref_3; + +select + c_7_c4585_14 as c_4_c4593_5 + from + (select + avg(0) as c_7_c4572_1, + max(-0) as c_7_c4585_14 + from + t7 as ref_0 + group by ref_0.c_2_c46_1) as subq_0 +where c_4_c4593_5 <= multiIf(true, 1, exp10(c_4_c4593_5) <= 1, 1, 1); + +select x as c + from + (select 1 AS k, + max(0) as a, + max(-0) as x + from + t7 GROUP BY k) +where NOT ignore(c); + +SELECT x +FROM +( + SELECT + avg(0) AS c_7_c4572_1, + max(-0) AS x + FROM t7 AS ref_0 + GROUP BY ref_0.c_2_c46_1 +) +WHERE x <= multiIf(true, 1, exp10(x) <= 1, 1, 1); + +DROP TABLE t7; +DROP TABLE t4; From c62558f982366ba2b2bc03cd410bf70840358fcc Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 4 Jun 2023 04:44:51 +0200 Subject: [PATCH 0364/1072] Control memory usage in generateRandom --- src/Storages/StorageGenerateRandom.cpp | 85 ++++++++++++++++++- ...0416_pocopatch_progress_in_http_headers.sh | 1 - 2 files changed, 83 insertions(+), 3 deletions(-) diff --git a/src/Storages/StorageGenerateRandom.cpp b/src/Storages/StorageGenerateRandom.cpp index e48d3187cb2..293beca9c24 100644 --- a/src/Storages/StorageGenerateRandom.cpp +++ b/src/Storages/StorageGenerateRandom.cpp @@ -29,6 +29,7 @@ #include #include +#include #include #include @@ -81,6 +82,66 @@ void fillBufferWithRandomData(char * __restrict data, size_t limit, size_t size_ } +size_t estimateValueSize( + const DataTypePtr type, + UInt64 max_array_length, + UInt64 max_string_length) +{ + if (type->haveMaximumSizeOfValue()) + return type->getMaximumSizeOfValueInMemory(); + + TypeIndex idx = type->getTypeId(); + + switch (idx) + { + case TypeIndex::String: + { + return max_string_length + sizeof(size_t) + 1; + } + + /// The logic in this function should reflect the logic of fillColumnWithRandomData. + case TypeIndex::Array: + { + auto nested_type = typeid_cast(*type).getNestedType(); + return sizeof(size_t) + estimateValueSize(nested_type, max_array_length / 2, max_string_length); + } + + case TypeIndex::Map: + { + const DataTypePtr & nested_type = typeid_cast(*type).getNestedType(); + return sizeof(size_t) + estimateValueSize(nested_type, max_array_length / 2, max_string_length); + } + + case TypeIndex::Tuple: + { + auto elements = typeid_cast(type.get())->getElements(); + const size_t tuple_size = elements.size(); + size_t res = 0; + + for (size_t i = 0; i < tuple_size; ++i) + res += estimateValueSize(elements[i], max_array_length, max_string_length); + + return res; + } + + case TypeIndex::Nullable: + { + auto nested_type = typeid_cast(*type).getNestedType(); + return 1 + estimateValueSize(nested_type, max_array_length, max_string_length); + } + + case TypeIndex::LowCardinality: + { + auto nested_type = typeid_cast(*type).getDictionaryType(); + return sizeof(size_t) + estimateValueSize(nested_type, max_array_length, max_string_length); + } + + default: + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "The 'GenerateRandom' is not implemented for type {}", type->getName()); + } +} + + ColumnPtr fillColumnWithRandomData( const DataTypePtr type, UInt64 limit, @@ -192,7 +253,8 @@ ColumnPtr fillColumnWithRandomData( offsets[i] = offset; } - auto data_column = fillColumnWithRandomData(nested_type, offset, max_array_length, max_string_length, rng, context); + /// This division by two makes the size growth subexponential on depth. + auto data_column = fillColumnWithRandomData(nested_type, offset, max_array_length / 2, max_string_length, rng, context); return ColumnArray::create(data_column, std::move(offsets_column)); } @@ -200,7 +262,7 @@ ColumnPtr fillColumnWithRandomData( case TypeIndex::Map: { const DataTypePtr & nested_type = typeid_cast(*type).getNestedType(); - auto nested_column = fillColumnWithRandomData(nested_type, limit, max_array_length, max_string_length, rng, context); + auto nested_column = fillColumnWithRandomData(nested_type, limit, max_array_length / 2, max_string_length, rng, context); return ColumnMap::create(nested_column); } @@ -597,6 +659,25 @@ Pipe StorageGenerateRandom::read( block_header.insert({std::move(column), name_type.type, name_type.name}); } + /// Correction of block size for wide tables. + size_t preferred_block_size_bytes = context->getSettingsRef().preferred_block_size_bytes; + if (preferred_block_size_bytes) + { + size_t estimated_row_size_bytes = estimateValueSize(std::make_shared(block_header.getDataTypes()), max_array_length, max_string_length); + + size_t estimated_block_size_bytes = 0; + if (common::mulOverflow(max_block_size, estimated_row_size_bytes, estimated_block_size_bytes)) + throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE, "Too large estimated block size in GenerateRandom table: its estimation leads to 64bit overflow"); + chassert(estimated_block_size_bytes != 0); + + if (estimated_block_size_bytes > preferred_block_size_bytes) + { + max_block_size = static_cast(max_block_size * (static_cast(preferred_block_size_bytes) / estimated_block_size_bytes)); + if (max_block_size == 0) + max_block_size = 1; + } + } + /// Will create more seed values for each source from initial seed. pcg64 generate(random_seed); diff --git a/tests/queries/0_stateless/00416_pocopatch_progress_in_http_headers.sh b/tests/queries/0_stateless/00416_pocopatch_progress_in_http_headers.sh index 6e9814cbca8..b2189ab0cc2 100755 --- a/tests/queries/0_stateless/00416_pocopatch_progress_in_http_headers.sh +++ b/tests/queries/0_stateless/00416_pocopatch_progress_in_http_headers.sh @@ -5,7 +5,6 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) . "$CURDIR"/../shell_config.sh ${CLICKHOUSE_CURL} -vsS "${CLICKHOUSE_URL}&max_block_size=5&send_progress_in_http_headers=1&http_headers_progress_interval_ms=0" -d 'SELECT max(number) FROM numbers(10)' 2>&1 | grep -E 'Content-Encoding|X-ClickHouse-Progress|^[0-9]' -# This test will fail with external poco (progress not supported) ${CLICKHOUSE_CURL} -vsS "${CLICKHOUSE_URL}&max_block_size=1&send_progress_in_http_headers=1&http_headers_progress_interval_ms=0&output_format_parallel_formatting=0" -d 'SELECT number FROM numbers(10)' 2>&1 | grep -E 'Content-Encoding|X-ClickHouse-Progress|^[0-9]' ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&max_block_size=1&send_progress_in_http_headers=1&http_headers_progress_interval_ms=0&enable_http_compression=1" -H 'Accept-Encoding: gzip' -d 'SELECT number FROM system.numbers LIMIT 10' | gzip -d From 7ceedbd1806348f3af80d135430db5ca245a341a Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 4 Jun 2023 04:45:55 +0200 Subject: [PATCH 0365/1072] Update tests --- tests/queries/0_stateless/02539_generate_random_map.reference | 4 ++-- .../0_stateless/02586_generate_random_structure.reference | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/queries/0_stateless/02539_generate_random_map.reference b/tests/queries/0_stateless/02539_generate_random_map.reference index c0dc175c3cd..25e7f55667e 100644 --- a/tests/queries/0_stateless/02539_generate_random_map.reference +++ b/tests/queries/0_stateless/02539_generate_random_map.reference @@ -1,2 +1,2 @@ -1 -20 +0 +10 diff --git a/tests/queries/0_stateless/02586_generate_random_structure.reference b/tests/queries/0_stateless/02586_generate_random_structure.reference index d2929fb4564..abe168c59e8 100644 --- a/tests/queries/0_stateless/02586_generate_random_structure.reference +++ b/tests/queries/0_stateless/02586_generate_random_structure.reference @@ -13,7 +13,7 @@ c3.e6 Array(Int256) c4 FixedString(183) c5 IPv4 c6 UInt256 -Tb#yV[>M*ܨ(OR8V1n)H}C\'I7tqnV)䳆qLPoRg<{3iH_m!q\'G 127.48.9.45 ['e1V10','e1V0','e1V10','e1V14','e1V10','e1V14'] [{-13:777622572,102:-1122882357,62:1647813163,-94:2094022166},{-32:1448633509},{},{},{34:1536340393,19:-2049677851,74:65643868,-46:-1990799930,97:-531041081,46:-2634833,14:1581632600,89:-771229823,-105:1238603584},{47:1458809010,109:1640682510,86:1945730198,85:1505847247,35:-35189402}] [153363749503.3642648494826450951141750747382772821825909005880434540971999557,79828591186.7378041015337066268618633118713347614941338787453473118807106292,81672688565.9633830721322966111551266731935181670389237071708068971548883315,573768486971.1812413548839655834002608768736215115033958693122764224003897029,-393925092368.4893467278351090742501814120269109477445490969167853713051140487,46027399426.0865278566391382610843315130162915324295037009704113636499519839] [755855942,1804001770,-78103159,-866181765,731736602,-79599206] [5253556148991564114,4681434929596395351,-7302160004580855709,-3686747220178471318,6288582051009949273,646864891160092871] [17035203905051045016266537043565487029724162173062647021612805252288722534904,-42105881403933504641593145676742477006499618886131028341247993701618141933523,45346626822580305846120377917274679004279343244238782744860626882886217433843,-3660165069803677989574889324494857545543653453780976182221584349306428201647,-23316760935816288837287058499520670431785615691220162210524162590241529297823,6184785563808848524970564618169964412151721224362412457508264894603779018817] ڡ|A"x>rwzZ:j8tZD"Tu2h!WIytPa|\'yofFO\0Ֆ6\fIrESacW<~e lT>P3})w%4@_2N"ІXp$^ҘͰ\04@n\b\r4H 16.177.117.209 7882774382721411359365561736453116698030365959050344381263687375357052837130 -Tb#yV[>M*ܨ(OR8V1n)H}C\'I7tqnV)䳆qLPoRg<{3iH_m!q\'G 127.48.9.45 ['e1V10','e1V0','e1V10','e1V14','e1V10','e1V14'] [{-13:777622572,102:-1122882357,62:1647813163,-94:2094022166},{-32:1448633509},{},{},{34:1536340393,19:-2049677851,74:65643868,-46:-1990799930,97:-531041081,46:-2634833,14:1581632600,89:-771229823,-105:1238603584},{47:1458809010,109:1640682510,86:1945730198,85:1505847247,35:-35189402}] [153363749503.3642648494826450951141750747382772821825909005880434540971999557,79828591186.7378041015337066268618633118713347614941338787453473118807106292,81672688565.9633830721322966111551266731935181670389237071708068971548883315,573768486971.1812413548839655834002608768736215115033958693122764224003897029,-393925092368.4893467278351090742501814120269109477445490969167853713051140487,46027399426.0865278566391382610843315130162915324295037009704113636499519839] [755855942,1804001770,-78103159,-866181765,731736602,-79599206] [5253556148991564114,4681434929596395351,-7302160004580855709,-3686747220178471318,6288582051009949273,646864891160092871] [17035203905051045016266537043565487029724162173062647021612805252288722534904,-42105881403933504641593145676742477006499618886131028341247993701618141933523,45346626822580305846120377917274679004279343244238782744860626882886217433843,-3660165069803677989574889324494857545543653453780976182221584349306428201647,-23316760935816288837287058499520670431785615691220162210524162590241529297823,6184785563808848524970564618169964412151721224362412457508264894603779018817] ڡ|A"x>rwzZ:j8tZD"Tu2h!WIytPa|\'yofFO\0Ֆ6\fIrESacW<~e lT>P3})w%4@_2N"ІXp$^ҘͰ\04@n\b\r4H 16.177.117.209 7882774382721411359365561736453116698030365959050344381263687375357052837130 +Tb#yV[>M*ܨ(OR8V1n)H}C\'I7tqnV)䳆qLPoRg<{3iH_m!q\'G 127.48.9.45 ['e1V10','e1V0','e1V10','e1V14','e1V10','e1V14'] [{-13:237920722},{102:1831835481},{},{},{62:-1960618666,-94:-1219892650},{-32:777622572}] [-431683920736.2529974565693898437068698973660186760023819586689844248966618581,59756148020.9162673945900094878560562068677573399624031604279613893604002735,586508082308.9120376291637372686770291666437007489038136467093669000000245094,-542964250950.8072472621084212227729061994250148872708538789242780170470779236,102348230986.0528243362965635624540408239649901816289081513971979913007157412,856260707339.0308016734722383288485766011293159337170215574648625303012038641] [700742145,583743175,-694731143,205377506,-47073316,524775483] [-1523104801326101990,-3813742700509249040,-198797568561120929,1360848130483946585,-2034445041726915230,6445693522245970031] [-42294009754395695750286152418877456026522055877244789929409825751148328749462,-30114637036117781224059264373564439361653611308928312514969460679379590706382,50790691897328530213645537587874962516097580703236937570724165115560305762147,-53982023979693597997184389721196431225054557132927198244547868871851761566844,-4746897427675718862552189488292169089453556841811840545196357111533622948298,-12014200750706123405006110782843469166121588186681927916239818910819762049960] ڡ|A"x>rwzZ:j8tZD"Tu2h!WIytPa|\'yofFO\0Ֆ6\fIrESacW<~e lT>P3})w%4@_2N"ІXp$^ҘͰ\04@n\b\r4H 16.177.117.209 7882774382721411359365561736453116698030365959050344381263687375357052837130 +Tb#yV[>M*ܨ(OR8V1n)H}C\'I7tqnV)䳆qLPoRg<{3iH_m!q\'G 127.48.9.45 ['e1V10','e1V0','e1V10','e1V14','e1V10','e1V14'] [{-13:237920722},{102:1831835481},{},{},{62:-1960618666,-94:-1219892650},{-32:777622572}] [-431683920736.2529974565693898437068698973660186760023819586689844248966618581,59756148020.9162673945900094878560562068677573399624031604279613893604002735,586508082308.9120376291637372686770291666437007489038136467093669000000245094,-542964250950.8072472621084212227729061994250148872708538789242780170470779236,102348230986.0528243362965635624540408239649901816289081513971979913007157412,856260707339.0308016734722383288485766011293159337170215574648625303012038641] [700742145,583743175,-694731143,205377506,-47073316,524775483] [-1523104801326101990,-3813742700509249040,-198797568561120929,1360848130483946585,-2034445041726915230,6445693522245970031] [-42294009754395695750286152418877456026522055877244789929409825751148328749462,-30114637036117781224059264373564439361653611308928312514969460679379590706382,50790691897328530213645537587874962516097580703236937570724165115560305762147,-53982023979693597997184389721196431225054557132927198244547868871851761566844,-4746897427675718862552189488292169089453556841811840545196357111533622948298,-12014200750706123405006110782843469166121588186681927916239818910819762049960] ڡ|A"x>rwzZ:j8tZD"Tu2h!WIytPa|\'yofFO\0Ֆ6\fIrESacW<~e lT>P3})w%4@_2N"ІXp$^ҘͰ\04@n\b\r4H 16.177.117.209 7882774382721411359365561736453116698030365959050344381263687375357052837130 Tb#yV[>M*ܨ(OR8V1n)H}C\'I7tqnV)䳆qLPoRg<{3iH_m!q\'G 127.48.9.45 ['e1V10'] [{}] [825002272867.1157788721157301271303736024856710948164507982705676578804195475] [1865150610] [7514464811443271056] [33504961604882608369857530219353040639899064613284394558131808339620328539033] ڡ|A"x>rwzZ:j8tZD"Tu2h!WIytPa|\'yofFO\0Ֆ6\fIrESacW<~e lT>P3})w%4@_2N"ІXp$^ҘͰ\04@n\b\r4H 16.177.117.209 7882774382721411359365561736453116698030365959050344381263687375357052837130 c1 LowCardinality(Nullable(UInt64)), c2 Date32, c3 LowCardinality(Nullable(Float64)), c4 Int256, c5 Date32 From 53ec091c8d0b8aae36b2ee533f77b6cecc8dadf5 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 4 Jun 2023 05:00:29 +0200 Subject: [PATCH 0366/1072] Disable skim (Rust library) under memory sanitizer --- rust/skim/CMakeLists.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/rust/skim/CMakeLists.txt b/rust/skim/CMakeLists.txt index 1e7a43aba7c..c2e406ec12f 100644 --- a/rust/skim/CMakeLists.txt +++ b/rust/skim/CMakeLists.txt @@ -14,6 +14,11 @@ if (OS_FREEBSD) return() endif() +if (SANITIZE STREQUAL "memory") + message(STATUS "skim is disabled under memory sanitizer, because the interop is not instrumented properly") + return() +endif() + clickhouse_import_crate(MANIFEST_PATH Cargo.toml) # -Wno-dollar-in-identifier-extension: cxx bridge complies names with '$' From 60c2245da4c1cb23a7e62088b6c409afe941bd56 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Sun, 4 Jun 2023 10:58:05 +0200 Subject: [PATCH 0367/1072] Make 01565_query_loop_after_client_error slightly more robust MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CI founds one failure of this test [1], which is quite interesting, the test itself should not takes too long: 2023.06.04 05:01:36.889144 [ 42287 ] {} DynamicQueryHandler: Request URI: /?query=SELECT+%27Running+test+stateless%2F01565_query_loop_after_client_error.expect+from+pid%3D1053%27&database=system&connect_timeout=30&receive_timeout=30&send_timeout=30&http_connection_timeout=30&http_receive_timeout=30&http_send_timeout=30&output_format_parallel_formatting=0 ... 2023.06.04 05:01:37.364595 [ 42844 ] {fa79939f-1fa0-4f3b-b599-fd2269122d6a} executeQuery: (from [::1]:40264) (comment: 01565_query_loop_after_client_error.expect) INSERT INTO t01565(c0, c1) VALUES (stage: Complete) 2023.06.04 05:01:37.366485 [ 42844 ] {fa79939f-1fa0-4f3b-b599-fd2269122d6a} TCPHandler: Change cancellation status from NOT_CANCELLED to FULLY_CANCELLED. Log message: Received 'Cancel' packet from the client, canceling the query. 2023.06.04 05:01:37.366810 [ 42844 ] {fa79939f-1fa0-4f3b-b599-fd2269122d6a} TCPHandler: Processed in 0.002539914 sec. But for the last INSERT the client itself works **very slow**, which seems was the reason why "\r" had been sent too early and was never interpreted: send: sending "INSERT INTO t01565(c0, c1) VALUES ('1', 1) ;\n" to { exp4 } expect: does " " (spawn_id exp4) match glob pattern "INSERT"? no I expect: does " \u001b[21GI\u001b[0m\u001b[J\u001b[22G" (spawn_id exp4) match glob pattern "INSERT"? no  expect: does " \u001b[21GI\u001b[0m\u001b[J\u001b[22G\u001b[21G" (spawn_id exp4) match glob pattern "INSERT"? no INSERT INTO t expect: does " \u001b[21GI\u001b[0m\u001b[J\u001b[22G\u001b[21GINSERT INTO t\u001b[0m" (spawn_id exp4) match glob pattern "INSERT"? yes expect: set expect_out(0,string) "INSERT" expect: set expect_out(spawn_id) "exp4" expect: set expect_out(buffer) " \u001b[21GI\u001b[0m\u001b[J\u001b[22G\u001b[21GINSERT" send: sending "\r" to { exp4 } expect: does " INTO t\u001b[0m" (spawn_id exp4) match glob pattern "Ok."? no  expect: does " INTO t\u001b[0m\u001b[J" (spawn_id exp4) match glob pattern "Ok."? no  expect: does " INTO t\u001b[0m\u001b[J\u001b[34G" (spawn_id exp4) match glob pattern "Ok."? no  expect: does " INTO t\u001b[0m\u001b[J\u001b[34G\u001b[21G" (spawn_id exp4) match glob pattern "Ok."? no INSERT INTO t01565(c0, c1) VALUES ('1', 1) ;  expect: does " INTO t\u001b[0m\u001b[J\u001b[34G\u001b[21G\u001b[JINSERT INTO t01565\u001b[0;22;33m(\u001b[0mc0\u001b[0;1m,\u001b[0m c1\u001b[0;22;33m)\u001b[0m VALUES \u001b[0;22;33m(\u001b[0;22;36m'1'\u001b[0;1m,\u001b[0m \u001b[0;22;32m1\u001b[0;22;33m)\u001b[0m \u001b[0;1m;\u001b[0m\r\n\r\n\u001b[0m\u001b[1G" (spawn_id exp4) match glob pattern "Ok."? no expect: timed out Here you can see that it matched "INSERT" and then expect receive data from the client almost byte by byte. So I hope that expecting the last part of the query should fix the problem. [1]: https://s3.amazonaws.com/clickhouse-test-reports/50429/228ebab86db95dca1e29967061d245985bc86a0f/stateless_tests__release__s3_storage__[2_2].html Signed-off-by: Azat Khuzhin --- .../01565_query_loop_after_client_error.expect | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/queries/0_stateless/01565_query_loop_after_client_error.expect b/tests/queries/0_stateless/01565_query_loop_after_client_error.expect index 0faf8f0192b..e718fd99b7f 100755 --- a/tests/queries/0_stateless/01565_query_loop_after_client_error.expect +++ b/tests/queries/0_stateless/01565_query_loop_after_client_error.expect @@ -24,24 +24,24 @@ expect "\n:) " send -- "DROP TABLE IF EXISTS t01565;\n" # NOTE: this is important for -mn mode, you should send "\r" only after reading echoed command -expect "DROP" +expect "\r\n" send -- "\r" expect "\nOk." expect "\n:)" send -- "CREATE TABLE t01565 (c0 String, c1 Int32) ENGINE = Memory() ;\n" -expect "CREATE" +expect "\r\n" send -- "\r" expect "\nOk." expect "\n:) " send -- "INSERT INTO t01565(c0, c1) VALUES (\"1\",1) ;\n" -expect "INSERT" +expect "\r\n" send -- "\r" expect "\n:) " send -- "INSERT INTO t01565(c0, c1) VALUES ('1', 1) ;\n" -expect "INSERT" +expect "\r\n" send -- "\r" expect "\nOk." expect "\n:) " From db806bd394c7b7dfe42f225f3c1ad7b1be1f2ea9 Mon Sep 17 00:00:00 2001 From: auxten Date: Sun, 4 Jun 2023 17:44:29 +0800 Subject: [PATCH 0368/1072] Resize underlying vector only pos_offset == vector.size() --- src/IO/WriteBufferFromVector.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/IO/WriteBufferFromVector.h b/src/IO/WriteBufferFromVector.h index 4b2a3581625..c793a34b406 100644 --- a/src/IO/WriteBufferFromVector.h +++ b/src/IO/WriteBufferFromVector.h @@ -86,7 +86,10 @@ private: size_t old_size = vector.size(); /// pos may not be equal to vector.data() + old_size, because WriteBuffer::next() can be used to flush data size_t pos_offset = pos - reinterpret_cast(vector.data()); - vector.resize(old_size * size_multiplier); + if (pos_offset == vector.size()) + { + vector.resize(old_size * size_multiplier); + } internal_buffer = Buffer(reinterpret_cast(vector.data() + pos_offset), reinterpret_cast(vector.data() + vector.size())); working_buffer = internal_buffer; } From 97bd3f048316b421966b2e4d6ced8258808fdefe Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Mon, 22 May 2023 23:25:59 +0800 Subject: [PATCH 0369/1072] Proper mutation of skip indices and projections --- src/Interpreters/MutationsInterpreter.cpp | 93 ++++- src/Interpreters/MutationsInterpreter.h | 1 + src/Storages/MergeTree/IMergeTreeDataPart.cpp | 25 +- src/Storages/MergeTree/IMergeTreeDataPart.h | 2 +- src/Storages/MergeTree/MergeTreeData.cpp | 14 +- src/Storages/MergeTree/MergeTreeData.h | 6 +- .../MergeTree/MergeTreeDataWriter.cpp | 20 +- src/Storages/MergeTree/MutateTask.cpp | 366 +++++++++--------- src/Storages/StorageInMemoryMetadata.cpp | 16 +- src/Storages/StorageInMemoryMetadata.h | 9 +- ...ith_skip_indices_and_projections.reference | 0 ...part_with_skip_indices_and_projections.sql | 31 ++ 12 files changed, 359 insertions(+), 224 deletions(-) create mode 100644 tests/queries/0_stateless/02763_mutate_compact_part_with_skip_indices_and_projections.reference create mode 100644 tests/queries/0_stateless/02763_mutate_compact_part_with_skip_indices_and_projections.sql diff --git a/src/Interpreters/MutationsInterpreter.cpp b/src/Interpreters/MutationsInterpreter.cpp index 713ebade1d5..791018a3f38 100644 --- a/src/Interpreters/MutationsInterpreter.cpp +++ b/src/Interpreters/MutationsInterpreter.cpp @@ -109,13 +109,16 @@ QueryTreeNodePtr prepareQueryAffectedQueryTree(const std::vector & has_index_or_projection) { NameSet new_updated_columns = updated_columns; ColumnDependencies dependencies; while (!new_updated_columns.empty()) { - auto new_dependencies = metadata_snapshot->getColumnDependencies(new_updated_columns, true); + auto new_dependencies = metadata_snapshot->getColumnDependencies(new_updated_columns, true, has_index_or_projection); new_updated_columns.clear(); for (const auto & dependency : new_dependencies) { @@ -288,6 +291,11 @@ bool MutationsInterpreter::Source::materializeTTLRecalculateOnly() const return data && data->getSettings()->materialize_ttl_recalculate_only; } +bool MutationsInterpreter::Source::hasIndexOrProjection(const String & file_name) const +{ + return part && part->checksums.has(file_name); +} + static Names getAvailableColumnsWithVirtuals(StorageMetadataPtr metadata_snapshot, const IStorage & storage) { auto all_columns = metadata_snapshot->getColumns().getNamesOfPhysical(); @@ -524,8 +532,54 @@ void MutationsInterpreter::prepare(bool dry_run) validateUpdateColumns(source, metadata_snapshot, updated_columns, column_to_affected_materialized); } + for (const auto & [_, names] : column_to_affected_materialized) + updated_columns.insert(names.begin(), names.end()); + + std::function has_index_or_projection + = [&](const String & file_name) { return source.hasIndexOrProjection(file_name); }; + if (settings.recalculate_dependencies_of_updated_columns) - dependencies = getAllColumnDependencies(metadata_snapshot, updated_columns); + dependencies = getAllColumnDependencies(metadata_snapshot, updated_columns, has_index_or_projection); + + for (const auto & index : metadata_snapshot->getSecondaryIndices()) + { + if (source.hasIndexOrProjection("skp_idx_" + index.name + ".idx") || source.hasIndexOrProjection("skp_idx_" + index.name + ".idx2")) + { + // If some dependent columns gets mutated + bool mutate = false; + const auto & index_cols = index.expression->getRequiredColumns(); + for (const auto & col : index_cols) + { + if (updated_columns.contains(col)) + { + mutate = true; + break; + } + } + if (mutate) + materialized_indices.insert(index.name); + } + } + + for (const auto & projection : metadata_snapshot->getProjections()) + { + if (source.hasIndexOrProjection(projection.getDirectoryName())) + { + // If some dependent columns gets mutated + bool mutate = false; + const auto & projection_cols = projection.required_columns; + for (const auto & col : projection_cols) + { + if (updated_columns.contains(col)) + { + mutate = true; + break; + } + } + if (mutate) + materialized_projections.insert(projection.name); + } + } std::vector read_columns; /// First, break a sequence of commands into stages. @@ -680,20 +734,27 @@ void MutationsInterpreter::prepare(bool dry_run) if (it == std::cend(indices_desc)) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown index: {}", command.index_name); - auto query = (*it).expression_list_ast->clone(); - auto syntax_result = TreeRewriter(context).analyze(query, all_columns); - const auto required_columns = syntax_result->requiredSourceColumns(); - for (const auto & column : required_columns) - dependencies.emplace(column, ColumnDependency::SKIP_INDEX); - materialized_indices.emplace(command.index_name); + if (!source.hasIndexOrProjection("skp_idx_" + it->name + ".idx") + && !source.hasIndexOrProjection("skp_idx_" + it->name + ".idx2")) + { + auto query = (*it).expression_list_ast->clone(); + auto syntax_result = TreeRewriter(context).analyze(query, all_columns); + const auto required_columns = syntax_result->requiredSourceColumns(); + for (const auto & column : required_columns) + dependencies.emplace(column, ColumnDependency::SKIP_INDEX); + materialized_indices.emplace(command.index_name); + } } else if (command.type == MutationCommand::MATERIALIZE_PROJECTION) { mutation_kind.set(MutationKind::MUTATE_INDEX_PROJECTION); const auto & projection = projections_desc.get(command.projection_name); - for (const auto & column : projection.required_columns) - dependencies.emplace(column, ColumnDependency::PROJECTION); - materialized_projections.emplace(command.projection_name); + if (!source.hasIndexOrProjection(projection.getDirectoryName())) + { + for (const auto & column : projection.required_columns) + dependencies.emplace(column, ColumnDependency::PROJECTION); + materialized_projections.emplace(command.projection_name); + } } else if (command.type == MutationCommand::DROP_INDEX) { @@ -712,7 +773,8 @@ void MutationsInterpreter::prepare(bool dry_run) { // just recalculate ttl_infos without remove expired data auto all_columns_vec = all_columns.getNames(); - auto new_dependencies = metadata_snapshot->getColumnDependencies(NameSet(all_columns_vec.begin(), all_columns_vec.end()), false); + auto new_dependencies = metadata_snapshot->getColumnDependencies( + NameSet(all_columns_vec.begin(), all_columns_vec.end()), false, has_index_or_projection); for (const auto & dependency : new_dependencies) { if (dependency.kind == ColumnDependency::TTL_EXPRESSION) @@ -737,7 +799,8 @@ void MutationsInterpreter::prepare(bool dry_run) } auto all_columns_vec = all_columns.getNames(); - auto all_dependencies = getAllColumnDependencies(metadata_snapshot, NameSet(all_columns_vec.begin(), all_columns_vec.end())); + auto all_dependencies = getAllColumnDependencies( + metadata_snapshot, NameSet(all_columns_vec.begin(), all_columns_vec.end()), has_index_or_projection); for (const auto & dependency : all_dependencies) { @@ -746,7 +809,7 @@ void MutationsInterpreter::prepare(bool dry_run) } /// Recalc only skip indices and projections of columns which could be updated by TTL. - auto new_dependencies = metadata_snapshot->getColumnDependencies(new_updated_columns, true); + auto new_dependencies = metadata_snapshot->getColumnDependencies(new_updated_columns, true, has_index_or_projection); for (const auto & dependency : new_dependencies) { if (dependency.kind == ColumnDependency::SKIP_INDEX || dependency.kind == ColumnDependency::PROJECTION) diff --git a/src/Interpreters/MutationsInterpreter.h b/src/Interpreters/MutationsInterpreter.h index 49ba07641d9..d783b503531 100644 --- a/src/Interpreters/MutationsInterpreter.h +++ b/src/Interpreters/MutationsInterpreter.h @@ -120,6 +120,7 @@ public: bool supportsLightweightDelete() const; bool hasLightweightDeleteMask() const; bool materializeTTLRecalculateOnly() const; + bool hasIndexOrProjection(const String & file_name) const; void read( Stage & first_stage, diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index d27b03fff44..ca814a2afd5 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -632,7 +632,7 @@ void IMergeTreeDataPart::loadColumnsChecksumsIndexes(bool require_columns_checks if (!parent_part) { loadTTLInfos(); - loadProjections(require_columns_checksums, check_consistency); + loadProjections(require_columns_checksums, check_consistency, false /* if_not_loaded */); } if (check_consistency) @@ -690,13 +690,13 @@ void IMergeTreeDataPart::addProjectionPart( const String & projection_name, std::shared_ptr && projection_part) { - /// Here should be a check that projection we are trying to add - /// does not exist, but unfortunately this check fails in tests. - /// TODO: fix. + if (hasProjection(projection_name)) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Projection part {} in part {} is already loaded. This is a bug", projection_name, name); + projection_parts[projection_name] = std::move(projection_part); } -void IMergeTreeDataPart::loadProjections(bool require_columns_checksums, bool check_consistency) +void IMergeTreeDataPart::loadProjections(bool require_columns_checksums, bool check_consistency, bool if_not_loaded) { auto metadata_snapshot = storage.getInMemoryMetadataPtr(); for (const auto & projection : metadata_snapshot->projections) @@ -704,9 +704,18 @@ void IMergeTreeDataPart::loadProjections(bool require_columns_checksums, bool ch auto path = projection.name + ".proj"; if (getDataPartStorage().exists(path)) { - auto part = getProjectionPartBuilder(projection.name).withPartFormatFromDisk().build(); - part->loadColumnsChecksumsIndexes(require_columns_checksums, check_consistency); - addProjectionPart(projection.name, std::move(part)); + if (hasProjection(projection.name)) + { + if (!if_not_loaded) + throw Exception( + ErrorCodes::LOGICAL_ERROR, "Projection part {} in part {} is already loaded. This is a bug", projection.name, name); + } + else + { + auto part = getProjectionPartBuilder(projection.name).withPartFormatFromDisk().build(); + part->loadColumnsChecksumsIndexes(require_columns_checksums, check_consistency); + addProjectionPart(projection.name, std::move(part)); + } } } } diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index 411de3af982..b6b6d8c6693 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -388,7 +388,7 @@ public: bool hasProjection(const String & projection_name) const { return projection_parts.contains(projection_name); } - void loadProjections(bool require_columns_checksums, bool check_consistency); + void loadProjections(bool require_columns_checksums, bool check_consistency, bool if_not_loaded = false); /// Return set of metadata file names without checksums. For example, /// columns.txt or checksums.txt itself. diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 32665429051..0115ce07b2c 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -605,14 +605,14 @@ namespace ExpressionActionsPtr getCombinedIndicesExpression( const KeyDescription & key, - const IndicesDescription & indices, + const MergeTreeIndices & indices, const ColumnsDescription & columns, ContextPtr context) { ASTPtr combined_expr_list = key.expression_list_ast->clone(); for (const auto & index : indices) - for (const auto & index_expr : index.expression_list_ast->children) + for (const auto & index_expr : index->index.expression_list_ast->children) combined_expr_list->children.push_back(index_expr->clone()); auto syntax_result = TreeRewriter(context).analyze(combined_expr_list, columns.getAllPhysical()); @@ -644,14 +644,16 @@ DataTypes MergeTreeData::getMinMaxColumnsTypes(const KeyDescription & partition_ return {}; } -ExpressionActionsPtr MergeTreeData::getPrimaryKeyAndSkipIndicesExpression(const StorageMetadataPtr & metadata_snapshot) const +ExpressionActionsPtr +MergeTreeData::getPrimaryKeyAndSkipIndicesExpression(const StorageMetadataPtr & metadata_snapshot, const MergeTreeIndices & indices) const { - return getCombinedIndicesExpression(metadata_snapshot->getPrimaryKey(), metadata_snapshot->getSecondaryIndices(), metadata_snapshot->getColumns(), getContext()); + return getCombinedIndicesExpression(metadata_snapshot->getPrimaryKey(), indices, metadata_snapshot->getColumns(), getContext()); } -ExpressionActionsPtr MergeTreeData::getSortingKeyAndSkipIndicesExpression(const StorageMetadataPtr & metadata_snapshot) const +ExpressionActionsPtr +MergeTreeData::getSortingKeyAndSkipIndicesExpression(const StorageMetadataPtr & metadata_snapshot, const MergeTreeIndices & indices) const { - return getCombinedIndicesExpression(metadata_snapshot->getSortingKey(), metadata_snapshot->getSecondaryIndices(), metadata_snapshot->getColumns(), getContext()); + return getCombinedIndicesExpression(metadata_snapshot->getSortingKey(), indices, metadata_snapshot->getColumns(), getContext()); } diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index 1c41de6fa19..6fd9d223f32 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -956,8 +956,10 @@ public: /// Get column types required for partition key static DataTypes getMinMaxColumnsTypes(const KeyDescription & partition_key); - ExpressionActionsPtr getPrimaryKeyAndSkipIndicesExpression(const StorageMetadataPtr & metadata_snapshot) const; - ExpressionActionsPtr getSortingKeyAndSkipIndicesExpression(const StorageMetadataPtr & metadata_snapshot) const; + ExpressionActionsPtr + getPrimaryKeyAndSkipIndicesExpression(const StorageMetadataPtr & metadata_snapshot, const MergeTreeIndices & indices) const; + ExpressionActionsPtr + getSortingKeyAndSkipIndicesExpression(const StorageMetadataPtr & metadata_snapshot, const MergeTreeIndices & indices) const; /// Get compression codec for part according to TTL rules and /// section from config.xml. diff --git a/src/Storages/MergeTree/MergeTreeDataWriter.cpp b/src/Storages/MergeTree/MergeTreeDataWriter.cpp index db486b163eb..6ff4d6be870 100644 --- a/src/Storages/MergeTree/MergeTreeDataWriter.cpp +++ b/src/Storages/MergeTree/MergeTreeDataWriter.cpp @@ -398,9 +398,11 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeTempPartImpl( temp_part.temporary_directory_lock = data.getTemporaryPartDirectoryHolder(part_dir); + auto indices = MergeTreeIndexFactory::instance().getMany(metadata_snapshot->getSecondaryIndices()); + /// If we need to calculate some columns to sort. if (metadata_snapshot->hasSortingKey() || metadata_snapshot->hasSecondaryIndices()) - data.getSortingKeyAndSkipIndicesExpression(metadata_snapshot)->execute(block); + data.getSortingKeyAndSkipIndicesExpression(metadata_snapshot, indices)->execute(block); Names sort_columns = metadata_snapshot->getSortingKeyColumns(); SortDescription sort_description; @@ -517,10 +519,16 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeTempPartImpl( /// either default lz4 or compression method with zero thresholds on absolute and relative part size. auto compression_codec = data.getContext()->chooseCompressionCodec(0, 0); - const auto & index_factory = MergeTreeIndexFactory::instance(); - auto out = std::make_unique(new_data_part, metadata_snapshot, columns, - index_factory.getMany(metadata_snapshot->getSecondaryIndices()), compression_codec, - context->getCurrentTransaction(), false, false, context->getWriteSettings()); + auto out = std::make_unique( + new_data_part, + metadata_snapshot, + columns, + indices, + compression_codec, + context->getCurrentTransaction(), + false, + false, + context->getWriteSettings()); out->writeWithPermutation(block, perm_ptr); @@ -606,7 +614,7 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeProjectionPartImpl( /// If we need to calculate some columns to sort. if (metadata_snapshot->hasSortingKey() || metadata_snapshot->hasSecondaryIndices()) - data.getSortingKeyAndSkipIndicesExpression(metadata_snapshot)->execute(block); + data.getSortingKeyAndSkipIndicesExpression(metadata_snapshot, {})->execute(block); Names sort_columns = metadata_snapshot->getSortingKeyColumns(); SortDescription sort_description; diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index 76096d00641..d65897ac97d 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -200,8 +200,7 @@ static void splitAndModifyMutationCommands( { for_file_renames.push_back(command); } - /// If we don't have this column in source part, than we don't need - /// to materialize it + /// If we don't have this column in source part, we don't need to materialize it. else if (part_columns.has(command.column_name)) { if (command.type == MutationCommand::Type::READ_COLUMN) @@ -438,51 +437,13 @@ static ExecuteTTLType shouldExecuteTTL(const StorageMetadataPtr & metadata_snaps } -/// Get skip indices, that should exists in the resulting data part. -static MergeTreeIndices getIndicesForNewDataPart( - const IndicesDescription & all_indices, - const MutationCommands & commands_for_removes) -{ - NameSet removed_indices; - for (const auto & command : commands_for_removes) - if (command.type == MutationCommand::DROP_INDEX) - removed_indices.insert(command.column_name); - - MergeTreeIndices new_indices; - for (const auto & index : all_indices) - if (!removed_indices.contains(index.name)) - new_indices.push_back(MergeTreeIndexFactory::instance().get(index)); - - return new_indices; -} - -static std::vector getProjectionsForNewDataPart( - const ProjectionsDescription & all_projections, - const MutationCommands & commands_for_removes) -{ - NameSet removed_projections; - for (const auto & command : commands_for_removes) - if (command.type == MutationCommand::DROP_PROJECTION) - removed_projections.insert(command.column_name); - - std::vector new_projections; - for (const auto & projection : all_projections) - if (!removed_projections.contains(projection.name)) - new_projections.push_back(&projection); - - return new_projections; -} - - /// Return set of indices which should be recalculated during mutation also /// wraps input stream into additional expression stream static std::set getIndicesToRecalculate( QueryPipelineBuilder & builder, - const NameSet & updated_columns, const StorageMetadataPtr & metadata_snapshot, ContextPtr context, - const NameSet & materialized_indices, - const MergeTreeData::DataPartPtr & source_part) + const NameSet & materialized_indices) { /// Checks if columns used in skipping indexes modified. const auto & index_factory = MergeTreeIndexFactory::instance(); @@ -492,11 +453,7 @@ static std::set getIndicesToRecalculate( for (const auto & index : indices) { - bool has_index = - source_part->checksums.has(INDEX_FILE_PREFIX + index.name + ".idx") || - source_part->checksums.has(INDEX_FILE_PREFIX + index.name + ".idx2"); - // If we ask to materialize and it already exists - if (!has_index && materialized_indices.contains(index.name)) + if (materialized_indices.contains(index.name)) { if (indices_to_recalc.insert(index_factory.get(index)).second) { @@ -505,26 +462,6 @@ static std::set getIndicesToRecalculate( indices_recalc_expr_list->children.push_back(expr->clone()); } } - // If some dependent columns gets mutated - else - { - bool mutate = false; - const auto & index_cols = index.expression->getRequiredColumns(); - for (const auto & col : index_cols) - { - if (updated_columns.contains(col)) - { - mutate = true; - break; - } - } - if (mutate && indices_to_recalc.insert(index_factory.get(index)).second) - { - ASTPtr expr_list = index.expression_list_ast->clone(); - for (const auto & expr : expr_list->children) - indices_recalc_expr_list->children.push_back(expr->clone()); - } - } } if (!indices_to_recalc.empty() && builder.initialized()) @@ -545,37 +482,15 @@ static std::set getIndicesToRecalculate( return indices_to_recalc; } -std::set getProjectionsToRecalculate( - const NameSet & updated_columns, +static std::set getProjectionsToRecalculate( const StorageMetadataPtr & metadata_snapshot, - const NameSet & materialized_projections, - const MergeTreeData::DataPartPtr & source_part) + const NameSet & materialized_projections) { - /// Checks if columns used in projections modified. std::set projections_to_recalc; for (const auto & projection : metadata_snapshot->getProjections()) { - // If we ask to materialize and it doesn't exist - if (!source_part->checksums.has(projection.name + ".proj") && materialized_projections.contains(projection.name)) - { + if (materialized_projections.contains(projection.name)) projections_to_recalc.insert(&projection); - } - else - { - // If some dependent columns gets mutated - bool mutate = false; - const auto & projection_cols = projection.required_columns; - for (const auto & col : projection_cols) - { - if (updated_columns.contains(col)) - { - mutate = true; - break; - } - } - if (mutate) - projections_to_recalc.insert(&projection); - } } return projections_to_recalc; } @@ -618,33 +533,6 @@ static NameSet collectFilesToSkip( /// Do not hardlink this file because it's always rewritten at the end of mutation. files_to_skip.insert(IMergeTreeDataPart::SERIALIZATION_FILE_NAME); - auto new_stream_counts = getStreamCounts(new_part, new_part->getColumns().getNames()); - auto source_updated_stream_counts = getStreamCounts(source_part, updated_header.getNames()); - auto new_updated_stream_counts = getStreamCounts(new_part, updated_header.getNames()); - - /// Skip all modified files in new part. - for (const auto & [stream_name, _] : new_updated_stream_counts) - { - files_to_skip.insert(stream_name + ".bin"); - files_to_skip.insert(stream_name + mrk_extension); - } - - /// Skip files that we read from source part and do not write in new part. - /// E.g. ALTER MODIFY from LowCardinality(String) to String. - for (const auto & [stream_name, _] : source_updated_stream_counts) - { - /// If we read shared stream and do not write it - /// (e.g. while ALTER MODIFY COLUMN from array of Nested type to String), - /// we need to hardlink its files, because they will be lost otherwise. - bool need_hardlink = new_updated_stream_counts[stream_name] == 0 && new_stream_counts[stream_name] != 0; - - if (!need_hardlink) - { - files_to_skip.insert(stream_name + ".bin"); - files_to_skip.insert(stream_name + mrk_extension); - } - } - for (const auto & index : indices_to_recalc) { /// Since MinMax index has .idx2 extension, we need to add correct extension. @@ -655,6 +543,36 @@ static NameSet collectFilesToSkip( for (const auto & projection : projections_to_recalc) files_to_skip.insert(projection->getDirectoryName()); + if (isWidePart(source_part)) + { + auto new_stream_counts = getStreamCounts(new_part, new_part->getColumns().getNames()); + auto source_updated_stream_counts = getStreamCounts(source_part, updated_header.getNames()); + auto new_updated_stream_counts = getStreamCounts(new_part, updated_header.getNames()); + + /// Skip all modified files in new part. + for (const auto & [stream_name, _] : new_updated_stream_counts) + { + files_to_skip.insert(stream_name + ".bin"); + files_to_skip.insert(stream_name + mrk_extension); + } + + /// Skip files that we read from source part and do not write in new part. + /// E.g. ALTER MODIFY from LowCardinality(String) to String. + for (const auto & [stream_name, _] : source_updated_stream_counts) + { + /// If we read shared stream and do not write it + /// (e.g. while ALTER MODIFY COLUMN from array of Nested type to String), + /// we need to hardlink its files, because they will be lost otherwise. + bool need_hardlink = new_updated_stream_counts[stream_name] == 0 && new_stream_counts[stream_name] != 0; + + if (!need_hardlink) + { + files_to_skip.insert(stream_name + ".bin"); + files_to_skip.insert(stream_name + mrk_extension); + } + } + } + return files_to_skip; } @@ -701,57 +619,60 @@ static NameToNameVector collectFilesForRenames( if (source_part->checksums.has(command.column_name + ".proj")) add_rename(command.column_name + ".proj", ""); } - else if (command.type == MutationCommand::Type::DROP_COLUMN) + else if (isWidePart(source_part)) { - ISerialization::StreamCallback callback = [&](const ISerialization::SubstreamPath & substream_path) + if (command.type == MutationCommand::Type::DROP_COLUMN) { - String stream_name = ISerialization::getFileNameForStream({command.column_name, command.data_type}, substream_path); - /// Delete files if they are no longer shared with another column. - if (--stream_counts[stream_name] == 0) + ISerialization::StreamCallback callback = [&](const ISerialization::SubstreamPath & substream_path) { - add_rename(stream_name + ".bin", ""); - add_rename(stream_name + mrk_extension, ""); - } - }; + String stream_name = ISerialization::getFileNameForStream({command.column_name, command.data_type}, substream_path); + /// Delete files if they are no longer shared with another column. + if (--stream_counts[stream_name] == 0) + { + add_rename(stream_name + ".bin", ""); + add_rename(stream_name + mrk_extension, ""); + } + }; - if (auto serialization = source_part->tryGetSerialization(command.column_name)) - serialization->enumerateStreams(callback); - } - else if (command.type == MutationCommand::Type::RENAME_COLUMN) - { - String escaped_name_from = escapeForFileName(command.column_name); - String escaped_name_to = escapeForFileName(command.rename_to); - - ISerialization::StreamCallback callback = [&](const ISerialization::SubstreamPath & substream_path) + if (auto serialization = source_part->tryGetSerialization(command.column_name)) + serialization->enumerateStreams(callback); + } + else if (command.type == MutationCommand::Type::RENAME_COLUMN) { - String stream_from = ISerialization::getFileNameForStream(command.column_name, substream_path); - String stream_to = boost::replace_first_copy(stream_from, escaped_name_from, escaped_name_to); + String escaped_name_from = escapeForFileName(command.column_name); + String escaped_name_to = escapeForFileName(command.rename_to); - if (stream_from != stream_to) + ISerialization::StreamCallback callback = [&](const ISerialization::SubstreamPath & substream_path) { - add_rename(stream_from + ".bin", stream_to + ".bin"); - add_rename(stream_from + mrk_extension, stream_to + mrk_extension); - } - }; + String stream_from = ISerialization::getFileNameForStream(command.column_name, substream_path); + String stream_to = boost::replace_first_copy(stream_from, escaped_name_from, escaped_name_to); - if (auto serialization = source_part->tryGetSerialization(command.column_name)) - serialization->enumerateStreams(callback); - } - else if (command.type == MutationCommand::Type::READ_COLUMN) - { - /// Remove files for streams that exist in source_part, - /// but were removed in new_part by MODIFY COLUMN from - /// type with higher number of streams (e.g. LowCardinality -> String). + if (stream_from != stream_to) + { + add_rename(stream_from + ".bin", stream_to + ".bin"); + add_rename(stream_from + mrk_extension, stream_to + mrk_extension); + } + }; - auto old_streams = getStreamCounts(source_part, source_part->getColumns().getNames()); - auto new_streams = getStreamCounts(new_part, source_part->getColumns().getNames()); - - for (const auto & [old_stream, _] : old_streams) + if (auto serialization = source_part->tryGetSerialization(command.column_name)) + serialization->enumerateStreams(callback); + } + else if (command.type == MutationCommand::Type::READ_COLUMN) { - if (!new_streams.contains(old_stream) && --stream_counts[old_stream] == 0) + /// Remove files for streams that exist in source_part, + /// but were removed in new_part by MODIFY COLUMN from + /// type with higher number of streams (e.g. LowCardinality -> String). + + auto old_streams = getStreamCounts(source_part, source_part->getColumns().getNames()); + auto new_streams = getStreamCounts(new_part, source_part->getColumns().getNames()); + + for (const auto & [old_stream, _] : old_streams) { - add_rename(old_stream + ".bin", ""); - add_rename(old_stream + mrk_extension, ""); + if (!new_streams.contains(old_stream) && --stream_counts[old_stream] == 0) + { + add_rename(old_stream + ".bin", ""); + add_rename(old_stream + mrk_extension, ""); + } } } } @@ -851,11 +772,8 @@ void finalizeMutatedPart( new_data_part->minmax_idx = source_part->minmax_idx; new_data_part->modification_time = time(nullptr); - /// This line should not be here because at that moment - /// of executing of mutation all projections should be loaded. - /// But unfortunately without it some tests fail. - /// TODO: fix. - new_data_part->loadProjections(false, false); + /// Load rest projections which are hardlinked + new_data_part->loadProjections(false, false, true /* if_not_loaded */); /// All information about sizes is stored in checksums. /// It doesn't make sense to touch filesystem for sizes. @@ -917,9 +835,9 @@ struct MutationContext std::vector projections_to_build; IMergeTreeDataPart::MinMaxIndexPtr minmax_idx{nullptr}; - NameSet updated_columns; std::set indices_to_recalc; std::set projections_to_recalc; + MergeTreeData::DataPart::Checksums existing_indices_checksums; NameSet files_to_skip; NameToNameVector files_to_rename; @@ -1331,10 +1249,102 @@ private: /// (which is locked in shared mode when input streams are created) and when inserting new data /// the order is reverse. This annoys TSan even though one lock is locked in shared mode and thus /// deadlock is impossible. - ctx->compression_codec = ctx->data->getCompressionCodecForPart(ctx->source_part->getBytesOnDisk(), ctx->source_part->ttl_infos, ctx->time_of_mutation); + ctx->compression_codec + = ctx->data->getCompressionCodecForPart(ctx->source_part->getBytesOnDisk(), ctx->source_part->ttl_infos, ctx->time_of_mutation); - auto skip_part_indices = MutationHelpers::getIndicesForNewDataPart(ctx->metadata_snapshot->getSecondaryIndices(), ctx->for_file_renames); - ctx->projections_to_build = MutationHelpers::getProjectionsForNewDataPart(ctx->metadata_snapshot->getProjections(), ctx->for_file_renames); + NameSet entries_to_hardlink; + + NameSet removed_indices; + for (const auto & command : ctx->for_file_renames) + { + if (command.type == MutationCommand::DROP_INDEX) + removed_indices.insert(command.column_name); + } + + const auto & indices = ctx->metadata_snapshot->getSecondaryIndices(); + MergeTreeIndices skip_indices; + for (const auto & idx : indices) + { + if (removed_indices.contains(idx.name)) + continue; + + if (ctx->materialized_indices.contains(idx.name)) + skip_indices.push_back(MergeTreeIndexFactory::instance().get(idx)); + + auto hardlink_index = [&](const String & idx_name) + { + if (ctx->source_part->checksums.has(idx_name)) + { + auto it = ctx->source_part->checksums.files.find(idx_name); + if (it != ctx->source_part->checksums.files.end()) + { + entries_to_hardlink.insert(idx_name); + ctx->existing_indices_checksums.addFile(idx_name, it->second.file_size, it->second.file_hash); + } + } + }; + hardlink_index(INDEX_FILE_PREFIX + idx.name + ".idx"); + hardlink_index(INDEX_FILE_PREFIX + idx.name + ".idx2"); + } + + NameSet removed_projections; + for (const auto & command : ctx->for_file_renames) + { + if (command.type == MutationCommand::DROP_PROJECTION) + removed_projections.insert(command.column_name); + } + + const auto & projections = ctx->metadata_snapshot->getProjections(); + for (const auto & projection : projections) + { + if (removed_projections.contains(projection.name)) + continue; + + if (ctx->materialized_projections.contains(projection.name)) + ctx->projections_to_build.push_back(&projection); + + if (ctx->source_part->checksums.has(projection.getDirectoryName())) + entries_to_hardlink.insert(projection.getDirectoryName()); + } + + NameSet hardlinked_files; + /// Create hardlinks for unchanged files + for (auto it = ctx->source_part->getDataPartStorage().iterate(); it->isValid(); it->next()) + { + if (!entries_to_hardlink.contains(it->name())) + continue; + + if (it->isFile()) + { + ctx->new_data_part->getDataPartStorage().createHardLinkFrom( + ctx->source_part->getDataPartStorage(), it->name(), it->name()); + hardlinked_files.insert(it->name()); + } + else + { + // it's a projection part directory + ctx->new_data_part->getDataPartStorage().createProjection(it->name()); + + auto projection_data_part_storage_src = ctx->source_part->getDataPartStorage().getProjection(it->name()); + auto projection_data_part_storage_dst = ctx->new_data_part->getDataPartStorage().getProjection(it->name()); + + for (auto p_it = projection_data_part_storage_src->iterate(); p_it->isValid(); p_it->next()) + { + projection_data_part_storage_dst->createHardLinkFrom( + *projection_data_part_storage_src, p_it->name(), p_it->name()); + + auto file_name_with_projection_prefix = fs::path(projection_data_part_storage_src->getPartDirectory()) / p_it->name(); + hardlinked_files.insert(file_name_with_projection_prefix); + } + } + } + + /// Tracking of hardlinked files required for zero-copy replication. + /// We don't remove them when we delete last copy of source part because + /// new part can use them. + ctx->hardlinked_files.source_table_shared_id = ctx->source_part->storage.getTableSharedID(); + ctx->hardlinked_files.source_part_name = ctx->source_part->name; + ctx->hardlinked_files.hardlinks_from_source_part = std::move(hardlinked_files); if (!ctx->mutating_pipeline_builder.initialized()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot mutate part columns with uninitialized mutations stream. It's a bug"); @@ -1343,8 +1353,8 @@ private: if (ctx->metadata_snapshot->hasPrimaryKey() || ctx->metadata_snapshot->hasSecondaryIndices()) { - builder.addTransform( - std::make_shared(builder.getHeader(), ctx->data->getPrimaryKeyAndSkipIndicesExpression(ctx->metadata_snapshot))); + builder.addTransform(std::make_shared( + builder.getHeader(), ctx->data->getPrimaryKeyAndSkipIndicesExpression(ctx->metadata_snapshot, skip_indices))); builder.addTransform(std::make_shared(builder.getHeader())); } @@ -1361,7 +1371,7 @@ private: ctx->new_data_part, ctx->metadata_snapshot, ctx->new_data_part->getColumns(), - skip_part_indices, + skip_indices, ctx->compression_codec, ctx->txn, /*reset_columns=*/ true, @@ -1381,10 +1391,12 @@ private: void finalize() { ctx->new_data_part->minmax_idx = std::move(ctx->minmax_idx); + ctx->new_data_part->loadProjections(false, false, true /* if_not_loaded */); ctx->mutating_executor.reset(); ctx->mutating_pipeline.reset(); - static_pointer_cast(ctx->out)->finalizePart(ctx->new_data_part, ctx->need_sync); + static_pointer_cast(ctx->out)->finalizePart( + ctx->new_data_part, ctx->need_sync, nullptr, &ctx->existing_indices_checksums); ctx->out.reset(); } @@ -1530,7 +1542,7 @@ private: /// new part can use them. ctx->hardlinked_files.source_table_shared_id = ctx->source_part->storage.getTableSharedID(); ctx->hardlinked_files.source_part_name = ctx->source_part->name; - ctx->hardlinked_files.hardlinks_from_source_part = hardlinked_files; + ctx->hardlinked_files.hardlinks_from_source_part = std::move(hardlinked_files); (*ctx->mutate_entry)->columns_written = ctx->storage_columns.size() - ctx->updated_header.columns(); @@ -1878,14 +1890,10 @@ bool MutateTask::prepare() } else /// TODO: check that we modify only non-key columns in this case. { - /// We will modify only some of the columns. Other columns and key values can be copied as-is. - for (const auto & name_type : ctx->updated_header.getNamesAndTypesList()) - ctx->updated_columns.emplace(name_type.name); - ctx->indices_to_recalc = MutationHelpers::getIndicesToRecalculate( - ctx->mutating_pipeline_builder, ctx->updated_columns, ctx->metadata_snapshot, ctx->context, ctx->materialized_indices, ctx->source_part); - ctx->projections_to_recalc = MutationHelpers::getProjectionsToRecalculate( - ctx->updated_columns, ctx->metadata_snapshot, ctx->materialized_projections, ctx->source_part); + ctx->mutating_pipeline_builder, ctx->metadata_snapshot, ctx->context, ctx->materialized_indices); + + ctx->projections_to_recalc = MutationHelpers::getProjectionsToRecalculate(ctx->metadata_snapshot, ctx->materialized_projections); ctx->files_to_skip = MutationHelpers::collectFilesToSkip( ctx->source_part, diff --git a/src/Storages/StorageInMemoryMetadata.cpp b/src/Storages/StorageInMemoryMetadata.cpp index 45abd4bebef..afe75349864 100644 --- a/src/Storages/StorageInMemoryMetadata.cpp +++ b/src/Storages/StorageInMemoryMetadata.cpp @@ -236,7 +236,10 @@ bool StorageInMemoryMetadata::hasAnyGroupByTTL() const return !table_ttl.group_by_ttl.empty(); } -ColumnDependencies StorageInMemoryMetadata::getColumnDependencies(const NameSet & updated_columns, bool include_ttl_target) const +ColumnDependencies StorageInMemoryMetadata::getColumnDependencies( + const NameSet & updated_columns, + bool include_ttl_target, + const std::function & has_indice_or_projection) const { if (updated_columns.empty()) return {}; @@ -264,10 +267,16 @@ ColumnDependencies StorageInMemoryMetadata::getColumnDependencies(const NameSet }; for (const auto & index : getSecondaryIndices()) - add_dependent_columns(index.expression, indices_columns); + { + if (has_indice_or_projection("skp_idx_" + index.name + ".idx") || has_indice_or_projection("skp_idx_" + index.name + ".idx2")) + add_dependent_columns(index.expression, indices_columns); + } for (const auto & projection : getProjections()) - add_dependent_columns(&projection, projections_columns); + { + if (has_indice_or_projection(projection.getDirectoryName())) + add_dependent_columns(&projection, projections_columns); + } auto add_for_rows_ttl = [&](const auto & expression, auto & to_set) { @@ -312,7 +321,6 @@ ColumnDependencies StorageInMemoryMetadata::getColumnDependencies(const NameSet res.emplace(column, ColumnDependency::TTL_TARGET); return res; - } Block StorageInMemoryMetadata::getSampleBlockInsertable() const diff --git a/src/Storages/StorageInMemoryMetadata.h b/src/Storages/StorageInMemoryMetadata.h index 25618c5b03f..4ed7eb8bf29 100644 --- a/src/Storages/StorageInMemoryMetadata.h +++ b/src/Storages/StorageInMemoryMetadata.h @@ -147,9 +147,12 @@ struct StorageInMemoryMetadata TTLDescriptions getGroupByTTLs() const; bool hasAnyGroupByTTL() const; - /// Returns columns, which will be needed to calculate dependencies (skip - /// indices, TTL expressions) if we update @updated_columns set of columns. - ColumnDependencies getColumnDependencies(const NameSet & updated_columns, bool include_ttl_target) const; + /// Returns columns, which will be needed to calculate dependencies (skip indices, projections, + /// TTL expressions) if we update @updated_columns set of columns. + ColumnDependencies getColumnDependencies( + const NameSet & updated_columns, + bool include_ttl_target, + const std::function & has_indice_or_projection) const; /// Block with ordinary + materialized columns. Block getSampleBlock() const; diff --git a/tests/queries/0_stateless/02763_mutate_compact_part_with_skip_indices_and_projections.reference b/tests/queries/0_stateless/02763_mutate_compact_part_with_skip_indices_and_projections.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02763_mutate_compact_part_with_skip_indices_and_projections.sql b/tests/queries/0_stateless/02763_mutate_compact_part_with_skip_indices_and_projections.sql new file mode 100644 index 00000000000..bb9825fe5a0 --- /dev/null +++ b/tests/queries/0_stateless/02763_mutate_compact_part_with_skip_indices_and_projections.sql @@ -0,0 +1,31 @@ +DROP TABLE IF EXISTS test; + +CREATE TABLE test ( col1 Int64, dt Date ) ENGINE = MergeTree PARTITION BY dt ORDER BY tuple(); + +INSERT INTO test FORMAT Values (1, today()); + +ALTER TABLE test ADD COLUMN col2 String; + +ALTER TABLE test ADD INDEX i1 (col1, col2) TYPE set(100) GRANULARITY 1; + +ALTER TABLE test MATERIALIZE INDEX i1; + +ALTER TABLE test ADD COLUMN col3 String; + +ALTER TABLE test DROP COLUMN col3; + +DROP TABLE IF EXISTS test; + +CREATE TABLE test ( col1 Int64, dt Date ) ENGINE = MergeTree PARTITION BY dt ORDER BY tuple(); + +INSERT INTO test FORMAT Values (1, today()); + +ALTER TABLE test ADD COLUMN col2 String; + +ALTER TABLE test ADD PROJECTION p1 ( SELECT col2, sum(col1) GROUP BY col2 ); + +ALTER TABLE test MATERIALIZE PROJECTION p1; + +ALTER TABLE test ADD COLUMN col3 String; + +ALTER TABLE test DROP COLUMN col3; From 92b2200c55f27eefceb610535e813e62bd49ffce Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Tue, 23 May 2023 20:46:50 +0800 Subject: [PATCH 0370/1072] mutation stages can be empty --- src/Interpreters/MutationsInterpreter.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/Interpreters/MutationsInterpreter.cpp b/src/Interpreters/MutationsInterpreter.cpp index 791018a3f38..1059e1fdae5 100644 --- a/src/Interpreters/MutationsInterpreter.cpp +++ b/src/Interpreters/MutationsInterpreter.cpp @@ -901,6 +901,11 @@ void MutationsInterpreter::prepare(bool dry_run) } } + /// Stages might be empty when we materialize skip indices or projections which don't add any + /// column dependencies. + if (stages.empty()) + stages.emplace_back(context); + is_prepared = true; prepareMutationStages(stages, dry_run); } From 24e015b961189e1b2202cfdf5fefe6c9e5904e71 Mon Sep 17 00:00:00 2001 From: kssenii Date: Sun, 4 Jun 2023 13:34:20 +0200 Subject: [PATCH 0371/1072] Fix --- src/Interpreters/Cache/FileCache.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Interpreters/Cache/FileCache.cpp b/src/Interpreters/Cache/FileCache.cpp index 8a24a4fe5ee..7afd17d32d1 100644 --- a/src/Interpreters/Cache/FileCache.cpp +++ b/src/Interpreters/Cache/FileCache.cpp @@ -650,7 +650,7 @@ bool FileCache::tryReserve(FileSegment & file_segment, const size_t size) } ProfileEvents::increment(ProfileEvents::FilesystemCacheEvictedFileSegments); - ProfileEvents::increment(ProfileEvents::FilesystemCacheEvictedBytes, segment->range().size()); + ProfileEvents::increment(ProfileEvents::FilesystemCacheEvictedBytes, segment->getDownloadedSize(false)); locked_key.removeFileSegment(segment->offset(), segment->lock()); return PriorityIterationResult::REMOVE_AND_CONTINUE; From 46cbdeeb7e7975b25de35fb75912da3a7ece21ec Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Sun, 4 Jun 2023 15:02:46 +0300 Subject: [PATCH 0372/1072] Fixed tests --- src/Functions/if.cpp | 6 ++---- ...olding.reference => 02771_if_constant_folding.reference} | 0 ...f_constant_folding.sql => 02771_if_constant_folding.sql} | 0 3 files changed, 2 insertions(+), 4 deletions(-) rename tests/queries/0_stateless/{25337_if_constant_folding.reference => 02771_if_constant_folding.reference} (100%) rename tests/queries/0_stateless/{25337_if_constant_folding.sql => 02771_if_constant_folding.sql} (100%) diff --git a/src/Functions/if.cpp b/src/Functions/if.cpp index d00e83c4eb7..8d43b3a4ca3 100644 --- a/src/Functions/if.cpp +++ b/src/Functions/if.cpp @@ -1120,9 +1120,8 @@ public: ColumnPtr getConstantResultForNonConstArguments(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type) const override { const ColumnWithTypeAndName & arg_cond = arguments[0]; - if (!arg_cond.column || !isColumnConst(*arg_cond.column)) { + if (!arg_cond.column || !isColumnConst(*arg_cond.column)) return {}; - } const ColumnConst * cond_const_col = checkAndGetColumnConst>(arg_cond.column.get()); bool condition_value = cond_const_col->getValue(); @@ -1135,9 +1134,8 @@ public: return {}; auto result = castColumn(potential_const_column, result_type); - if (!isColumnConst(*result)) { + if (!isColumnConst(*result)) return {}; - } return result; } diff --git a/tests/queries/0_stateless/25337_if_constant_folding.reference b/tests/queries/0_stateless/02771_if_constant_folding.reference similarity index 100% rename from tests/queries/0_stateless/25337_if_constant_folding.reference rename to tests/queries/0_stateless/02771_if_constant_folding.reference diff --git a/tests/queries/0_stateless/25337_if_constant_folding.sql b/tests/queries/0_stateless/02771_if_constant_folding.sql similarity index 100% rename from tests/queries/0_stateless/25337_if_constant_folding.sql rename to tests/queries/0_stateless/02771_if_constant_folding.sql From e24c9267bc12026ff3acde78675464a17f6d5cc1 Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Sun, 4 Jun 2023 20:06:27 +0800 Subject: [PATCH 0373/1072] fix --- src/Interpreters/MutationsInterpreter.cpp | 68 +++++++++-------------- src/Storages/MergeTree/MutateTask.cpp | 39 +++++++------ 2 files changed, 47 insertions(+), 60 deletions(-) diff --git a/src/Interpreters/MutationsInterpreter.cpp b/src/Interpreters/MutationsInterpreter.cpp index 1059e1fdae5..25bb3fc5e82 100644 --- a/src/Interpreters/MutationsInterpreter.cpp +++ b/src/Interpreters/MutationsInterpreter.cpp @@ -532,55 +532,12 @@ void MutationsInterpreter::prepare(bool dry_run) validateUpdateColumns(source, metadata_snapshot, updated_columns, column_to_affected_materialized); } - for (const auto & [_, names] : column_to_affected_materialized) - updated_columns.insert(names.begin(), names.end()); - std::function has_index_or_projection = [&](const String & file_name) { return source.hasIndexOrProjection(file_name); }; if (settings.recalculate_dependencies_of_updated_columns) dependencies = getAllColumnDependencies(metadata_snapshot, updated_columns, has_index_or_projection); - for (const auto & index : metadata_snapshot->getSecondaryIndices()) - { - if (source.hasIndexOrProjection("skp_idx_" + index.name + ".idx") || source.hasIndexOrProjection("skp_idx_" + index.name + ".idx2")) - { - // If some dependent columns gets mutated - bool mutate = false; - const auto & index_cols = index.expression->getRequiredColumns(); - for (const auto & col : index_cols) - { - if (updated_columns.contains(col)) - { - mutate = true; - break; - } - } - if (mutate) - materialized_indices.insert(index.name); - } - } - - for (const auto & projection : metadata_snapshot->getProjections()) - { - if (source.hasIndexOrProjection(projection.getDirectoryName())) - { - // If some dependent columns gets mutated - bool mutate = false; - const auto & projection_cols = projection.required_columns; - for (const auto & col : projection_cols) - { - if (updated_columns.contains(col)) - { - mutate = true; - break; - } - } - if (mutate) - materialized_projections.insert(projection.name); - } - } - std::vector read_columns; /// First, break a sequence of commands into stages. for (auto & command : commands) @@ -869,6 +826,31 @@ void MutationsInterpreter::prepare(bool dry_run) for (const auto & column : changed_columns) stages.back().column_to_updated.emplace( column, std::make_shared(column)); + + for (const auto & index : metadata_snapshot->getSecondaryIndices()) + { + if (source.hasIndexOrProjection("skp_idx_" + index.name + ".idx") + || source.hasIndexOrProjection("skp_idx_" + index.name + ".idx2")) + { + const auto & index_cols = index.expression->getRequiredColumns(); + bool changed = std::any_of( + index_cols.begin(), index_cols.end(), [&](const auto & col) { return changed_columns.contains(col); }); + if (changed) + materialized_indices.insert(index.name); + } + } + + for (const auto & projection : metadata_snapshot->getProjections()) + { + if (source.hasIndexOrProjection(projection.getDirectoryName())) + { + const auto & projection_cols = projection.required_columns; + bool changed = std::any_of( + projection_cols.begin(), projection_cols.end(), [&](const auto & col) { return changed_columns.contains(col); }); + if (changed) + materialized_projections.insert(projection.name); + } + } } if (!unchanged_columns.empty()) diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index d65897ac97d..7031027002d 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -1269,22 +1269,23 @@ private: continue; if (ctx->materialized_indices.contains(idx.name)) - skip_indices.push_back(MergeTreeIndexFactory::instance().get(idx)); - - auto hardlink_index = [&](const String & idx_name) { - if (ctx->source_part->checksums.has(idx_name)) + skip_indices.push_back(MergeTreeIndexFactory::instance().get(idx)); + } + else + { + auto prefix = fmt::format("{}{}.", INDEX_FILE_PREFIX, idx.name); + auto it = ctx->source_part->checksums.files.upper_bound(prefix); + while (it != ctx->source_part->checksums.files.end()) { - auto it = ctx->source_part->checksums.files.find(idx_name); - if (it != ctx->source_part->checksums.files.end()) - { - entries_to_hardlink.insert(idx_name); - ctx->existing_indices_checksums.addFile(idx_name, it->second.file_size, it->second.file_hash); - } + if (!startsWith(it->first, prefix)) + break; + + entries_to_hardlink.insert(it->first); + ctx->existing_indices_checksums.addFile(it->first, it->second.file_size, it->second.file_hash); + ++it; } - }; - hardlink_index(INDEX_FILE_PREFIX + idx.name + ".idx"); - hardlink_index(INDEX_FILE_PREFIX + idx.name + ".idx2"); + } } NameSet removed_projections; @@ -1301,10 +1302,14 @@ private: continue; if (ctx->materialized_projections.contains(projection.name)) + { ctx->projections_to_build.push_back(&projection); - - if (ctx->source_part->checksums.has(projection.getDirectoryName())) - entries_to_hardlink.insert(projection.getDirectoryName()); + } + else + { + if (ctx->source_part->checksums.has(projection.getDirectoryName())) + entries_to_hardlink.insert(projection.getDirectoryName()); + } } NameSet hardlinked_files; @@ -1354,7 +1359,7 @@ private: if (ctx->metadata_snapshot->hasPrimaryKey() || ctx->metadata_snapshot->hasSecondaryIndices()) { builder.addTransform(std::make_shared( - builder.getHeader(), ctx->data->getPrimaryKeyAndSkipIndicesExpression(ctx->metadata_snapshot, skip_indices))); + builder.getHeader(), ctx->data->getPrimaryKeyAndSkipIndicesExpression(ctx->metadata_snapshot, skip_indices))); builder.addTransform(std::make_shared(builder.getHeader())); } From 63eab2783e4e0c23fdbe3e5e233b4e3c0773f40a Mon Sep 17 00:00:00 2001 From: Sergey Kazmin <43613813+yerseg@users.noreply.github.com> Date: Fri, 2 Jun 2023 17:06:19 +0000 Subject: [PATCH 0374/1072] Make typeid_cast for pointers noexcept --- src/Common/typeid_cast.h | 38 ++++++++++++-------------------------- 1 file changed, 12 insertions(+), 26 deletions(-) diff --git a/src/Common/typeid_cast.h b/src/Common/typeid_cast.h index baee3aaf632..e134aac09e4 100644 --- a/src/Common/typeid_cast.h +++ b/src/Common/typeid_cast.h @@ -25,14 +25,14 @@ namespace DB */ template requires std::is_reference_v -To typeid_cast(From & from) +To typeid_cast(From & from) noexcept(false) { try { if ((typeid(From) == typeid(To)) || (typeid(from) == typeid(To))) return static_cast(from); } - catch (const std::exception & e) + catch (const std::bad_typeid & e) { throw DB::Exception::createDeprecated(e.what(), DB::ErrorCodes::LOGICAL_ERROR); } @@ -44,19 +44,12 @@ To typeid_cast(From & from) template requires std::is_pointer_v -To typeid_cast(From * from) +To typeid_cast(From * from) noexcept { - try - { - if ((typeid(From) == typeid(std::remove_pointer_t)) || (from && typeid(*from) == typeid(std::remove_pointer_t))) - return static_cast(from); - else - return nullptr; - } - catch (const std::exception & e) - { - throw DB::Exception::createDeprecated(e.what(), DB::ErrorCodes::LOGICAL_ERROR); - } + if ((typeid(From) == typeid(std::remove_pointer_t)) || (from && typeid(*from) == typeid(std::remove_pointer_t))) + return static_cast(from); + else + return nullptr; } namespace detail @@ -79,17 +72,10 @@ inline constexpr bool is_shared_ptr_v = is_shared_ptr::value; template requires detail::is_shared_ptr_v -To typeid_cast(const std::shared_ptr & from) +To typeid_cast(const std::shared_ptr & from) noexcept { - try - { - if ((typeid(From) == typeid(typename To::element_type)) || (from && typeid(*from) == typeid(typename To::element_type))) - return std::static_pointer_cast(from); - else - return nullptr; - } - catch (const std::exception & e) - { - throw DB::Exception::createDeprecated(e.what(), DB::ErrorCodes::LOGICAL_ERROR); - } + if ((typeid(From) == typeid(typename To::element_type)) || (from && typeid(*from) == typeid(typename To::element_type))) + return std::static_pointer_cast(from); + else + return nullptr; } From a3a12834671c95914103e59c24cf68fadc40f68f Mon Sep 17 00:00:00 2001 From: Sergey Kazmin <43613813+yerseg@users.noreply.github.com> Date: Sat, 3 Jun 2023 19:36:11 +0300 Subject: [PATCH 0375/1072] remove try-catch from the impl of typeid_cast for refs --- src/Common/typeid_cast.h | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/src/Common/typeid_cast.h b/src/Common/typeid_cast.h index e134aac09e4..f1ded97a9f1 100644 --- a/src/Common/typeid_cast.h +++ b/src/Common/typeid_cast.h @@ -27,15 +27,8 @@ template requires std::is_reference_v To typeid_cast(From & from) noexcept(false) { - try - { - if ((typeid(From) == typeid(To)) || (typeid(from) == typeid(To))) - return static_cast(from); - } - catch (const std::bad_typeid & e) - { - throw DB::Exception::createDeprecated(e.what(), DB::ErrorCodes::LOGICAL_ERROR); - } + if ((typeid(From) == typeid(To)) || (typeid(from) == typeid(To))) + return static_cast(from); throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Bad cast from type {} to {}", demangle(typeid(from).name()), demangle(typeid(To).name())); From 0f4dd26cebbcf9201124287e8d31acda92a9e9f7 Mon Sep 17 00:00:00 2001 From: alesapin Date: Sun, 4 Jun 2023 16:03:44 +0200 Subject: [PATCH 0376/1072] Add async iteration to object storage --- src/Common/CurrentMetrics.cpp | 4 + .../AzureBlobStorage/AzureObjectStorage.cpp | 69 ++++++++++++++++++ .../AzureBlobStorage/AzureObjectStorage.h | 2 + src/Disks/ObjectStorages/IObjectStorage.cpp | 9 +++ src/Disks/ObjectStorages/IObjectStorage.h | 7 ++ .../ObjectStorages/ObjectStorageIterator.cpp | 20 +++++ .../ObjectStorages/ObjectStorageIterator.h | 53 ++++++++++++++ .../ObjectStorageIteratorAsync.cpp | 64 ++++++++++++++++ .../ObjectStorageIteratorAsync.h | 58 +++++++++++++++ .../ObjectStorages/S3/S3ObjectStorage.cpp | 73 +++++++++++++++++++ src/Disks/ObjectStorages/S3/S3ObjectStorage.h | 2 + 11 files changed, 361 insertions(+) create mode 100644 src/Disks/ObjectStorages/ObjectStorageIterator.cpp create mode 100644 src/Disks/ObjectStorages/ObjectStorageIterator.h create mode 100644 src/Disks/ObjectStorages/ObjectStorageIteratorAsync.cpp create mode 100644 src/Disks/ObjectStorages/ObjectStorageIteratorAsync.h diff --git a/src/Common/CurrentMetrics.cpp b/src/Common/CurrentMetrics.cpp index 82d68ca8185..4c858ee788d 100644 --- a/src/Common/CurrentMetrics.cpp +++ b/src/Common/CurrentMetrics.cpp @@ -131,6 +131,10 @@ M(DistributedInsertThreadsActive, "Number of threads used for INSERT into Distributed running a task.") \ M(StorageS3Threads, "Number of threads in the StorageS3 thread pool.") \ M(StorageS3ThreadsActive, "Number of threads in the StorageS3 thread pool running a task.") \ + M(ObjectStorageS3Threads, "Number of threads in the S3ObjectStorage thread pool.") \ + M(ObjectStorageS3ThreadsActive, "Number of threads in the S3ObjectStorage thread pool running a task.") \ + M(ObjectStorageAzureThreads, "Number of threads in the AzureObjectStorage thread pool.") \ + M(ObjectStorageAzureThreadsActive, "Number of threads in the AzureObjectStorage thread pool running a task.") \ M(MergeTreePartsLoaderThreads, "Number of threads in the MergeTree parts loader thread pool.") \ M(MergeTreePartsLoaderThreadsActive, "Number of threads in the MergeTree parts loader thread pool running a task.") \ M(MergeTreePartsCleanerThreads, "Number of threads in the MergeTree parts cleaner thread pool.") \ diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp index 0358b4e915a..23a0da39dd3 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp @@ -11,9 +11,16 @@ #include #include +#include #include #include +namespace CurrentMetrics +{ + extern const Metric ObjectStorageAzureThreads; + extern const Metric ObjectStorageAzureThreadsActive; + +} namespace DB { @@ -26,6 +33,60 @@ namespace ErrorCodes } +namespace +{ + +class AzureIteratorAsync final : public IObjectStorageIteratorAsync +{ +public: + AzureIteratorAsync( + const std::string & path_prefix, + std::shared_ptr client_, + size_t max_list_size) + : IObjectStorageIteratorAsync( + CurrentMetrics::ObjectStorageAzureThreads, + CurrentMetrics::ObjectStorageAzureThreadsActive, + "ListObjectAzure") + , client(client_) + { + + options.Prefix = path_prefix; + options.PageSizeHint = static_cast(max_list_size); + } + +private: + bool getBatchAndCheckNext(RelativePathsWithMetadata & batch) override + { + auto outcome = client->ListBlobs(options); + auto blob_list_response = client->ListBlobs(options); + auto blobs_list = blob_list_response.Blobs; + + for (const auto & blob : blobs_list) + { + batch.emplace_back( + blob.Name, + ObjectMetadata{ + static_cast(blob.BlobSize), + Poco::Timestamp::fromEpochTime( + std::chrono::duration_cast( + blob.Details.LastModified.time_since_epoch()).count()), + {}}); + } + + options.ContinuationToken = blob_list_response.NextPageToken; + if (blob_list_response.HasPage()) + return true; + + return false; + } + + std::shared_ptr client; + Azure::Storage::Blobs::ListBlobsOptions options; +}; + +} + + AzureObjectStorage::AzureObjectStorage( const String & name_, AzureClientPtr && client_, @@ -67,6 +128,14 @@ bool AzureObjectStorage::exists(const StoredObject & object) const return false; } +ObjectStorageIteratorPtr AzureObjectStorage::iterate(const std::string & path_prefix) const +{ + auto settings_ptr = settings.get(); + auto client_ptr = client.get(); + + return std::make_shared(path_prefix, client_ptr, settings_ptr->list_object_keys_size); +} + void AzureObjectStorage::listObjects(const std::string & path, RelativePathsWithMetadata & children, int max_keys) const { auto client_ptr = client.get(); diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h index a36a03bcda4..5b08ceb80e3 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h @@ -60,6 +60,8 @@ public: void listObjects(const std::string & path, RelativePathsWithMetadata & children, int max_keys) const override; + ObjectStorageIteratorPtr iterate(const std::string & path_prefix) const override; + DataSourceDescription getDataSourceDescription() const override { return data_source_description; } std::string getName() const override { return "AzureObjectStorage"; } diff --git a/src/Disks/ObjectStorages/IObjectStorage.cpp b/src/Disks/ObjectStorages/IObjectStorage.cpp index a5903f9d429..ea22294224c 100644 --- a/src/Disks/ObjectStorages/IObjectStorage.cpp +++ b/src/Disks/ObjectStorages/IObjectStorage.cpp @@ -5,6 +5,7 @@ #include #include #include +#include namespace DB @@ -29,6 +30,14 @@ void IObjectStorage::listObjects(const std::string &, RelativePathsWithMetadata } +ObjectStorageIteratorPtr IObjectStorage::iterate(const std::string & path_prefix) const +{ + RelativePathsWithMetadata files; + listObjects(path_prefix, files, 0); + + return std::make_shared(std::move(files)); +} + std::optional IObjectStorage::tryGetObjectMetadata(const std::string & path) const { try diff --git a/src/Disks/ObjectStorages/IObjectStorage.h b/src/Disks/ObjectStorages/IObjectStorage.h index 28de80a88cd..32f9d1ba764 100644 --- a/src/Disks/ObjectStorages/IObjectStorage.h +++ b/src/Disks/ObjectStorages/IObjectStorage.h @@ -20,6 +20,9 @@ #include #include #include +#include +#include +#include namespace DB @@ -51,6 +54,8 @@ struct RelativePathWithMetadata using RelativePathsWithMetadata = std::vector; +class IObjectStorageIterator; +using ObjectStorageIteratorPtr = std::shared_ptr; /// Base class for all object storages which implement some subset of ordinary filesystem operations. /// @@ -75,6 +80,8 @@ public: virtual void listObjects(const std::string & path, RelativePathsWithMetadata & children, int max_keys) const; + virtual ObjectStorageIteratorPtr iterate(const std::string & path_prefix) const; + /// Get object metadata if supported. It should be possible to receive /// at least size of object virtual std::optional tryGetObjectMetadata(const std::string & path) const; diff --git a/src/Disks/ObjectStorages/ObjectStorageIterator.cpp b/src/Disks/ObjectStorages/ObjectStorageIterator.cpp new file mode 100644 index 00000000000..188b743958c --- /dev/null +++ b/src/Disks/ObjectStorages/ObjectStorageIterator.cpp @@ -0,0 +1,20 @@ +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + +RelativePathWithMetadata ObjectStorageIteratorFromList::current() const +{ + if (!isValid()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Trying to access invalid iterator"); + + return *batch_iterator; +} + +} diff --git a/src/Disks/ObjectStorages/ObjectStorageIterator.h b/src/Disks/ObjectStorages/ObjectStorageIterator.h new file mode 100644 index 00000000000..c3afd395a74 --- /dev/null +++ b/src/Disks/ObjectStorages/ObjectStorageIterator.h @@ -0,0 +1,53 @@ +#pragma once + +#include +#include + +namespace DB +{ + +class IObjectStorageIterator +{ +public: + virtual void next() = 0; + virtual bool isValid() const = 0; + virtual RelativePathWithMetadata current() const = 0; + virtual size_t getAccumulatedSize() const = 0; + + virtual ~IObjectStorageIterator() = default; +}; + +using ObjectStorageIteratorPtr = std::shared_ptr; + +class ObjectStorageIteratorFromList : public IObjectStorageIterator +{ +public: + explicit ObjectStorageIteratorFromList(RelativePathsWithMetadata && batch_) + : batch(std::move(batch_)) + , batch_iterator(batch.begin()) + { + } + + void next() override + { + if (isValid()) + ++batch_iterator; + } + + bool isValid() const override + { + return batch_iterator != batch.end(); + } + + RelativePathWithMetadata current() const override; + + size_t getAccumulatedSize() const override + { + return batch.size(); + } +private: + RelativePathsWithMetadata batch; + RelativePathsWithMetadata::iterator batch_iterator; +}; + +} diff --git a/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.cpp b/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.cpp new file mode 100644 index 00000000000..766071cf815 --- /dev/null +++ b/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.cpp @@ -0,0 +1,64 @@ +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + +void IObjectStorageIteratorAsync::next() +{ + std::lock_guard lock(mutex); + + if (current_batch_iterator != current_batch.end()) + { + ++current_batch_iterator; + } + else if (!is_finished) + { + if (outcome_future.valid()) + { + BatchAndHasNext next_batch = outcome_future.get(); + current_batch = std::move(next_batch.batch); + accumulated_size.fetch_add(current_batch.size(), std::memory_order_relaxed); + current_batch_iterator = current_batch.begin(); + if (next_batch.has_next) + outcome_future = scheduleBatch(); + else + is_finished = true; + } + } +} + +std::future IObjectStorageIteratorAsync::scheduleBatch() +{ + return list_objects_scheduler([this] + { + BatchAndHasNext result; + result.has_next = getBatchAndCheckNext(result.batch); + return result; + }, Priority{}); +} + + +bool IObjectStorageIteratorAsync::isValid() const +{ + return current_batch_iterator != current_batch.end(); +} + +RelativePathWithMetadata IObjectStorageIteratorAsync::current() const +{ + if (!isValid()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Trying to access invalid iterator"); + + return *current_batch_iterator; +} + +size_t IObjectStorageIteratorAsync::getAccumulatedSize() const +{ + return accumulated_size.load(std::memory_order_relaxed); +} + +} diff --git a/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.h b/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.h new file mode 100644 index 00000000000..81ba9bce137 --- /dev/null +++ b/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.h @@ -0,0 +1,58 @@ +#pragma once + +#include +#include +#include +#include +#include + +namespace DB +{ + +class IObjectStorageIteratorAsync : public IObjectStorageIterator +{ +public: + IObjectStorageIteratorAsync( + CurrentMetrics::Metric threads_metric, + CurrentMetrics::Metric threads_active_metric, + const std::string & thread_name) + : list_objects_pool(threads_metric, threads_active_metric, 1) + , list_objects_scheduler(threadPoolCallbackRunner(list_objects_pool, thread_name)) + { + } + + void next() override; + bool isValid() const override; + RelativePathWithMetadata current() const override; + size_t getAccumulatedSize() const override; + + ~IObjectStorageIteratorAsync() override + { + list_objects_pool.wait(); + } + +protected: + + virtual bool getBatchAndCheckNext(RelativePathsWithMetadata & batch) = 0; + + struct BatchAndHasNext + { + RelativePathsWithMetadata batch; + bool has_next; + }; + + std::future scheduleBatch(); + + bool is_finished{false}; + + std::mutex mutex; + ThreadPool list_objects_pool; + ThreadPoolCallbackRunner list_objects_scheduler; + std::future outcome_future; + RelativePathsWithMetadata current_batch; + RelativePathsWithMetadata::iterator current_batch_iterator; + std::atomic accumulated_size = 0; +}; + + +} diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp index 6e63efcc1e3..d19be20f920 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp @@ -3,6 +3,7 @@ #if USE_AWS_S3 #include +#include #include #include @@ -33,6 +34,13 @@ namespace ProfileEvents extern const Event DiskS3ListObjects; } +namespace CurrentMetrics +{ + extern const Metric ObjectStorageS3Threads; + extern const Metric ObjectStorageS3ThreadsActive; +} + + namespace DB { @@ -84,6 +92,62 @@ void logIfError(const Aws::Utils::Outcome & response, std::functi } +namespace +{ + +class S3IteratorAsync final : public IObjectStorageIteratorAsync +{ +public: + S3IteratorAsync( + const std::string & bucket, + const std::string & path_prefix, + std::shared_ptr client_, + size_t max_list_size) + : IObjectStorageIteratorAsync( + CurrentMetrics::ObjectStorageS3Threads, + CurrentMetrics::ObjectStorageS3ThreadsActive, + "ListObjectS3") + , client(client_) + { + request.SetBucket(bucket); + request.SetPrefix(path_prefix); + request.SetMaxKeys(static_cast(max_list_size)); + } + +private: + bool getBatchAndCheckNext(RelativePathsWithMetadata & batch) override + { + ProfileEvents::increment(ProfileEvents::S3ListObjects); + + bool result = false; + auto outcome = client->ListObjectsV2(request); + /// Outcome failure will be handled on the caller side. + if (outcome.IsSuccess()) + { + auto objects = outcome.GetResult().GetContents(); + + result = !objects.empty(); + + for (const auto & object : objects) + batch.emplace_back(object.GetKey(), ObjectMetadata{static_cast(object.GetSize()), Poco::Timestamp::fromEpochTime(object.GetLastModified().Seconds()), {}}); + + if (result) + request.SetContinuationToken(outcome.GetResult().GetNextContinuationToken()); + + return result; + } + + throw Exception(ErrorCodes::S3_ERROR, "Could not list objects in bucket {} with prefix {}, S3 exception: {}, message: {}", + quoteString(request.GetBucket()), quoteString(request.GetPrefix()), + backQuote(outcome.GetError().GetExceptionName()), quoteString(outcome.GetError().GetMessage())); + } + + std::shared_ptr client; + S3::ListObjectsV2Request request; +}; + +} + bool S3ObjectStorage::exists(const StoredObject & object) const { auto settings_ptr = s3_settings.get(); @@ -183,6 +247,15 @@ std::unique_ptr S3ObjectStorage::writeObject( /// NOLIN disk_write_settings); } + +ObjectStorageIteratorPtr S3ObjectStorage::iterate(const std::string & path_prefix) const +{ + auto settings_ptr = s3_settings.get(); + auto client_ptr = client.get(); + + return std::make_shared(bucket, path_prefix, client_ptr, settings_ptr->list_object_keys_size); +} + void S3ObjectStorage::listObjects(const std::string & path, RelativePathsWithMetadata & children, int max_keys) const { auto settings_ptr = s3_settings.get(); diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.h b/src/Disks/ObjectStorages/S3/S3ObjectStorage.h index b0eb01aec0d..072e1354d38 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.h +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.h @@ -102,6 +102,8 @@ public: void listObjects(const std::string & path, RelativePathsWithMetadata & children, int max_keys) const override; + ObjectStorageIteratorPtr iterate(const std::string & path_prefix) const override; + /// Uses `DeleteObjectRequest`. void removeObject(const StoredObject & object) override; From 4bb44c7c72417a6e7a5f2ec7e1651b4360f9956e Mon Sep 17 00:00:00 2001 From: cmsxbc Date: Sun, 4 Jun 2023 23:06:21 +0800 Subject: [PATCH 0377/1072] 1. skip extract darwin toolchain in builder when uncessary 2. update MacOSX SDK version in toolchain readme to match in builder --- cmake/toolchain/darwin-x86_64/README.txt | 4 ++-- docker/packager/binary/build.sh | 8 +++++--- docker/packager/packager | 2 ++ 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/cmake/toolchain/darwin-x86_64/README.txt b/cmake/toolchain/darwin-x86_64/README.txt index 65c9aba5be6..90ada960bfa 100644 --- a/cmake/toolchain/darwin-x86_64/README.txt +++ b/cmake/toolchain/darwin-x86_64/README.txt @@ -1,2 +1,2 @@ -wget https://github.com/phracker/MacOSX-SDKs/releases/download/10.15/MacOSX10.15.sdk.tar.xz -tar xJf MacOSX10.15.sdk.tar.xz --strip-components=1 +wget https://github.com/phracker/MacOSX-SDKs/releases/download/11.3/MacOSX11.0.sdk.tar.xz +tar xJf MacOSX11.0.sdk.tar.xz --strip-components=1 diff --git a/docker/packager/binary/build.sh b/docker/packager/binary/build.sh index 2cd0a011013..ee1011a9cd5 100755 --- a/docker/packager/binary/build.sh +++ b/docker/packager/binary/build.sh @@ -11,9 +11,11 @@ ccache_status () { [ -O /build ] || git config --global --add safe.directory /build -mkdir -p /build/cmake/toolchain/darwin-x86_64 -tar xJf /MacOSX11.0.sdk.tar.xz -C /build/cmake/toolchain/darwin-x86_64 --strip-components=1 -ln -sf darwin-x86_64 /build/cmake/toolchain/darwin-aarch64 +if [ "$EXTRACT_TOOLCHAIN_DARWIN" = "1" ];then + mkdir -p /build/cmake/toolchain/darwin-x86_64 + tar xJf /MacOSX11.0.sdk.tar.xz -C /build/cmake/toolchain/darwin-x86_64 --strip-components=1 + ln -sf darwin-x86_64 /build/cmake/toolchain/darwin-aarch64 +fi # Uncomment to debug ccache. Don't put ccache log in /output right away, or it # will be confusingly packed into the "performance" package. diff --git a/docker/packager/packager b/docker/packager/packager index a894fe2d8e9..1b3df858cd2 100755 --- a/docker/packager/packager +++ b/docker/packager/packager @@ -167,6 +167,7 @@ def parse_env_variables( cmake_flags.append( "-DCMAKE_TOOLCHAIN_FILE=/build/cmake/darwin/toolchain-x86_64.cmake" ) + result.append("EXTRACT_TOOLCHAIN_DARWIN=1") elif is_cross_darwin_arm: cc = compiler[: -len(DARWIN_ARM_SUFFIX)] cmake_flags.append("-DCMAKE_AR:FILEPATH=/cctools/bin/aarch64-apple-darwin-ar") @@ -181,6 +182,7 @@ def parse_env_variables( cmake_flags.append( "-DCMAKE_TOOLCHAIN_FILE=/build/cmake/darwin/toolchain-aarch64.cmake" ) + result.append("EXTRACT_TOOLCHAIN_DARWIN=1") elif is_cross_arm: cc = compiler[: -len(ARM_SUFFIX)] cmake_flags.append( From 8d8d06285ff6d931d6a1ee069c9a5885dc09264a Mon Sep 17 00:00:00 2001 From: alesapin Date: Sun, 4 Jun 2023 18:59:22 +0200 Subject: [PATCH 0378/1072] Add integration test --- src/Storages/StorageAzure.cpp | 51 +++++++- .../__init__.py | 1 + .../test_storage_azure_blob_storage/test.py | 122 ++++++++++++++++++ 3 files changed, 170 insertions(+), 4 deletions(-) create mode 100644 tests/integration/test_storage_azure_blob_storage/__init__.py create mode 100644 tests/integration/test_storage_azure_blob_storage/test.py diff --git a/src/Storages/StorageAzure.cpp b/src/Storages/StorageAzure.cpp index fd250a128c9..7b4bc9e6769 100644 --- a/src/Storages/StorageAzure.cpp +++ b/src/Storages/StorageAzure.cpp @@ -115,6 +115,7 @@ StorageAzure::Configuration StorageAzure::getConfiguration(ASTs & engine_args, C else { configuration.account_name = fourth_arg; + configuration.account_key = checkAndGetLiteralArgument(engine_args[4], "account_key"); auto sixth_arg = checkAndGetLiteralArgument(engine_args[5], "format/account_name"); if (!is_format_arg(sixth_arg)) @@ -216,18 +217,58 @@ AzureClientPtr StorageAzure::createClient(StorageAzure::Configuration configurat if (configuration.is_connection_string) { result = std::make_unique(BlobContainerClient::CreateFromConnectionString(configuration.connection_url, configuration.container)); + result->CreateIfNotExists(); } else { if (configuration.account_name.has_value() && configuration.account_key.has_value()) { auto storage_shared_key_credential = std::make_shared(*configuration.account_name, *configuration.account_key); - result = std::make_unique(configuration.connection_url, storage_shared_key_credential); + auto blob_service_client = std::make_unique(configuration.connection_url, storage_shared_key_credential); + try + { + result = std::make_unique(blob_service_client->CreateBlobContainer(configuration.container).Value); + } + catch (const Azure::Storage::StorageException & e) + { + if (e.StatusCode == Azure::Core::Http::HttpStatusCode::Conflict) + { + auto final_url = configuration.connection_url + + (configuration.connection_url.back() == '/' ? "" : "/") + + configuration.container; + + result = std::make_unique(final_url, storage_shared_key_credential); + } + else + { + throw; + } + } } + else + { + auto managed_identity_credential = std::make_shared(); + auto blob_service_client = std::make_unique(configuration.connection_url, managed_identity_credential); + try + { + result = std::make_unique(blob_service_client->CreateBlobContainer(configuration.container).Value); + } + catch (const Azure::Storage::StorageException & e) + { + if (e.StatusCode == Azure::Core::Http::HttpStatusCode::Conflict) + { + auto final_url = configuration.connection_url + + (configuration.connection_url.back() == '/' ? "" : "/") + + configuration.container; - auto managed_identity_credential = std::make_shared(); - - result = std::make_unique(configuration.connection_url, managed_identity_credential); + result = std::make_unique(final_url, managed_identity_credential); + } + else + { + throw; + } + } + } } return result; @@ -466,12 +507,14 @@ SinkToStoragePtr StorageAzure::write(const ASTPtr & query, const StorageMetadata if (!truncate_in_insert && object_storage->exists(StoredObject(configuration.blob_path))) { + if (local_context->getSettingsRef().s3_create_new_file_on_insert) { size_t index = configuration.blobs_paths.size(); const auto & first_key = configuration.blobs_paths[0]; auto pos = first_key.find_first_of('.'); String new_key; + do { new_key = first_key.substr(0, pos) + "." + std::to_string(index) + (pos == std::string::npos ? "" : first_key.substr(pos)); diff --git a/tests/integration/test_storage_azure_blob_storage/__init__.py b/tests/integration/test_storage_azure_blob_storage/__init__.py new file mode 100644 index 00000000000..e5a0d9b4834 --- /dev/null +++ b/tests/integration/test_storage_azure_blob_storage/__init__.py @@ -0,0 +1 @@ +#!/usr/bin/env python3 diff --git a/tests/integration/test_storage_azure_blob_storage/test.py b/tests/integration/test_storage_azure_blob_storage/test.py new file mode 100644 index 00000000000..94b059fe4fe --- /dev/null +++ b/tests/integration/test_storage_azure_blob_storage/test.py @@ -0,0 +1,122 @@ +#!/usr/bin/env python3 + +import gzip +import json +import logging +import os +import io +import random +import threading +import time + +from azure.storage.blob import BlobServiceClient +import helpers.client +import pytest +from helpers.cluster import ClickHouseCluster, ClickHouseInstance +from helpers.network import PartitionManager +from helpers.mock_servers import start_mock_servers +from helpers.test_tools import exec_query_with_retry + +@pytest.fixture(scope="module") +def cluster(): + try: + cluster = ClickHouseCluster(__file__) + cluster.add_instance( + "node", + with_azurite=True, + ) + cluster.start() + + yield cluster + finally: + cluster.shutdown() + +def azure_query(node, query, try_num=3, settings={}): + for i in range(try_num): + try: + return node.query(query, settings=settings) + except Exception as ex: + retriable_errors = [ + "DB::Exception: Azure::Core::Http::TransportException: Connection was closed by the server while trying to read a response" + ] + retry = False + for error in retriable_errors: + if error in str(ex): + retry = True + print(f"Try num: {i}. Having retriable error: {ex}") + break + if not retry or i == try_num - 1: + raise Exception(ex) + continue + +def get_azure_file_content(filename): + container_name = "cont" + connection_string = "DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;" + blob_service_client = BlobServiceClient.from_connection_string(connection_string) + container_client = blob_service_client.get_container_client(container_name) + blob_client = container_client.get_blob_client(filename) + download_stream = blob_client.download_blob() + return download_stream.readall().decode('utf-8') + +def test_create_table_connection_string(cluster): + node = cluster.instances["node"] + azure_query(node, "CREATE TABLE test_create_table_conn_string (key UInt64, data String) Engine = Azure('DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://azurite1:10000/devstoreaccount1/;', 'cont', 'test_create_connection_string', 'CSV')") + +def test_create_table_account_string(cluster): + node = cluster.instances["node"] + azure_query(node, "CREATE TABLE test_create_table_account_url (key UInt64, data String) Engine = Azure('http://azurite1:10000/devstoreaccount1', 'cont', 'test_create_connection_string', 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'CSV')") + +def test_simple_write_account_string(cluster): + node = cluster.instances["node"] + azure_query(node, "CREATE TABLE test_simple_write (key UInt64, data String) Engine = Azure('http://azurite1:10000/devstoreaccount1', 'cont', 'test_simple_write.csv', 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'CSV')") + azure_query(node, "INSERT INTO test_simple_write VALUES (1, 'a')") + print(get_azure_file_content('test_simple_write.csv')) + assert get_azure_file_content('test_simple_write.csv') == '1,"a"\n' + +def test_simple_write_connection_string(cluster): + node = cluster.instances["node"] + azure_query(node, "CREATE TABLE test_simple_write_connection_string (key UInt64, data String) Engine = Azure('DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://azurite1:10000/devstoreaccount1;', 'cont', 'test_simple_write_c.csv', 'CSV')") + azure_query(node, "INSERT INTO test_simple_write_connection_string VALUES (1, 'a')") + print(get_azure_file_content('test_simple_write_c.csv')) + assert get_azure_file_content('test_simple_write_c.csv') == '1,"a"\n' + + +def test_partition_by(cluster): + node = cluster.instances["node"] + table_format = "column1 UInt32, column2 UInt32, column3 UInt32" + partition_by = "column3" + values = "(1, 2, 3), (3, 2, 1), (78, 43, 45)" + filename = "test_{_partition_id}.csv" + + azure_query(node, f"CREATE TABLE test_partitioned_write ({table_format}) Engine = Azure('http://azurite1:10000/devstoreaccount1', 'cont', '{filename}', 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'CSV') PARTITION BY {partition_by}") + azure_query(node, f"INSERT INTO test_partitioned_write VALUES {values}") + + assert "1,2,3\n" == get_azure_file_content("test_3.csv") + assert "3,2,1\n" == get_azure_file_content("test_1.csv") + assert "78,43,45\n" == get_azure_file_content("test_45.csv") + + +def test_partition_by_string_column(cluster): + node = cluster.instances["node"] + table_format = "col_num UInt32, col_str String" + partition_by = "col_str" + values = "(1, 'foo/bar'), (3, 'йцук'), (78, '你好')" + filename = "test_{_partition_id}.csv" + azure_query(node, f"CREATE TABLE test_partitioned_string_write ({table_format}) Engine = Azure('http://azurite1:10000/devstoreaccount1', 'cont', '{filename}', 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'CSV') PARTITION BY {partition_by}") + azure_query(node, f"INSERT INTO test_partitioned_string_write VALUES {values}") + + assert '1,"foo/bar"\n' == get_azure_file_content("test_foo/bar.csv") + assert '3,"йцук"\n' == get_azure_file_content("test_йцук.csv") + assert '78,"你好"\n' == get_azure_file_content("test_你好.csv") + + +def test_partition_by_const_column(cluster): + node = cluster.instances["node"] + table_format = "column1 UInt32, column2 UInt32, column3 UInt32" + values = "(1, 2, 3), (3, 2, 1), (78, 43, 45)" + partition_by = "'88'" + values_csv = "1,2,3\n3,2,1\n78,43,45\n" + filename = "test_{_partition_id}.csv" + azure_query(node, f"CREATE TABLE test_partitioned_const_write ({table_format}) Engine = Azure('http://azurite1:10000/devstoreaccount1', 'cont', '{filename}', 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'CSV') PARTITION BY {partition_by}") + azure_query(node, f"INSERT INTO test_partitioned_const_write VALUES {values}") + assert values_csv == get_azure_file_content("test_88.csv") From a66f68e5df5584cf08a42ff6dd12b4e935f2cb3a Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Mon, 5 Jun 2023 01:48:13 +0800 Subject: [PATCH 0379/1072] fix again --- src/Interpreters/MutationsInterpreter.cpp | 58 ++++++++++++----------- 1 file changed, 31 insertions(+), 27 deletions(-) diff --git a/src/Interpreters/MutationsInterpreter.cpp b/src/Interpreters/MutationsInterpreter.cpp index 25bb3fc5e82..25c52ad8925 100644 --- a/src/Interpreters/MutationsInterpreter.cpp +++ b/src/Interpreters/MutationsInterpreter.cpp @@ -38,6 +38,7 @@ #include #include #include +#include namespace DB @@ -804,10 +805,10 @@ void MutationsInterpreter::prepare(bool dry_run) /// We care about affected indices and projections because we also need to rewrite them /// when one of index columns updated or filtered with delete. /// The same about columns, that are needed for calculation of TTL expressions. + NameSet changed_columns; + NameSet unchanged_columns; if (!dependencies.empty()) { - NameSet changed_columns; - NameSet unchanged_columns; for (const auto & dependency : dependencies) { if (dependency.isReadOnly()) @@ -826,31 +827,6 @@ void MutationsInterpreter::prepare(bool dry_run) for (const auto & column : changed_columns) stages.back().column_to_updated.emplace( column, std::make_shared(column)); - - for (const auto & index : metadata_snapshot->getSecondaryIndices()) - { - if (source.hasIndexOrProjection("skp_idx_" + index.name + ".idx") - || source.hasIndexOrProjection("skp_idx_" + index.name + ".idx2")) - { - const auto & index_cols = index.expression->getRequiredColumns(); - bool changed = std::any_of( - index_cols.begin(), index_cols.end(), [&](const auto & col) { return changed_columns.contains(col); }); - if (changed) - materialized_indices.insert(index.name); - } - } - - for (const auto & projection : metadata_snapshot->getProjections()) - { - if (source.hasIndexOrProjection(projection.getDirectoryName())) - { - const auto & projection_cols = projection.required_columns; - bool changed = std::any_of( - projection_cols.begin(), projection_cols.end(), [&](const auto & col) { return changed_columns.contains(col); }); - if (changed) - materialized_projections.insert(projection.name); - } - } } if (!unchanged_columns.empty()) @@ -883,6 +859,34 @@ void MutationsInterpreter::prepare(bool dry_run) } } + for (const auto & index : metadata_snapshot->getSecondaryIndices()) + { + if (source.hasIndexOrProjection("skp_idx_" + index.name + ".idx") || source.hasIndexOrProjection("skp_idx_" + index.name + ".idx2")) + { + const auto & index_cols = index.expression->getRequiredColumns(); + bool changed = std::any_of( + index_cols.begin(), + index_cols.end(), + [&](const auto & col) { return updated_columns.contains(col) || changed_columns.contains(col); }); + if (changed) + materialized_indices.insert(index.name); + } + } + + for (const auto & projection : metadata_snapshot->getProjections()) + { + if (source.hasIndexOrProjection(projection.getDirectoryName())) + { + const auto & projection_cols = projection.required_columns; + bool changed = std::any_of( + projection_cols.begin(), + projection_cols.end(), + [&](const auto & col) { return updated_columns.contains(col) || changed_columns.contains(col); }); + if (changed) + materialized_projections.insert(projection.name); + } + } + /// Stages might be empty when we materialize skip indices or projections which don't add any /// column dependencies. if (stages.empty()) From 2cc457141ed83a50c7a6e4dc395325c6fd4a898d Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Sun, 4 Jun 2023 15:32:54 -0300 Subject: [PATCH 0380/1072] clean documentation of ip4 ip6 from domains --- docs/en/interfaces/formats.md | 34 +++++++++---------- docs/en/operations/system-tables/query_log.md | 4 +-- .../system-tables/query_thread_log.md | 4 +-- .../operations/system-tables/session_log.md | 2 +- .../operations/system-tables/zookeeper_log.md | 2 +- docs/en/sql-reference/data-types/index.md | 2 +- .../data-types/{domains => }/ipv4.md | 27 +++------------ .../data-types/{domains => }/ipv6.md | 29 +++------------- .../functions/ip-address-functions.md | 6 ++-- docs/redirects.txt | 10 +++--- 10 files changed, 41 insertions(+), 79 deletions(-) rename docs/en/sql-reference/data-types/{domains => }/ipv4.md (60%) rename docs/en/sql-reference/data-types/{domains => }/ipv6.md (61%) diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index 2ab9e8caec4..d75fb32b571 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -1292,8 +1292,8 @@ For output it uses the following correspondence between ClickHouse types and BSO | [Tuple](/docs/en/sql-reference/data-types/tuple.md) | `\x04` array | | [Named Tuple](/docs/en/sql-reference/data-types/tuple.md) | `\x03` document | | [Map](/docs/en/sql-reference/data-types/map.md) | `\x03` document | -| [IPv4](/docs/en/sql-reference/data-types/domains/ipv4.md) | `\x10` int32 | -| [IPv6](/docs/en/sql-reference/data-types/domains/ipv6.md) | `\x05` binary, `\x00` binary subtype | +| [IPv4](/docs/en/sql-reference/data-types/ipv4.md) | `\x10` int32 | +| [IPv6](/docs/en/sql-reference/data-types/ipv6.md) | `\x05` binary, `\x00` binary subtype | For input it uses the following correspondence between BSON types and ClickHouse types: @@ -1303,7 +1303,7 @@ For input it uses the following correspondence between BSON types and ClickHouse | `\x02` string | [String](/docs/en/sql-reference/data-types/string.md)/[FixedString](/docs/en/sql-reference/data-types/fixedstring.md) | | `\x03` document | [Map](/docs/en/sql-reference/data-types/map.md)/[Named Tuple](/docs/en/sql-reference/data-types/tuple.md) | | `\x04` array | [Array](/docs/en/sql-reference/data-types/array.md)/[Tuple](/docs/en/sql-reference/data-types/tuple.md) | -| `\x05` binary, `\x00` binary subtype | [String](/docs/en/sql-reference/data-types/string.md)/[FixedString](/docs/en/sql-reference/data-types/fixedstring.md)/[IPv6](/docs/en/sql-reference/data-types/domains/ipv6.md) | +| `\x05` binary, `\x00` binary subtype | [String](/docs/en/sql-reference/data-types/string.md)/[FixedString](/docs/en/sql-reference/data-types/fixedstring.md)/[IPv6](/docs/en/sql-reference/data-types/ipv6.md) | | `\x05` binary, `\x02` old binary subtype | [String](/docs/en/sql-reference/data-types/string.md)/[FixedString](/docs/en/sql-reference/data-types/fixedstring.md) | | `\x05` binary, `\x03` old uuid subtype | [UUID](/docs/en/sql-reference/data-types/uuid.md) | | `\x05` binary, `\x04` uuid subtype | [UUID](/docs/en/sql-reference/data-types/uuid.md) | @@ -1313,7 +1313,7 @@ For input it uses the following correspondence between BSON types and ClickHouse | `\x0A` null value | [NULL](/docs/en/sql-reference/data-types/nullable.md) | | `\x0D` JavaScript code | [String](/docs/en/sql-reference/data-types/string.md)/[FixedString](/docs/en/sql-reference/data-types/fixedstring.md) | | `\x0E` symbol | [String](/docs/en/sql-reference/data-types/string.md)/[FixedString](/docs/en/sql-reference/data-types/fixedstring.md) | -| `\x10` int32 | [Int32/UInt32](/docs/en/sql-reference/data-types/int-uint.md)/[Decimal32](/docs/en/sql-reference/data-types/decimal.md)/[IPv4](/docs/en/sql-reference/data-types/domains/ipv4.md)/[Enum8/Enum16](/docs/en/sql-reference/data-types/enum.md) | +| `\x10` int32 | [Int32/UInt32](/docs/en/sql-reference/data-types/int-uint.md)/[Decimal32](/docs/en/sql-reference/data-types/decimal.md)/[IPv4](/docs/en/sql-reference/data-types/ipv4.md)/[Enum8/Enum16](/docs/en/sql-reference/data-types/enum.md) | | `\x12` int64 | [Int64/UInt64](/docs/en/sql-reference/data-types/int-uint.md)/[Decimal64](/docs/en/sql-reference/data-types/decimal.md)/[DateTime64](/docs/en/sql-reference/data-types/datetime64.md) | Other BSON types are not supported. Also, it performs conversion between different integer types (for example, you can insert BSON int32 value into ClickHouse UInt8). @@ -1663,8 +1663,8 @@ The table below shows supported data types and how they match ClickHouse [data t | `ENUM` | [Enum(8/16)](/docs/en/sql-reference/data-types/enum.md) | `ENUM` | | `LIST` | [Array](/docs/en/sql-reference/data-types/array.md) | `LIST` | | `STRUCT` | [Tuple](/docs/en/sql-reference/data-types/tuple.md) | `STRUCT` | -| `UINT32` | [IPv4](/docs/en/sql-reference/data-types/domains/ipv4.md) | `UINT32` | -| `DATA` | [IPv6](/docs/en/sql-reference/data-types/domains/ipv6.md) | `DATA` | +| `UINT32` | [IPv4](/docs/en/sql-reference/data-types/ipv4.md) | `UINT32` | +| `DATA` | [IPv6](/docs/en/sql-reference/data-types/ipv6.md) | `DATA` | | `DATA` | [Int128/UInt128/Int256/UInt256](/docs/en/sql-reference/data-types/int-uint.md) | `DATA` | | `DATA` | [Decimal128/Decimal256](/docs/en/sql-reference/data-types/decimal.md) | `DATA` | | `STRUCT(entries LIST(STRUCT(key Key, value Value)))` | [Map](/docs/en/sql-reference/data-types/map.md) | `STRUCT(entries LIST(STRUCT(key Key, value Value)))` | @@ -1866,8 +1866,8 @@ The table below shows supported data types and how they match ClickHouse [data t | `long (timestamp-millis)` \** | [DateTime64(3)](/docs/en/sql-reference/data-types/datetime.md) | `long (timestamp-millis)` \** | | `long (timestamp-micros)` \** | [DateTime64(6)](/docs/en/sql-reference/data-types/datetime.md) | `long (timestamp-micros)` \** | | `bytes (decimal)` \** | [DateTime64(N)](/docs/en/sql-reference/data-types/datetime.md) | `bytes (decimal)` \** | -| `int` | [IPv4](/docs/en/sql-reference/data-types/domains/ipv4.md) | `int` | -| `fixed(16)` | [IPv6](/docs/en/sql-reference/data-types/domains/ipv6.md) | `fixed(16)` | +| `int` | [IPv4](/docs/en/sql-reference/data-types/ipv4.md) | `int` | +| `fixed(16)` | [IPv6](/docs/en/sql-reference/data-types/ipv6.md) | `fixed(16)` | | `bytes (decimal)` \** | [Decimal(P, S)](/docs/en/sql-reference/data-types/decimal.md) | `bytes (decimal)` \** | | `string (uuid)` \** | [UUID](/docs/en/sql-reference/data-types/uuid.md) | `string (uuid)` \** | | `fixed(16)` | [Int128/UInt128](/docs/en/sql-reference/data-types/int-uint.md) | `fixed(16)` | @@ -2001,9 +2001,9 @@ The table below shows supported data types and how they match ClickHouse [data t | `LIST` | [Array](/docs/en/sql-reference/data-types/array.md) | `LIST` | | `STRUCT` | [Tuple](/docs/en/sql-reference/data-types/tuple.md) | `STRUCT` | | `MAP` | [Map](/docs/en/sql-reference/data-types/map.md) | `MAP` | -| `UINT32` | [IPv4](/docs/en/sql-reference/data-types/domains/ipv4.md) | `UINT32` | -| `FIXED_LENGTH_BYTE_ARRAY`, `BINARY` | [IPv6](/docs/en/sql-reference/data-types/domains/ipv6.md) | `FIXED_LENGTH_BYTE_ARRAY` | -| `FIXED_LENGTH_BYTE_ARRAY`, `BINARY` | [Int128/UInt128/Int256/UInt256](/docs/en/sql-reference/data-types/int-uint.md) | `FIXED_LENGTH_BYTE_ARRAY` | +| `UINT32` | [IPv4](/docs/en/sql-reference/data-types/ipv4.md) | `UINT32` | +| `FIXED_LENGTH_BYTE_ARRAY`, `BINARY` | [IPv6](/docs/en/sql-reference/data-types/ipv6.md) | `FIXED_LENGTH_BYTE_ARRAY` | +| `FIXED_LENGTH_BYTE_ARRAY`, `BINARY` | [Int128/UInt128/Int256/UInt256](/docs/en/sql-reference/data-types/int-uint.md) | `FIXED_LENGTH_BYTE_ARRAY` | Arrays can be nested and can have a value of the `Nullable` type as an argument. `Tuple` and `Map` types also can be nested. @@ -2057,7 +2057,7 @@ Special format for reading Parquet file metadata (https://parquet.apache.org/doc - logical_type - column logical type - compression - compression used for this column - total_uncompressed_size - total uncompressed bytes size of the column, calculated as the sum of total_uncompressed_size of the column from all row groups - - total_compressed_size - total compressed bytes size of the column, calculated as the sum of total_compressed_size of the column from all row groups + - total_compressed_size - total compressed bytes size of the column, calculated as the sum of total_compressed_size of the column from all row groups - space_saved - percent of space saved by compression, calculated as (1 - total_compressed_size/total_uncompressed_size). - encodings - the list of encodings used for this column - row_groups - the list of row groups metadata with the next structure: @@ -2204,9 +2204,9 @@ The table below shows supported data types and how they match ClickHouse [data t | `LIST` | [Array](/docs/en/sql-reference/data-types/array.md) | `LIST` | | `STRUCT` | [Tuple](/docs/en/sql-reference/data-types/tuple.md) | `STRUCT` | | `MAP` | [Map](/docs/en/sql-reference/data-types/map.md) | `MAP` | -| `UINT32` | [IPv4](/docs/en/sql-reference/data-types/domains/ipv4.md) | `UINT32` | -| `FIXED_SIZE_BINARY`, `BINARY` | [IPv6](/docs/en/sql-reference/data-types/domains/ipv6.md) | `FIXED_SIZE_BINARY` | -| `FIXED_SIZE_BINARY`, `BINARY` | [Int128/UInt128/Int256/UInt256](/docs/en/sql-reference/data-types/int-uint.md) | `FIXED_SIZE_BINARY` | +| `UINT32` | [IPv4](/docs/en/sql-reference/data-types/ipv4.md) | `UINT32` | +| `FIXED_SIZE_BINARY`, `BINARY` | [IPv6](/docs/en/sql-reference/data-types/ipv6.md) | `FIXED_SIZE_BINARY` | +| `FIXED_SIZE_BINARY`, `BINARY` | [Int128/UInt128/Int256/UInt256](/docs/en/sql-reference/data-types/int-uint.md) | `FIXED_SIZE_BINARY` | Arrays can be nested and can have a value of the `Nullable` type as an argument. `Tuple` and `Map` types also can be nested. @@ -2272,7 +2272,7 @@ The table below shows supported data types and how they match ClickHouse [data t | `Struct` | [Tuple](/docs/en/sql-reference/data-types/tuple.md) | `Struct` | | `Map` | [Map](/docs/en/sql-reference/data-types/map.md) | `Map` | | `Int` | [IPv4](/docs/en/sql-reference/data-types/int-uint.md) | `Int` | -| `Binary` | [IPv6](/docs/en/sql-reference/data-types/domains/ipv6.md) | `Binary` | +| `Binary` | [IPv6](/docs/en/sql-reference/data-types/ipv6.md) | `Binary` | | `Binary` | [Int128/UInt128/Int256/UInt256](/docs/en/sql-reference/data-types/int-uint.md) | `Binary` | | `Binary` | [Decimal256](/docs/en/sql-reference/data-types/decimal.md) | `Binary` | @@ -2485,7 +2485,7 @@ ClickHouse supports reading and writing [MessagePack](https://msgpack.org/) data | `uint 64` | [DateTime64](/docs/en/sql-reference/data-types/datetime.md) | `uint 64` | | `fixarray`, `array 16`, `array 32` | [Array](/docs/en/sql-reference/data-types/array.md)/[Tuple](/docs/en/sql-reference/data-types/tuple.md) | `fixarray`, `array 16`, `array 32` | | `fixmap`, `map 16`, `map 32` | [Map](/docs/en/sql-reference/data-types/map.md) | `fixmap`, `map 16`, `map 32` | -| `uint 32` | [IPv4](/docs/en/sql-reference/data-types/domains/ipv4.md) | `uint 32` | +| `uint 32` | [IPv4](/docs/en/sql-reference/data-types/ipv4.md) | `uint 32` | | `bin 8` | [String](/docs/en/sql-reference/data-types/string.md) | `bin 8` | | `int 8` | [Enum8](/docs/en/sql-reference/data-types/enum.md) | `int 8` | | `bin 8` | [(U)Int128/(U)Int256](/docs/en/sql-reference/data-types/int-uint.md) | `bin 8` | diff --git a/docs/en/operations/system-tables/query_log.md b/docs/en/operations/system-tables/query_log.md index 71e1452cef1..b9fdd19c643 100644 --- a/docs/en/operations/system-tables/query_log.md +++ b/docs/en/operations/system-tables/query_log.md @@ -71,11 +71,11 @@ Columns: - 0 — Query was initiated by another query as part of distributed query execution. - `user` ([String](../../sql-reference/data-types/string.md)) — Name of the user who initiated the current query. - `query_id` ([String](../../sql-reference/data-types/string.md)) — ID of the query. -- `address` ([IPv6](../../sql-reference/data-types/domains/ipv6.md)) — IP address that was used to make the query. +- `address` ([IPv6](../../sql-reference/data-types/ipv6.md)) — IP address that was used to make the query. - `port` ([UInt16](../../sql-reference/data-types/int-uint.md)) — The client port that was used to make the query. - `initial_user` ([String](../../sql-reference/data-types/string.md)) — Name of the user who ran the initial query (for distributed query execution). - `initial_query_id` ([String](../../sql-reference/data-types/string.md)) — ID of the initial query (for distributed query execution). -- `initial_address` ([IPv6](../../sql-reference/data-types/domains/ipv6.md)) — IP address that the parent query was launched from. +- `initial_address` ([IPv6](../../sql-reference/data-types/ipv6.md)) — IP address that the parent query was launched from. - `initial_port` ([UInt16](../../sql-reference/data-types/int-uint.md)) — The client port that was used to make the parent query. - `initial_query_start_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — Initial query starting time (for distributed query execution). - `initial_query_start_time_microseconds` ([DateTime64](../../sql-reference/data-types/datetime64.md)) — Initial query starting time with microseconds precision (for distributed query execution). diff --git a/docs/en/operations/system-tables/query_thread_log.md b/docs/en/operations/system-tables/query_thread_log.md index cdd23bb15db..a6d5632ade9 100644 --- a/docs/en/operations/system-tables/query_thread_log.md +++ b/docs/en/operations/system-tables/query_thread_log.md @@ -40,11 +40,11 @@ Columns: - 0 — Query was initiated by another query for distributed query execution. - `user` ([String](../../sql-reference/data-types/string.md)) — Name of the user who initiated the current query. - `query_id` ([String](../../sql-reference/data-types/string.md)) — ID of the query. -- `address` ([IPv6](../../sql-reference/data-types/domains/ipv6.md)) — IP address that was used to make the query. +- `address` ([IPv6](../../sql-reference/data-types/ipv6.md)) — IP address that was used to make the query. - `port` ([UInt16](../../sql-reference/data-types/int-uint.md#uint-ranges)) — The client port that was used to make the query. - `initial_user` ([String](../../sql-reference/data-types/string.md)) — Name of the user who ran the initial query (for distributed query execution). - `initial_query_id` ([String](../../sql-reference/data-types/string.md)) — ID of the initial query (for distributed query execution). -- `initial_address` ([IPv6](../../sql-reference/data-types/domains/ipv6.md)) — IP address that the parent query was launched from. +- `initial_address` ([IPv6](../../sql-reference/data-types/ipv6.md)) — IP address that the parent query was launched from. - `initial_port` ([UInt16](../../sql-reference/data-types/int-uint.md#uint-ranges)) — The client port that was used to make the parent query. - `interface` ([UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Interface that the query was initiated from. Possible values: - 1 — TCP. diff --git a/docs/en/operations/system-tables/session_log.md b/docs/en/operations/system-tables/session_log.md index 661d34677e4..5b1a2b2a489 100644 --- a/docs/en/operations/system-tables/session_log.md +++ b/docs/en/operations/system-tables/session_log.md @@ -28,7 +28,7 @@ Columns: - `profiles` ([Array](../../sql-reference/data-types/array.md)([LowCardinality(String)](../../sql-reference/data-types/lowcardinality.md))) — The list of profiles set for all roles and/or users. - `roles` ([Array](../../sql-reference/data-types/array.md)([LowCardinality(String)](../../sql-reference/data-types/lowcardinality.md))) — The list of roles to which the profile is applied. - `settings` ([Array](../../sql-reference/data-types/array.md)([Tuple](../../sql-reference/data-types/tuple.md)([LowCardinality(String)](../../sql-reference/data-types/lowcardinality.md), [String](../../sql-reference/data-types/string.md)))) — Settings that were changed when the client logged in/out. -- `client_address` ([IPv6](../../sql-reference/data-types/domains/ipv6.md)) — The IP address that was used to log in/out. +- `client_address` ([IPv6](../../sql-reference/data-types/ipv6.md)) — The IP address that was used to log in/out. - `client_port` ([UInt16](../../sql-reference/data-types/int-uint.md)) — The client port that was used to log in/out. - `interface` ([Enum8](../../sql-reference/data-types/enum.md)) — The interface from which the login was initiated. Possible values: - `TCP` diff --git a/docs/en/operations/system-tables/zookeeper_log.md b/docs/en/operations/system-tables/zookeeper_log.md index b7cc4e22cd6..dce5be29f62 100644 --- a/docs/en/operations/system-tables/zookeeper_log.md +++ b/docs/en/operations/system-tables/zookeeper_log.md @@ -15,7 +15,7 @@ Columns with request parameters: - `Finalize` — The connection is lost, no response was received. - `event_date` ([Date](../../sql-reference/data-types/date.md)) — The date when the event happened. - `event_time` ([DateTime64](../../sql-reference/data-types/datetime64.md)) — The date and time when the event happened. -- `address` ([IPv6](../../sql-reference/data-types/domains/ipv6.md)) — IP address of ZooKeeper server that was used to make the request. +- `address` ([IPv6](../../sql-reference/data-types/ipv6.md)) — IP address of ZooKeeper server that was used to make the request. - `port` ([UInt16](../../sql-reference/data-types/int-uint.md)) — The port of ZooKeeper server that was used to make the request. - `session_id` ([Int64](../../sql-reference/data-types/int-uint.md)) — The session ID that the ZooKeeper server sets for each connection. - `xid` ([Int32](../../sql-reference/data-types/int-uint.md)) — The ID of the request within the session. This is usually a sequential request number. It is the same for the request row and the paired `response`/`finalize` row. diff --git a/docs/en/sql-reference/data-types/index.md b/docs/en/sql-reference/data-types/index.md index 508307a0543..ffd063590fa 100644 --- a/docs/en/sql-reference/data-types/index.md +++ b/docs/en/sql-reference/data-types/index.md @@ -28,6 +28,6 @@ ClickHouse data types include: - **Nested data structures**: A [`Nested` data structure](./nested-data-structures/index.md) is like a table inside a cell - **Tuples**: A [`Tuple` of elements](./tuple.md), each having an individual type. - **Nullable**: [`Nullable`](./nullable.md) allows you to store a value as `NULL` when a value is "missing" (instead of the column settings its default value for the data type) -- **IP addresses**: use [`IPv4`](./domains/ipv4.md) and [`IPv6`](./domains/ipv6.md) to efficiently store IP addresses +- **IP addresses**: use [`IPv4`](./ipv4.md) and [`IPv6`](./ipv6.md) to efficiently store IP addresses - **Geo types**: for [geographical data](./geo.md), including `Point`, `Ring`, `Polygon` and `MultiPolygon` - **Special data types**: including [`Expression`](./special-data-types/expression.md), [`Set`](./special-data-types/set.md), [`Nothing`](./special-data-types/nothing.md) and [`Interval`](./special-data-types/interval.md) diff --git a/docs/en/sql-reference/data-types/domains/ipv4.md b/docs/en/sql-reference/data-types/ipv4.md similarity index 60% rename from docs/en/sql-reference/data-types/domains/ipv4.md rename to docs/en/sql-reference/data-types/ipv4.md index b34814211fc..288806f47b3 100644 --- a/docs/en/sql-reference/data-types/domains/ipv4.md +++ b/docs/en/sql-reference/data-types/ipv4.md @@ -1,12 +1,12 @@ --- -slug: /en/sql-reference/data-types/domains/ipv4 +slug: /en/sql-reference/data-types/ipv4 sidebar_position: 59 sidebar_label: IPv4 --- ## IPv4 -`IPv4` is a domain based on `UInt32` type and serves as a typed replacement for storing IPv4 values. It provides compact storage with the human-friendly input-output format and column type information on inspection. +IPv4 addresses. Stored in 4 bytes as UInt32. ### Basic Usage @@ -57,25 +57,6 @@ SELECT toTypeName(from), hex(from) FROM hits LIMIT 1; └──────────────────┴───────────┘ ``` -Domain values are not implicitly convertible to types other than `UInt32`. -If you want to convert `IPv4` value to a string, you have to do that explicitly with `IPv4NumToString()` function: +**See Also** -``` sql -SELECT toTypeName(s), IPv4NumToString(from) as s FROM hits LIMIT 1; -``` - - ┌─toTypeName(IPv4NumToString(from))─┬─s──────────────┐ - │ String │ 183.247.232.58 │ - └───────────────────────────────────┴────────────────┘ - -Or cast to a `UInt32` value: - -``` sql -SELECT toTypeName(i), CAST(from as UInt32) as i FROM hits LIMIT 1; -``` - -``` text -┌─toTypeName(CAST(from, 'UInt32'))─┬──────────i─┐ -│ UInt32 │ 3086477370 │ -└──────────────────────────────────┴────────────┘ -``` +- [Functions for Working with IPv4 and IPv6 Addresses](../functions/ip-address-functions.md) diff --git a/docs/en/sql-reference/data-types/domains/ipv6.md b/docs/en/sql-reference/data-types/ipv6.md similarity index 61% rename from docs/en/sql-reference/data-types/domains/ipv6.md rename to docs/en/sql-reference/data-types/ipv6.md index dcb22e3cb6d..284a1f80854 100644 --- a/docs/en/sql-reference/data-types/domains/ipv6.md +++ b/docs/en/sql-reference/data-types/ipv6.md @@ -1,12 +1,12 @@ --- -slug: /en/sql-reference/data-types/domains/ipv6 +slug: /en/sql-reference/data-types/ipv6 sidebar_position: 60 sidebar_label: IPv6 --- ## IPv6 -`IPv6` is a domain based on `FixedString(16)` type and serves as a typed replacement for storing IPv6 values. It provides compact storage with the human-friendly input-output format and column type information on inspection. +IPv6 addresses. Stored in 16 bytes as UInt128. ### Basic Usage @@ -57,27 +57,6 @@ SELECT toTypeName(from), hex(from) FROM hits LIMIT 1; └──────────────────┴──────────────────────────────────┘ ``` -Domain values are not implicitly convertible to types other than `FixedString(16)`. -If you want to convert `IPv6` value to a string, you have to do that explicitly with `IPv6NumToString()` function: +**See Also** -``` sql -SELECT toTypeName(s), IPv6NumToString(from) as s FROM hits LIMIT 1; -``` - -``` text -┌─toTypeName(IPv6NumToString(from))─┬─s─────────────────────────────┐ -│ String │ 2001:44c8:129:2632:33:0:252:2 │ -└───────────────────────────────────┴───────────────────────────────┘ -``` - -Or cast to a `FixedString(16)` value: - -``` sql -SELECT toTypeName(i), CAST(from as FixedString(16)) as i FROM hits LIMIT 1; -``` - -``` text -┌─toTypeName(CAST(from, 'FixedString(16)'))─┬─i───────┐ -│ FixedString(16) │ ��� │ -└───────────────────────────────────────────┴─────────┘ -``` +- [Functions for Working with IPv4 and IPv6 Addresses](../functions/ip-address-functions.md) diff --git a/docs/en/sql-reference/functions/ip-address-functions.md b/docs/en/sql-reference/functions/ip-address-functions.md index 0dc1db1161b..33c788a632e 100644 --- a/docs/en/sql-reference/functions/ip-address-functions.md +++ b/docs/en/sql-reference/functions/ip-address-functions.md @@ -248,7 +248,7 @@ SELECT IPv6CIDRToRange(toIPv6('2001:0db8:0000:85a3:0000:0000:ac1f:8001'), 32); ## toIPv4(string) -An alias to `IPv4StringToNum()` that takes a string form of IPv4 address and returns value of [IPv4](../../sql-reference/data-types/domains/ipv4.md) type, which is binary equal to value returned by `IPv4StringToNum()`. +An alias to `IPv4StringToNum()` that takes a string form of IPv4 address and returns value of [IPv4](../../sql-reference/data-types/ipv4.md) type, which is binary equal to value returned by `IPv4StringToNum()`. ``` sql WITH @@ -296,7 +296,7 @@ Same as `toIPv6`, but if the IPv6 address has an invalid format, it returns null ## toIPv6 -Converts a string form of IPv6 address to [IPv6](../../sql-reference/data-types/domains/ipv6.md) type. If the IPv6 address has an invalid format, returns an empty value. +Converts a string form of IPv6 address to [IPv6](../../sql-reference/data-types/ipv6.md) type. If the IPv6 address has an invalid format, returns an empty value. Similar to [IPv6StringToNum](#ipv6stringtonums) function, which converts IPv6 address to binary format. If the input string contains a valid IPv4 address, then the IPv6 equivalent of the IPv4 address is returned. @@ -315,7 +315,7 @@ toIPv6(string) - IP address. -Type: [IPv6](../../sql-reference/data-types/domains/ipv6.md). +Type: [IPv6](../../sql-reference/data-types/ipv6.md). **Examples** diff --git a/docs/redirects.txt b/docs/redirects.txt index cea138f7237..3abc8df2b7f 100644 --- a/docs/redirects.txt +++ b/docs/redirects.txt @@ -14,7 +14,7 @@ data_types/datetime.md sql-reference/data-types/datetime.md data_types/datetime64.md sql-reference/data-types/datetime64.md data_types/decimal.md sql-reference/data-types/decimal.md data_types/domains/ipv4.md sql-reference/data-types/domains/ipv4.md -data_types/domains/ipv6.md sql-reference/data-types/domains/ipv6.md +data_types/domains/ipv6.md sql-reference/data-types/ipv6.md data_types/domains/overview.md sql-reference/data-types/domains/overview.md data_types/enum.md sql-reference/data-types/enum.md data_types/fixedstring.md sql-reference/data-types/fixedstring.md @@ -162,7 +162,7 @@ interfaces/third-party/client_libraries.md interfaces/third-party/client-librari interfaces/third-party_client_libraries.md interfaces/third-party/client-libraries.md interfaces/third-party_gui.md interfaces/third-party/gui.md interfaces/third_party/index.md interfaces/third-party/index.md -introduction/index.md +introduction/index.md introduction/distinctive_features.md introduction/distinctive-features.md introduction/features_considered_disadvantages.md introduction/distinctive-features.md introduction/possible_silly_questions.md faq/general.md @@ -305,8 +305,10 @@ sql_reference/data_types/datetime.md sql-reference/data-types/datetime.md sql_reference/data_types/datetime64.md sql-reference/data-types/datetime64.md sql_reference/data_types/decimal.md sql-reference/data-types/decimal.md sql_reference/data_types/domains/index.md sql-reference/data-types/domains/index.md -sql_reference/data_types/domains/ipv4.md sql-reference/data-types/domains/ipv4.md -sql_reference/data_types/domains/ipv6.md sql-reference/data-types/domains/ipv6.md +sql_reference/data_types/domains/ipv4.md sql-reference/data-types/ipv4.md +sql_reference/data-types/domains/ipv4.md sql-reference/data-types/ipv4.md +sql_reference/data_types/domains/ipv6.md sql-reference/data-types/ipv6.md +sql_reference/data-types/domains/ipv6.md sql-reference/data-types/ipv6.md sql_reference/data_types/domains/overview.md sql-reference/data-types/domains/overview.md sql_reference/data_types/enum.md sql-reference/data-types/enum.md sql_reference/data_types/fixedstring.md sql-reference/data-types/fixedstring.md From 2923d57757cd28dde82d8edd762c1912129a68e4 Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Sun, 4 Jun 2023 15:37:32 -0300 Subject: [PATCH 0381/1072] Update redirects.txt --- docs/redirects.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/redirects.txt b/docs/redirects.txt index 3abc8df2b7f..98d6f6b8f7c 100644 --- a/docs/redirects.txt +++ b/docs/redirects.txt @@ -13,7 +13,7 @@ data_types/date.md sql-reference/data-types/date.md data_types/datetime.md sql-reference/data-types/datetime.md data_types/datetime64.md sql-reference/data-types/datetime64.md data_types/decimal.md sql-reference/data-types/decimal.md -data_types/domains/ipv4.md sql-reference/data-types/domains/ipv4.md +data_types/domains/ipv4.md sql-reference/data-types/ipv4.md data_types/domains/ipv6.md sql-reference/data-types/ipv6.md data_types/domains/overview.md sql-reference/data-types/domains/overview.md data_types/enum.md sql-reference/data-types/enum.md From 6fe0aa531e28c27287fb5b6f57536ede0016b20a Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Sun, 4 Jun 2023 16:18:53 -0300 Subject: [PATCH 0382/1072] Update summap.md --- .../aggregate-functions/reference/summap.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/docs/en/sql-reference/aggregate-functions/reference/summap.md b/docs/en/sql-reference/aggregate-functions/reference/summap.md index 1acfde3783a..d63e8b81716 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/summap.md +++ b/docs/en/sql-reference/aggregate-functions/reference/summap.md @@ -5,7 +5,9 @@ sidebar_position: 141 # sumMap -Syntax: `sumMap(key, value)` or `sumMap(Tuple(key, value))` +Syntax: [sumMap(key , value )](../../data-types/array.md) or [sumMap(Tuple(key , value ))](../../data-types/tuple.md) + +Alias: `sumMappedArrays`. Totals the `value` array according to the keys specified in the `key` array. @@ -27,6 +29,7 @@ CREATE TABLE sum_map( ), statusMapTuple Tuple(Array(Int32), Array(Int32)) ) ENGINE = Log; + INSERT INTO sum_map VALUES ('2000-01-01', '2000-01-01 00:00:00', [1, 2, 3], [10, 10, 10], ([1, 2, 3], [10, 10, 10])), ('2000-01-01', '2000-01-01 00:00:00', [3, 4, 5], [10, 10, 10], ([3, 4, 5], [10, 10, 10])), @@ -47,3 +50,7 @@ GROUP BY timeslot │ 2000-01-01 00:01:00 │ ([4,5,6,7,8],[10,10,20,10,10]) │ ([4,5,6,7,8],[10,10,20,10,10]) │ └─────────────────────┴──────────────────────────────────────────────┴────────────────────────────────┘ ``` + +**See Also** + +- [-Map combinator for Map datatype](../combinators.md#-map) From 424a043c326cb8451bfc0da23fc2d1df2385fd4e Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Sun, 4 Jun 2023 16:22:26 -0300 Subject: [PATCH 0383/1072] Update map.md --- docs/en/sql-reference/data-types/map.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/en/sql-reference/data-types/map.md b/docs/en/sql-reference/data-types/map.md index 0ea183d73d8..e0c8b98f9f8 100644 --- a/docs/en/sql-reference/data-types/map.md +++ b/docs/en/sql-reference/data-types/map.md @@ -108,6 +108,7 @@ Result: - [map()](../../sql-reference/functions/tuple-map-functions.md#function-map) function - [CAST()](../../sql-reference/functions/type-conversion-functions.md#type_conversion_function-cast) function +- [-Map combinator for Map datatype](../aggregate-functions/combinators.md#-map) ## Related content From 7c64b1f26634acc8e93dd65c00b8b0dfb6419f1d Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Sun, 4 Jun 2023 16:32:22 -0300 Subject: [PATCH 0384/1072] Update combinators.md --- .../aggregate-functions/combinators.md | 29 ++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/docs/en/sql-reference/aggregate-functions/combinators.md b/docs/en/sql-reference/aggregate-functions/combinators.md index e1db5d8d23e..fd693430064 100644 --- a/docs/en/sql-reference/aggregate-functions/combinators.md +++ b/docs/en/sql-reference/aggregate-functions/combinators.md @@ -30,7 +30,34 @@ Example 2: `uniqArray(arr)` – Counts the number of unique elements in all ‘a The -Map suffix can be appended to any aggregate function. This will create an aggregate function which gets Map type as an argument, and aggregates values of each key of the map separately using the specified aggregate function. The result is also of a Map type. -Examples: `sumMap(map(1,1))`, `avgMap(map('a', 1))`. +**Example** + +```sql +CREATE TABLE map_map( + date Date, + timeslot DateTime, + status Map(String, UInt64) +) ENGINE = Log; + +INSERT INTO map_map VALUES + ('2000-01-01', '2000-01-01 00:00:00', (['a', 'b', 'c'], [10, 10, 10])), + ('2000-01-01', '2000-01-01 00:00:00', (['c', 'd', 'e'], [10, 10, 10])), + ('2000-01-01', '2000-01-01 00:01:00', (['d', 'e', 'f'], [10, 10, 10])), + ('2000-01-01', '2000-01-01 00:01:00', (['f', 'g', 'g'], [10, 10, 10])); + +SELECT + timeslot, + sumMap(status), + avgMap(status), + minMap(status) +FROM map_map +GROUP BY timeslot; + +┌────────────timeslot─┬─sumMap(status)───────────────────────┬─avgMap(status)───────────────────────┬─minMap(status)───────────────────────┐ +│ 2000-01-01 00:00:00 │ {'a':10,'b':10,'c':20,'d':10,'e':10} │ {'a':10,'b':10,'c':10,'d':10,'e':10} │ {'a':10,'b':10,'c':10,'d':10,'e':10} │ +│ 2000-01-01 00:01:00 │ {'d':10,'e':10,'f':20,'g':20} │ {'d':10,'e':10,'f':10,'g':10} │ {'d':10,'e':10,'f':10,'g':10} │ +└─────────────────────┴──────────────────────────────────────┴──────────────────────────────────────┴──────────────────────────────────────┘ +``` ## -SimpleState From d81884be82bf32440cbaae3f63349716898aa1e3 Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Sun, 4 Jun 2023 17:13:25 -0300 Subject: [PATCH 0385/1072] Update summap.md --- docs/en/sql-reference/aggregate-functions/reference/summap.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/aggregate-functions/reference/summap.md b/docs/en/sql-reference/aggregate-functions/reference/summap.md index d63e8b81716..0725b80c2a2 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/summap.md +++ b/docs/en/sql-reference/aggregate-functions/reference/summap.md @@ -5,7 +5,7 @@ sidebar_position: 141 # sumMap -Syntax: [sumMap(key , value )](../../data-types/array.md) or [sumMap(Tuple(key , value ))](../../data-types/tuple.md) +Syntax: `sumMap(key , value )` [Array](../../data-types/array.md) or `sumMap(Tuple(key , value ))` [Tuple](../../data-types/tuple.md) Alias: `sumMappedArrays`. From d93ee14e328ce0c8026a1d94a4233b9905fed793 Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Sun, 4 Jun 2023 17:15:16 -0300 Subject: [PATCH 0386/1072] Update summap.md --- docs/en/sql-reference/aggregate-functions/reference/summap.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/aggregate-functions/reference/summap.md b/docs/en/sql-reference/aggregate-functions/reference/summap.md index 0725b80c2a2..aa12fad9559 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/summap.md +++ b/docs/en/sql-reference/aggregate-functions/reference/summap.md @@ -5,7 +5,7 @@ sidebar_position: 141 # sumMap -Syntax: `sumMap(key , value )` [Array](../../data-types/array.md) or `sumMap(Tuple(key , value ))` [Tuple](../../data-types/tuple.md) +Syntax: `sumMap(key <[Array](../../data-types/array.md)>, value <[Array](../../data-types/array.md)>)` or `sumMap([Tuple](../../data-types/tuple.md)(key , value ))` Alias: `sumMappedArrays`. From 7ca719c73a41f54c3b2032297e8d0b3f9f6dd7a2 Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Sun, 4 Jun 2023 17:19:07 -0300 Subject: [PATCH 0387/1072] Update summap.md --- docs/en/sql-reference/aggregate-functions/reference/summap.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/en/sql-reference/aggregate-functions/reference/summap.md b/docs/en/sql-reference/aggregate-functions/reference/summap.md index aa12fad9559..baa25edc250 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/summap.md +++ b/docs/en/sql-reference/aggregate-functions/reference/summap.md @@ -5,7 +5,9 @@ sidebar_position: 141 # sumMap -Syntax: `sumMap(key <[Array](../../data-types/array.md)>, value <[Array](../../data-types/array.md)>)` or `sumMap([Tuple](../../data-types/tuple.md)(key , value ))` +Syntax: `sumMap(key , value )` [Array type](../../data-types/array.md) or `sumMap(Tuple(key , value ))` [Tuple type](../../data-types/tuple.md). + +Arguments: Alias: `sumMappedArrays`. From 0d9728410fd7631ff0e0755c749024de5b7ffdb1 Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Sun, 4 Jun 2023 17:31:12 -0300 Subject: [PATCH 0388/1072] Update first_value.md --- .../reference/first_value.md | 35 ++++++++++++++++--- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/docs/en/sql-reference/aggregate-functions/reference/first_value.md b/docs/en/sql-reference/aggregate-functions/reference/first_value.md index f343ca3f66c..15e0b113afd 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/first_value.md +++ b/docs/en/sql-reference/aggregate-functions/reference/first_value.md @@ -6,24 +6,32 @@ sidebar_position: 7 # first_value Selects the first encountered value, similar to `any`, but could accept NULL. +Mostly it should be used with [Window Functions](../../window-functions.md). +Without Window Functions the result will be random if the source stream is not ordered. ## examples ```sql -insert into test_data (a,b) values (1,null), (2,3), (4, 5), (6,null) +CREATE TABLE test_data +( + a Int64, + b Nullable(Int64) +) +ENGINE = Memory; + +INSERT INTO test_data (a, b) FORMAT Values (1,null), (2,3), (4, 5), (6,null); ``` ### example1 The NULL value is ignored at default. ```sql -select first_value(b) from test_data +select first_value(b) from test_data; ``` ```text ┌─first_value_ignore_nulls(b)─┐ │ 3 │ └─────────────────────────────┘ - ``` ### example2 @@ -36,7 +44,6 @@ select first_value(b) ignore nulls from test_data ┌─first_value_ignore_nulls(b)─┐ │ 3 │ └─────────────────────────────┘ - ``` ### example3 @@ -46,10 +53,28 @@ select first_value(b) respect nulls from test_data ``` ```text - ┌─first_value_respect_nulls(b)─┐ │ ᴺᵁᴸᴸ │ └──────────────────────────────┘ ``` +### example4 +Stabilized result using the sub-query with `ORDER BY`. +```sql +SELECT + first_value_respect_nulls(b), + first_value(b) +FROM +( + SELECT * + FROM test_data + ORDER BY a ASC +) +``` + +```text +┌─first_value_respect_nulls(b)─┬─first_value(b)─┐ +│ ᴺᵁᴸᴸ │ 3 │ +└──────────────────────────────┴────────────────┘ +``` From 2e26e84909098c97eab553eb7fcfa98b5a92bfae Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Sun, 4 Jun 2023 17:34:31 -0300 Subject: [PATCH 0389/1072] Update last_value.md --- .../reference/last_value.md | 32 +++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/docs/en/sql-reference/aggregate-functions/reference/last_value.md b/docs/en/sql-reference/aggregate-functions/reference/last_value.md index 7b6e14e4a55..77b4f3d1b60 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/last_value.md +++ b/docs/en/sql-reference/aggregate-functions/reference/last_value.md @@ -6,12 +6,20 @@ sidebar_position: 8 # last_value Selects the last encountered value, similar to `anyLast`, but could accept NULL. - +Mostly it should be used with [Window Functions](../../window-functions.md). +Without Window Functions the result will be random if the source stream is not ordered. ## examples ```sql -insert into test_data (a,b) values (1,null), (2,3), (4, 5), (6,null) +CREATE TABLE test_data +( + a Int64, + b Nullable(Int64) +) +ENGINE = Memory; + +INSERT INTO test_data (a, b) Values (1,null), (2,3), (4, 5), (6,null) ``` ### example1 @@ -50,4 +58,24 @@ select last_value(b) respect nulls from test_data └─────────────────────────────┘ ``` +### example4 +Stabilized result using the sub-query with `ORDER BY`. +```sql +SELECT + last_value_respect_nulls(b), + last_value(b) +FROM +( + SELECT * + FROM test_data + ORDER BY a ASC +) +``` + +```text +┌─last_value_respect_nulls(b)─┬─last_value(b)─┐ +│ ᴺᵁᴸᴸ │ 5 │ +└─────────────────────────────┴───────────────┘ +``` + From bd7a593dd3ae4447f23c8658a04fb79d164b9d84 Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Sun, 4 Jun 2023 17:34:46 -0300 Subject: [PATCH 0390/1072] Update first_value.md --- .../sql-reference/aggregate-functions/reference/first_value.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/aggregate-functions/reference/first_value.md b/docs/en/sql-reference/aggregate-functions/reference/first_value.md index 15e0b113afd..6b764ec5739 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/first_value.md +++ b/docs/en/sql-reference/aggregate-functions/reference/first_value.md @@ -19,7 +19,7 @@ CREATE TABLE test_data ) ENGINE = Memory; -INSERT INTO test_data (a, b) FORMAT Values (1,null), (2,3), (4, 5), (6,null); +INSERT INTO test_data (a, b) Values (1,null), (2,3), (4, 5), (6,null); ``` ### example1 From fb21a6907a3eef7a054b44f9c81c5bc3a05f5cb7 Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Sun, 4 Jun 2023 17:38:05 -0300 Subject: [PATCH 0391/1072] Update last_value.md --- .../sql-reference/aggregate-functions/reference/last_value.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/aggregate-functions/reference/last_value.md b/docs/en/sql-reference/aggregate-functions/reference/last_value.md index 77b4f3d1b60..21a86a5f130 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/last_value.md +++ b/docs/en/sql-reference/aggregate-functions/reference/last_value.md @@ -6,7 +6,7 @@ sidebar_position: 8 # last_value Selects the last encountered value, similar to `anyLast`, but could accept NULL. -Mostly it should be used with [Window Functions](../../window-functions.md). +Mostly it should be used with [Window Functions](../../window-functions/index.md). Without Window Functions the result will be random if the source stream is not ordered. ## examples From bcd89cbbf28c79e831f3e39a422319e5b6ea5915 Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Sun, 4 Jun 2023 17:38:38 -0300 Subject: [PATCH 0392/1072] Update first_value.md --- .../sql-reference/aggregate-functions/reference/first_value.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/aggregate-functions/reference/first_value.md b/docs/en/sql-reference/aggregate-functions/reference/first_value.md index 6b764ec5739..c1965b23fe3 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/first_value.md +++ b/docs/en/sql-reference/aggregate-functions/reference/first_value.md @@ -6,7 +6,7 @@ sidebar_position: 7 # first_value Selects the first encountered value, similar to `any`, but could accept NULL. -Mostly it should be used with [Window Functions](../../window-functions.md). +Mostly it should be used with [Window Functions](../../window-functions/index.md). Without Window Functions the result will be random if the source stream is not ordered. ## examples From bf127f4e1e3a08de7ae822d0b53d25ad80899efa Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Sat, 3 Jun 2023 21:31:43 +0200 Subject: [PATCH 0393/1072] MSan support for Rust Previously you have to unpoison memory from the Rust, however Rust does supports MSan, so let's simply use it. But for this we need nightly Rust and recompile standard library. Signed-off-by: Azat Khuzhin --- docker/packager/binary/Dockerfile | 4 +++- rust/.cargo/config.toml.in | 7 +++++++ rust/BLAKE3/include/blake3.h | 2 -- rust/BLAKE3/src/lib.rs | 25 ------------------------- rust/CMakeLists.txt | 10 ++++++++++ src/Functions/FunctionsHashing.h | 7 +------ 6 files changed, 21 insertions(+), 34 deletions(-) diff --git a/docker/packager/binary/Dockerfile b/docker/packager/binary/Dockerfile index fa860b2207f..dd21c8552d3 100644 --- a/docker/packager/binary/Dockerfile +++ b/docker/packager/binary/Dockerfile @@ -46,10 +46,12 @@ ENV CXX=clang++-${LLVM_VERSION} # Rust toolchain and libraries ENV RUSTUP_HOME=/rust/rustup ENV CARGO_HOME=/rust/cargo -ENV PATH="/rust/cargo/env:${PATH}" ENV PATH="/rust/cargo/bin:${PATH}" RUN curl https://sh.rustup.rs -sSf | bash -s -- -y && \ chmod 777 -R /rust && \ + rustup toolchain install nightly && \ + rustup default nightly && \ + rustup component add rust-src && \ rustup target add aarch64-unknown-linux-gnu && \ rustup target add x86_64-apple-darwin && \ rustup target add x86_64-unknown-freebsd && \ diff --git a/rust/.cargo/config.toml.in b/rust/.cargo/config.toml.in index a1dd966117b..db83145d449 100644 --- a/rust/.cargo/config.toml.in +++ b/rust/.cargo/config.toml.in @@ -1,3 +1,10 @@ [env] CFLAGS = "@RUST_CFLAGS@" CXXFLAGS = "@RUST_CXXFLAGS@" + +[build] +rustflags = @RUSTFLAGS@ +rustdocflags = @RUSTFLAGS@ + +[unstable] +@RUST_CARGO_BUILD_STD@ diff --git a/rust/BLAKE3/include/blake3.h b/rust/BLAKE3/include/blake3.h index 85572506d43..5dc7d5bd902 100644 --- a/rust/BLAKE3/include/blake3.h +++ b/rust/BLAKE3/include/blake3.h @@ -8,8 +8,6 @@ extern "C" { char *blake3_apply_shim(const char *begin, uint32_t _size, uint8_t *out_char_data); -char *blake3_apply_shim_msan_compat(const char *begin, uint32_t size, uint8_t *out_char_data); - void blake3_free_char_pointer(char *ptr_to_free); } // extern "C" diff --git a/rust/BLAKE3/src/lib.rs b/rust/BLAKE3/src/lib.rs index 2b54787589f..011145d2f71 100644 --- a/rust/BLAKE3/src/lib.rs +++ b/rust/BLAKE3/src/lib.rs @@ -3,7 +3,6 @@ extern crate libc; use std::ffi::{CStr, CString}; use std::os::raw::c_char; -use std::mem; #[no_mangle] pub unsafe extern "C" fn blake3_apply_shim( @@ -24,30 +23,6 @@ pub unsafe extern "C" fn blake3_apply_shim( std::ptr::null_mut() } -#[no_mangle] -pub unsafe extern "C" fn blake3_apply_shim_msan_compat( - mut begin: *const c_char, - size: u32, - out_char_data: *mut u8, -) -> *mut c_char { - if begin.is_null() { - let err_str = CString::new("input was a null pointer").unwrap(); - return err_str.into_raw(); - } - libc::memset(out_char_data as *mut libc::c_void, 0, mem::size_of::()); - let mut hasher = blake3::Hasher::new(); - let mut vec = Vec::::new(); - for _ in 0..size { - vec.push(*begin as u8); - begin = begin.add(1); - } - let input_res = vec.as_mut_slice(); - hasher.update(input_res); - let mut reader = hasher.finalize_xof(); - reader.fill(std::slice::from_raw_parts_mut(out_char_data, blake3::OUT_LEN)); - std::ptr::null_mut() -} - // Freeing memory according to docs: https://doc.rust-lang.org/std/ffi/struct.CString.html#method.into_raw #[no_mangle] pub unsafe extern "C" fn blake3_free_char_pointer(ptr_to_free: *mut c_char) { diff --git a/rust/CMakeLists.txt b/rust/CMakeLists.txt index d229894791a..6700ead9786 100644 --- a/rust/CMakeLists.txt +++ b/rust/CMakeLists.txt @@ -14,8 +14,18 @@ macro(configure_rustc) set(RUST_CFLAGS "${RUST_CFLAGS} --sysroot ${CMAKE_SYSROOT}") endif() + set(RUSTFLAGS "[]") + set(RUST_CARGO_BUILD_STD "") + # For more info: https://doc.rust-lang.org/beta/unstable-book/compiler-flags/sanitizer.html#memorysanitizer + if (SANITIZE STREQUAL "memory") + set(RUST_CARGO_BUILD_STD "build-std = [\"std\", \"panic_abort\", \"core\", \"alloc\"]") + set(RUSTFLAGS "[\"-Zsanitizer=memory\", \"-Zsanitizer-memory-track-origins\"]") + endif() + message(STATUS "RUST_CFLAGS: ${RUST_CFLAGS}") message(STATUS "RUST_CXXFLAGS: ${RUST_CXXFLAGS}") + message(STATUS "RUSTFLAGS: ${RUSTFLAGS}") + message(STATUS "RUST_CARGO_BUILD_STD: ${RUST_CARGO_BUILD_STD}") # NOTE: requires RW access for the source dir configure_file("${CMAKE_CURRENT_SOURCE_DIR}/.cargo/config.toml.in" "${CMAKE_CURRENT_SOURCE_DIR}/.cargo/config.toml" @ONLY) diff --git a/src/Functions/FunctionsHashing.h b/src/Functions/FunctionsHashing.h index 32e3fbbd4ea..a4d4fbd085d 100644 --- a/src/Functions/FunctionsHashing.h +++ b/src/Functions/FunctionsHashing.h @@ -816,12 +816,7 @@ struct ImplBLAKE3 #else static void apply(const char * begin, const size_t size, unsigned char* out_char_data) { -# if defined(MEMORY_SANITIZER) - auto err_msg = blake3_apply_shim_msan_compat(begin, safe_cast(size), out_char_data); - __msan_unpoison(out_char_data, length); -# else - auto err_msg = blake3_apply_shim(begin, safe_cast(size), out_char_data); -# endif + auto err_msg = blake3_apply_shim(begin, safe_cast(size), out_char_data); if (err_msg != nullptr) { auto err_st = std::string(err_msg); From 045573e92565a6a58a7eae80cc11bb686807e9ee Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Sun, 4 Jun 2023 18:10:34 -0300 Subject: [PATCH 0394/1072] Update datetime64.md --- .../en/sql-reference/data-types/datetime64.md | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/docs/en/sql-reference/data-types/datetime64.md b/docs/en/sql-reference/data-types/datetime64.md index 2d4035831fa..da3d9dc4f65 100644 --- a/docs/en/sql-reference/data-types/datetime64.md +++ b/docs/en/sql-reference/data-types/datetime64.md @@ -63,7 +63,7 @@ SELECT * FROM dt WHERE timestamp = toDateTime64('2019-01-01 00:00:00', 3, 'Asia/ ``` text ┌───────────────timestamp─┬─event_id─┐ -│ 2019-01-01 00:00:00.000 │ 2 │ +│ 2019-01-01 00:00:00.000 │ 3 │ └─────────────────────────┴──────────┘ ``` @@ -75,8 +75,8 @@ SELECT * FROM dt WHERE timestamp = toDateTime64(1546300800.123, 3); ``` text ┌───────────────timestamp─┬─event_id─┐ -│ 2019-01-01 00:00:00.123 │ 1 │ -│ 2019-01-01 00:00:00.123 │ 2 │ +│ 2019-01-01 03:00:00.123 │ 1 │ +│ 2019-01-01 03:00:00.123 │ 2 │ └─────────────────────────┴──────────┘ ``` @@ -91,7 +91,7 @@ SELECT toDateTime64(now(), 3, 'Asia/Istanbul') AS column, toTypeName(column) AS ``` text ┌──────────────────column─┬─x──────────────────────────────┐ -│ 2019-10-16 04:12:04.000 │ DateTime64(3, 'Asia/Istanbul') │ +│ 2023-06-05 00:09:52.000 │ DateTime64(3, 'Asia/Istanbul') │ └─────────────────────────┴────────────────────────────────┘ ``` @@ -100,13 +100,14 @@ SELECT toDateTime64(now(), 3, 'Asia/Istanbul') AS column, toTypeName(column) AS ``` sql SELECT toDateTime64(timestamp, 3, 'Europe/London') as lon_time, -toDateTime64(timestamp, 3, 'Asia/Istanbul') as mos_time +toDateTime64(timestamp, 3, 'Asia/Istanbul') as istanbul_time FROM dt; ``` ``` text -┌───────────────lon_time──┬────────────────mos_time─┐ -│ 2019-01-01 00:00:00.000 │ 2019-01-01 03:00:00.000 │ +┌────────────────lon_time─┬───────────istanbul_time─┐ +│ 2019-01-01 00:00:00.123 │ 2019-01-01 03:00:00.123 │ +│ 2019-01-01 00:00:00.123 │ 2019-01-01 03:00:00.123 │ │ 2018-12-31 21:00:00.000 │ 2019-01-01 00:00:00.000 │ └─────────────────────────┴─────────────────────────┘ ``` @@ -115,10 +116,9 @@ FROM dt; - [Type conversion functions](../../sql-reference/functions/type-conversion-functions.md) - [Functions for working with dates and times](../../sql-reference/functions/date-time-functions.md) -- [Functions for working with arrays](../../sql-reference/functions/array-functions.md) -- [The `date_time_input_format` setting](../../operations/settings/settings.md#settings-date_time_input_format) -- [The `date_time_output_format` setting](../../operations/settings/settings.md#settings-date_time_output_format) +- [The `date_time_input_format` setting](../../operations/settings/formats.md#date_time_input_format) +- [The `date_time_output_format` setting](../../operations/settings/formats.md#date_time_output_format) - [The `timezone` server configuration parameter](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) -- [Operators for working with dates and times](../../sql-reference/operators/index.md#operators-datetime) +- [Operators for working with dates and times](../../sql-reference/operators/index.md#operators-for-working-with-dates-and-times) - [`Date` data type](../../sql-reference/data-types/date.md) - [`DateTime` data type](../../sql-reference/data-types/datetime.md) From 136efd68257ccaac503b2e47957803e283165afc Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Sun, 4 Jun 2023 18:58:24 -0300 Subject: [PATCH 0395/1072] Update datetime64.md --- docs/en/sql-reference/data-types/datetime64.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/en/sql-reference/data-types/datetime64.md b/docs/en/sql-reference/data-types/datetime64.md index da3d9dc4f65..793691850b1 100644 --- a/docs/en/sql-reference/data-types/datetime64.md +++ b/docs/en/sql-reference/data-types/datetime64.md @@ -116,8 +116,8 @@ FROM dt; - [Type conversion functions](../../sql-reference/functions/type-conversion-functions.md) - [Functions for working with dates and times](../../sql-reference/functions/date-time-functions.md) -- [The `date_time_input_format` setting](../../operations/settings/formats.md#date_time_input_format) -- [The `date_time_output_format` setting](../../operations/settings/formats.md#date_time_output_format) +- [The `date_time_input_format` setting](../../operations/settings/settings-formats.md#date_time_input_format) +- [The `date_time_output_format` setting](../../operations/settings/settings-formats.md#date_time_output_format) - [The `timezone` server configuration parameter](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) - [Operators for working with dates and times](../../sql-reference/operators/index.md#operators-for-working-with-dates-and-times) - [`Date` data type](../../sql-reference/data-types/date.md) From aa35689cb10dbdbab0c8475a7f92b8978e6eb6b8 Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Sun, 4 Jun 2023 19:39:30 -0300 Subject: [PATCH 0396/1072] fix links in other lang-s --- docs/ru/operations/system-tables/query_log.md | 4 ++-- docs/ru/operations/system-tables/query_thread_log.md | 4 ++-- docs/ru/operations/system-tables/session_log.md | 2 +- docs/ru/operations/system-tables/zookeeper_log.md | 2 +- docs/ru/sql-reference/data-types/{domains => }/ipv4.md | 2 +- docs/ru/sql-reference/data-types/{domains => }/ipv6.md | 2 +- docs/ru/sql-reference/functions/ip-address-functions.md | 4 ++-- docs/zh/operations/system-tables/query_log.md | 4 ++-- docs/zh/operations/system-tables/query_thread_log.md | 4 ++-- docs/zh/operations/system-tables/zookeeper_log.md | 2 +- docs/zh/sql-reference/data-types/{domains => }/ipv4.md | 2 +- docs/zh/sql-reference/data-types/{domains => }/ipv6.md | 2 +- 12 files changed, 17 insertions(+), 17 deletions(-) rename docs/ru/sql-reference/data-types/{domains => }/ipv4.md (98%) rename docs/ru/sql-reference/data-types/{domains => }/ipv6.md (98%) rename docs/zh/sql-reference/data-types/{domains => }/ipv4.md (98%) rename docs/zh/sql-reference/data-types/{domains => }/ipv6.md (98%) diff --git a/docs/ru/operations/system-tables/query_log.md b/docs/ru/operations/system-tables/query_log.md index a55528bd829..8f858c14fb1 100644 --- a/docs/ru/operations/system-tables/query_log.md +++ b/docs/ru/operations/system-tables/query_log.md @@ -69,11 +69,11 @@ ClickHouse не удаляет данные из таблица автомати - 0 — запрос был инициирован другим запросом при выполнении распределенного запроса. - `user` ([String](../../sql-reference/data-types/string.md)) — пользователь, запустивший текущий запрос. - `query_id` ([String](../../sql-reference/data-types/string.md)) — ID запроса. -- `address` ([IPv6](../../sql-reference/data-types/domains/ipv6.md)) — IP адрес, с которого пришел запрос. +- `address` ([IPv6](../../sql-reference/data-types/ipv6.md)) — IP адрес, с которого пришел запрос. - `port` ([UInt16](../../sql-reference/data-types/int-uint.md)) — порт, с которого клиент сделал запрос - `initial_user` ([String](../../sql-reference/data-types/string.md)) — пользователь, запустивший первоначальный запрос (для распределенных запросов). - `initial_query_id` ([String](../../sql-reference/data-types/string.md)) — ID родительского запроса. -- `initial_address` ([IPv6](../../sql-reference/data-types/domains/ipv6.md)) — IP адрес, с которого пришел родительский запрос. +- `initial_address` ([IPv6](../../sql-reference/data-types/ipv6.md)) — IP адрес, с которого пришел родительский запрос. - `initial_port` ([UInt16](../../sql-reference/data-types/int-uint.md)) — порт, с которого клиент сделал родительский запрос. - `initial_query_start_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — время начала обработки запроса (для распределенных запросов). - `initial_query_start_time_microseconds` ([DateTime64](../../sql-reference/data-types/datetime64.md)) — время начала обработки запроса с точностью до микросекунд (для распределенных запросов). diff --git a/docs/ru/operations/system-tables/query_thread_log.md b/docs/ru/operations/system-tables/query_thread_log.md index c9aabb02cad..1a256e1657a 100644 --- a/docs/ru/operations/system-tables/query_thread_log.md +++ b/docs/ru/operations/system-tables/query_thread_log.md @@ -39,11 +39,11 @@ ClickHouse не удаляет данные из таблицы автомати - 0 — запрос был инициирован другим запросом при распределенном запросе. - `user` ([String](../../sql-reference/data-types/string.md)) — пользователь, запустивший текущий запрос. - `query_id` ([String](../../sql-reference/data-types/string.md)) — ID запроса. -- `address` ([IPv6](../../sql-reference/data-types/domains/ipv6.md)) — IP адрес, с которого пришел запрос. +- `address` ([IPv6](../../sql-reference/data-types/ipv6.md)) — IP адрес, с которого пришел запрос. - `port` ([UInt16](../../sql-reference/data-types/int-uint.md#uint-ranges)) — порт, с которого пришел запрос. - `initial_user` ([String](../../sql-reference/data-types/string.md)) — пользователь, запустивший первоначальный запрос (для распределенных запросов). - `initial_query_id` ([String](../../sql-reference/data-types/string.md)) — ID родительского запроса. -- `initial_address` ([IPv6](../../sql-reference/data-types/domains/ipv6.md)) — IP адрес, с которого пришел родительский запрос. +- `initial_address` ([IPv6](../../sql-reference/data-types/ipv6.md)) — IP адрес, с которого пришел родительский запрос. - `initial_port` ([UInt16](../../sql-reference/data-types/int-uint.md#uint-ranges)) — порт, пришел родительский запрос. - `interface` ([UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges)) — интерфейс, с которого ушёл запрос. Возможные значения: - 1 — TCP. diff --git a/docs/ru/operations/system-tables/session_log.md b/docs/ru/operations/system-tables/session_log.md index 1f313e7815a..5849cb51ab4 100644 --- a/docs/ru/operations/system-tables/session_log.md +++ b/docs/ru/operations/system-tables/session_log.md @@ -27,7 +27,7 @@ slug: /ru/operations/system-tables/session_log - `profiles` ([Array](../../sql-reference/data-types/array.md)([LowCardinality(String)](../../sql-reference/data-types/lowcardinality.md))) — список профилей, установленных для всех ролей и (или) пользователей. - `roles` ([Array](../../sql-reference/data-types/array.md)([LowCardinality(String)](../../sql-reference/data-types/lowcardinality.md))) — список ролей, к которым применяется данный профиль. - `settings` ([Array](../../sql-reference/data-types/array.md)([Tuple](../../sql-reference/data-types/tuple.md)([LowCardinality(String)](../../sql-reference/data-types/lowcardinality.md), [String](../../sql-reference/data-types/string.md)))) — настройки, которые были изменены при входе или выходе клиента из системы. -- `client_address` ([IPv6](../../sql-reference/data-types/domains/ipv6.md)) — IP-адрес, который использовался для входа или выхода из системы. +- `client_address` ([IPv6](../../sql-reference/data-types/ipv6.md)) — IP-адрес, который использовался для входа или выхода из системы. - `client_port` ([UInt16](../../sql-reference/data-types/int-uint.md)) — порт клиента, который использовался для входа или выхода из системы. - `interface` ([Enum8](../../sql-reference/data-types/enum.md)) — интерфейс, с которого был инициирован вход в систему. Возможные значения: - `TCP` diff --git a/docs/ru/operations/system-tables/zookeeper_log.md b/docs/ru/operations/system-tables/zookeeper_log.md index ccbdd5110ad..9874cb3a269 100644 --- a/docs/ru/operations/system-tables/zookeeper_log.md +++ b/docs/ru/operations/system-tables/zookeeper_log.md @@ -15,7 +15,7 @@ slug: /ru/operations/system-tables/zookeeper_log - `Finalize` — соединение разорвано, ответ не получен. - `event_date` ([Date](../../sql-reference/data-types/date.md)) — дата, когда произошло событие. - `event_time` ([DateTime64](../../sql-reference/data-types/datetime64.md)) — дата и время, когда произошло событие. -- `address` ([IPv6](../../sql-reference/data-types/domains/ipv6.md)) — IP адрес сервера ZooKeeper, с которого был сделан запрос. +- `address` ([IPv6](../../sql-reference/data-types/ipv6.md)) — IP адрес сервера ZooKeeper, с которого был сделан запрос. - `port` ([UInt16](../../sql-reference/data-types/int-uint.md)) — порт сервера ZooKeeper, с которого был сделан запрос. - `session_id` ([Int64](../../sql-reference/data-types/int-uint.md)) — идентификатор сессии, который сервер ZooKeeper создает для каждого соединения. - `xid` ([Int32](../../sql-reference/data-types/int-uint.md)) — идентификатор запроса внутри сессии. Обычно это последовательный номер запроса, одинаковый у строки запроса и у парной строки `response`/`finalize`. diff --git a/docs/ru/sql-reference/data-types/domains/ipv4.md b/docs/ru/sql-reference/data-types/ipv4.md similarity index 98% rename from docs/ru/sql-reference/data-types/domains/ipv4.md rename to docs/ru/sql-reference/data-types/ipv4.md index 57a19e282ae..8d308785eea 100644 --- a/docs/ru/sql-reference/data-types/domains/ipv4.md +++ b/docs/ru/sql-reference/data-types/ipv4.md @@ -1,5 +1,5 @@ --- -slug: /ru/sql-reference/data-types/domains/ipv4 +slug: /ru/sql-reference/data-types/ipv4 sidebar_position: 59 sidebar_label: IPv4 --- diff --git a/docs/ru/sql-reference/data-types/domains/ipv6.md b/docs/ru/sql-reference/data-types/ipv6.md similarity index 98% rename from docs/ru/sql-reference/data-types/domains/ipv6.md rename to docs/ru/sql-reference/data-types/ipv6.md index fdfb26f68c1..808068ce90a 100644 --- a/docs/ru/sql-reference/data-types/domains/ipv6.md +++ b/docs/ru/sql-reference/data-types/ipv6.md @@ -1,5 +1,5 @@ --- -slug: /ru/sql-reference/data-types/domains/ipv6 +slug: /ru/sql-reference/data-types/ipv6 sidebar_position: 60 sidebar_label: IPv6 --- diff --git a/docs/ru/sql-reference/functions/ip-address-functions.md b/docs/ru/sql-reference/functions/ip-address-functions.md index 96d4b737c88..d1a72b82b67 100644 --- a/docs/ru/sql-reference/functions/ip-address-functions.md +++ b/docs/ru/sql-reference/functions/ip-address-functions.md @@ -265,7 +265,7 @@ SELECT ## toIPv6 {#toipv6string} -Приводит строку с адресом в формате IPv6 к типу [IPv6](../../sql-reference/data-types/domains/ipv6.md). Возвращает пустое значение, если входящая строка не является корректным IP адресом. +Приводит строку с адресом в формате IPv6 к типу [IPv6](../../sql-reference/data-types/ipv6.md). Возвращает пустое значение, если входящая строка не является корректным IP адресом. Похоже на функцию [IPv6StringToNum](#ipv6stringtonums), которая представляет адрес IPv6 в двоичном виде. Если входящая строка содержит корректный IPv4 адрес, функция возвращает его IPv6 эквивалент. @@ -284,7 +284,7 @@ toIPv6(string) - IP адрес. -Тип: [IPv6](../../sql-reference/data-types/domains/ipv6.md). +Тип: [IPv6](../../sql-reference/data-types/ipv6.md). **Примеры** diff --git a/docs/zh/operations/system-tables/query_log.md b/docs/zh/operations/system-tables/query_log.md index 7149282dfcc..0ba669906cb 100644 --- a/docs/zh/operations/system-tables/query_log.md +++ b/docs/zh/operations/system-tables/query_log.md @@ -60,11 +60,11 @@ ClickHouse不会自动从表中删除数据。更多详情请看 [introduction]( - 0 — 由另一个查询发起的,作为分布式查询的一部分. - `user` ([String](../../sql-reference/data-types/string.md)) — 发起查询的用户. - `query_id` ([String](../../sql-reference/data-types/string.md)) — 查询ID. -- `address` ([IPv6](../../sql-reference/data-types/domains/ipv6.md)) — 发起查询的客户端IP地址. +- `address` ([IPv6](../../sql-reference/data-types/ipv6.md)) — 发起查询的客户端IP地址. - `port` ([UInt16](../../sql-reference/data-types/int-uint.md)) — 发起查询的客户端端口. - `initial_user` ([String](../../sql-reference/data-types/string.md)) — 初始查询的用户名(用于分布式查询执行). - `initial_query_id` ([String](../../sql-reference/data-types/string.md)) — 运行初始查询的ID(用于分布式查询执行). -- `initial_address` ([IPv6](../../sql-reference/data-types/domains/ipv6.md)) — 运行父查询的IP地址. +- `initial_address` ([IPv6](../../sql-reference/data-types/ipv6.md)) — 运行父查询的IP地址. - `initial_port` ([UInt16](../../sql-reference/data-types/int-uint.md)) — 发起父查询的客户端端口. - `interface` ([UInt8](../../sql-reference/data-types/int-uint.md)) — 发起查询的接口. 可能的值: - 1 — TCP. diff --git a/docs/zh/operations/system-tables/query_thread_log.md b/docs/zh/operations/system-tables/query_thread_log.md index 8a41c1501a6..c4b7e2f1043 100644 --- a/docs/zh/operations/system-tables/query_thread_log.md +++ b/docs/zh/operations/system-tables/query_thread_log.md @@ -36,11 +36,11 @@ ClickHouse不会自动从表中删除数据。 欲了解更多详情,请参照 - 0 — 由其他查询发起的分布式查询。 - `user` ([字符串](../../sql-reference/data-types/string.md)) — 发起查询的用户名。 - `query_id` ([字符串](../../sql-reference/data-types/string.md)) — 查询的ID。 -- `address` ([IPv6](../../sql-reference/data-types/domains/ipv6.md)) — 发起查询的IP地址。 +- `address` ([IPv6](../../sql-reference/data-types/ipv6.md)) — 发起查询的IP地址。 - `port` ([UInt16](../../sql-reference/data-types/int-uint.md#uint-ranges)) — 发起查询的端口。 - `initial_user` ([字符串](../../sql-reference/data-types/string.md)) — 首次发起查询的用户名(对于分布式查询)。 - `initial_query_id` ([字符串](../../sql-reference/data-types/string.md)) — 首次发起查询的ID(对于分布式查询)。 -- `initial_address` ([IPv6](../../sql-reference/data-types/domains/ipv6.md)) — 发起该查询的父查询IP地址。 +- `initial_address` ([IPv6](../../sql-reference/data-types/ipv6.md)) — 发起该查询的父查询IP地址。 - `initial_port` ([UInt16](../../sql-reference/data-types/int-uint.md#uint-ranges)) — 发起该查询的父查询端口。 - `interface` ([UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges)) — 发起查询的界面,可能的值: - 1 — TCP. diff --git a/docs/zh/operations/system-tables/zookeeper_log.md b/docs/zh/operations/system-tables/zookeeper_log.md index 59dcdaecdc1..ebc51a2e79d 100644 --- a/docs/zh/operations/system-tables/zookeeper_log.md +++ b/docs/zh/operations/system-tables/zookeeper_log.md @@ -15,7 +15,7 @@ slug: /zh/operations/system-tables/zookeeper_log - `Finalize` — 连接丢失, 未收到响应. - `event_date` ([Date](../../sql-reference/data-types/date.md)) — 事件发生的日期. - `event_time` ([DateTime64](../../sql-reference/data-types/datetime64.md)) — 事件发生的日期和时间. -- `address` ([IPv6](../../sql-reference/data-types/domains/ipv6.md)) — 用于发出请求的 ZooKeeper 服务器的 IP 地址. +- `address` ([IPv6](../../sql-reference/data-types/ipv6.md)) — 用于发出请求的 ZooKeeper 服务器的 IP 地址. - `port` ([UInt16](../../sql-reference/data-types/int-uint.md)) — 用于发出请求的 ZooKeeper 服务器的端口. - `session_id` ([Int64](../../sql-reference/data-types/int-uint.md)) — ZooKeeper 服务器为每个连接设置的会话 ID. - `xid` ([Int32](../../sql-reference/data-types/int-uint.md)) — 会话中请求的 ID. 这通常是一个连续的请求编号. 请求行和配对的 `response`/`finalize` 行相同. diff --git a/docs/zh/sql-reference/data-types/domains/ipv4.md b/docs/zh/sql-reference/data-types/ipv4.md similarity index 98% rename from docs/zh/sql-reference/data-types/domains/ipv4.md rename to docs/zh/sql-reference/data-types/ipv4.md index 69e17b2f617..b89af974b87 100644 --- a/docs/zh/sql-reference/data-types/domains/ipv4.md +++ b/docs/zh/sql-reference/data-types/ipv4.md @@ -1,5 +1,5 @@ --- -slug: /zh/sql-reference/data-types/domains/ipv4 +slug: /zh/sql-reference/data-types/ipv4 --- ## IPv4 {#ipv4} diff --git a/docs/zh/sql-reference/data-types/domains/ipv6.md b/docs/zh/sql-reference/data-types/ipv6.md similarity index 98% rename from docs/zh/sql-reference/data-types/domains/ipv6.md rename to docs/zh/sql-reference/data-types/ipv6.md index 9dd88692c37..3896bb873d8 100644 --- a/docs/zh/sql-reference/data-types/domains/ipv6.md +++ b/docs/zh/sql-reference/data-types/ipv6.md @@ -1,5 +1,5 @@ --- -slug: /zh/sql-reference/data-types/domains/ipv6 +slug: /zh/sql-reference/data-types/ipv6 --- ## IPv6 {#ipv6} From 2e187e0a0eae7f0109c6af30bd6baad0e75c9b71 Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Sun, 4 Jun 2023 20:12:35 -0300 Subject: [PATCH 0397/1072] try to fix redirect --- docs/redirects.txt | 2 -- docs/ru/sql-reference/data-types/ipv4.md | 27 +++--------------------- 2 files changed, 3 insertions(+), 26 deletions(-) diff --git a/docs/redirects.txt b/docs/redirects.txt index 98d6f6b8f7c..ddfc66aa48b 100644 --- a/docs/redirects.txt +++ b/docs/redirects.txt @@ -306,9 +306,7 @@ sql_reference/data_types/datetime64.md sql-reference/data-types/datetime64.md sql_reference/data_types/decimal.md sql-reference/data-types/decimal.md sql_reference/data_types/domains/index.md sql-reference/data-types/domains/index.md sql_reference/data_types/domains/ipv4.md sql-reference/data-types/ipv4.md -sql_reference/data-types/domains/ipv4.md sql-reference/data-types/ipv4.md sql_reference/data_types/domains/ipv6.md sql-reference/data-types/ipv6.md -sql_reference/data-types/domains/ipv6.md sql-reference/data-types/ipv6.md sql_reference/data_types/domains/overview.md sql-reference/data-types/domains/overview.md sql_reference/data_types/enum.md sql-reference/data-types/enum.md sql_reference/data_types/fixedstring.md sql-reference/data-types/fixedstring.md diff --git a/docs/ru/sql-reference/data-types/ipv4.md b/docs/ru/sql-reference/data-types/ipv4.md index 8d308785eea..5cb977c64c9 100644 --- a/docs/ru/sql-reference/data-types/ipv4.md +++ b/docs/ru/sql-reference/data-types/ipv4.md @@ -6,7 +6,7 @@ sidebar_label: IPv4 ## IPv4 {#ipv4} -`IPv4` — это домен, базирующийся на типе данных `UInt32` предназначенный для хранения адресов IPv4. Он обеспечивает компактное хранение данных с удобным для человека форматом ввода-вывода, и явно отображаемым типом данных в структуре таблицы. +IPv4-адреса. Хранится в 4 байтах как UInt32. ### Применение {#primenenie} @@ -57,27 +57,6 @@ SELECT toTypeName(from), hex(from) FROM hits LIMIT 1; └──────────────────┴───────────┘ ``` -Значения с доменным типом данных не преобразуются неявно в другие типы данных, кроме `UInt32`. -Если необходимо преобразовать значение типа `IPv4` в строку, то это необходимо делать явно с помощью функции `IPv4NumToString()`: +**См. также** -``` sql -SELECT toTypeName(s), IPv4NumToString(from) AS s FROM hits LIMIT 1; -``` - -``` text -┌─toTypeName(IPv4NumToString(from))─┬─s──────────────┐ -│ String │ 183.247.232.58 │ -└───────────────────────────────────┴────────────────┘ -``` - -Или приводить к типу данных `UInt32`: - -``` sql -SELECT toTypeName(i), CAST(from AS UInt32) AS i FROM hits LIMIT 1; -``` - -``` text -┌─toTypeName(CAST(from, 'UInt32'))─┬──────────i─┐ -│ UInt32 │ 3086477370 │ -└──────────────────────────────────┴────────────┘ -``` +- [Functions for Working with IPv4 and IPv6 Addresses](../functions/ip-address-functions.md) From e5c95add52ed86c56249fe85d8f7c02132736ae3 Mon Sep 17 00:00:00 2001 From: auxten Date: Mon, 5 Jun 2023 08:43:55 +0800 Subject: [PATCH 0398/1072] use old_size Co-authored-by: Alexey Milovidov --- src/IO/WriteBufferFromVector.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/IO/WriteBufferFromVector.h b/src/IO/WriteBufferFromVector.h index c793a34b406..a2ecc34f1ab 100644 --- a/src/IO/WriteBufferFromVector.h +++ b/src/IO/WriteBufferFromVector.h @@ -86,7 +86,7 @@ private: size_t old_size = vector.size(); /// pos may not be equal to vector.data() + old_size, because WriteBuffer::next() can be used to flush data size_t pos_offset = pos - reinterpret_cast(vector.data()); - if (pos_offset == vector.size()) + if (pos_offset == old_size) { vector.resize(old_size * size_multiplier); } From 4234c4f36addd2607ecc16131ec67ef1089d10ee Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 5 Jun 2023 02:51:11 +0200 Subject: [PATCH 0399/1072] Remove flaky test --- tests/integration/test_merge_tree_s3/test.py | 25 -------------------- 1 file changed, 25 deletions(-) diff --git a/tests/integration/test_merge_tree_s3/test.py b/tests/integration/test_merge_tree_s3/test.py index 7730bfcf7b2..2ccd517923a 100644 --- a/tests/integration/test_merge_tree_s3/test.py +++ b/tests/integration/test_merge_tree_s3/test.py @@ -739,31 +739,6 @@ def test_cache_with_full_disk_space(cluster, node_name): check_no_objects_after_drop(cluster, node_name=node_name) -@pytest.mark.parametrize("node_name", ["node"]) -def test_store_cleanup_disk_s3(cluster, node_name): - node = cluster.instances[node_name] - node.query("DROP TABLE IF EXISTS s3_test SYNC") - node.query( - "CREATE TABLE s3_test UUID '00000000-1000-4000-8000-000000000001' (n UInt64) Engine=MergeTree() ORDER BY n SETTINGS storage_policy='s3';" - ) - node.query("INSERT INTO s3_test SELECT 1") - - node.stop_clickhouse(kill=True) - path_to_data = "/var/lib/clickhouse/" - node.exec_in_container(["rm", f"{path_to_data}/metadata/default/s3_test.sql"]) - node.start_clickhouse() - - node.wait_for_log_line( - "Removing unused directory", timeout=90, look_behind_lines=1000 - ) - node.wait_for_log_line("directories from store") - node.query( - "CREATE TABLE s3_test UUID '00000000-1000-4000-8000-000000000001' (n UInt64) Engine=MergeTree() ORDER BY n SETTINGS storage_policy='s3';" - ) - node.query("INSERT INTO s3_test SELECT 1") - check_no_objects_after_drop(cluster) - - @pytest.mark.parametrize("node_name", ["node"]) def test_cache_setting_compatibility(cluster, node_name): node = cluster.instances[node_name] From 47379ac03965f4834bf6aaa00ce777dec731a3c9 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 5 Jun 2023 03:58:42 +0300 Subject: [PATCH 0400/1072] Update build.sh --- docker/packager/binary/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/packager/binary/build.sh b/docker/packager/binary/build.sh index ee1011a9cd5..c0803c74147 100755 --- a/docker/packager/binary/build.sh +++ b/docker/packager/binary/build.sh @@ -11,7 +11,7 @@ ccache_status () { [ -O /build ] || git config --global --add safe.directory /build -if [ "$EXTRACT_TOOLCHAIN_DARWIN" = "1" ];then +if [ "$EXTRACT_TOOLCHAIN_DARWIN" = "1" ]; then mkdir -p /build/cmake/toolchain/darwin-x86_64 tar xJf /MacOSX11.0.sdk.tar.xz -C /build/cmake/toolchain/darwin-x86_64 --strip-components=1 ln -sf darwin-x86_64 /build/cmake/toolchain/darwin-aarch64 From 5fc8838b04d37d26207fff488bd60127f9eedaa8 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 5 Jun 2023 04:58:29 +0300 Subject: [PATCH 0401/1072] Update KeyCondition.cpp --- src/Storages/MergeTree/KeyCondition.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/Storages/MergeTree/KeyCondition.cpp b/src/Storages/MergeTree/KeyCondition.cpp index 923e5237420..16bd555092e 100644 --- a/src/Storages/MergeTree/KeyCondition.cpp +++ b/src/Storages/MergeTree/KeyCondition.cpp @@ -943,6 +943,13 @@ static FieldRef applyFunction(const FunctionBasePtr & func, const DataTypePtr & return {field.columns, field.row_idx, result_idx}; } +/** When table's key has expression with these functions from a column, + * and when a column in a query is compared with a constant, such as: + * CREATE TABLE (x String) ORDER BY toDate(x) + * SELECT ... WHERE x LIKE 'Hello%' + * we want to apply the function to the constant for index analysis, + * but should modify it to pass on unparseable values. + */ static std::set date_time_parsing_functions = { "toDate", "toDate32", From 3c5bd78856d1848cd457a30c2d8320b3f65a41d9 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 5 Jun 2023 06:13:39 +0200 Subject: [PATCH 0402/1072] Fix typo --- src/Storages/MergeTree/KeyCondition.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/KeyCondition.cpp b/src/Storages/MergeTree/KeyCondition.cpp index ab5820be90a..02ef7e6bebd 100644 --- a/src/Storages/MergeTree/KeyCondition.cpp +++ b/src/Storages/MergeTree/KeyCondition.cpp @@ -948,7 +948,7 @@ static FieldRef applyFunction(const FunctionBasePtr & func, const DataTypePtr & * CREATE TABLE (x String) ORDER BY toDate(x) * SELECT ... WHERE x LIKE 'Hello%' * we want to apply the function to the constant for index analysis, - * but should modify it to pass on unparseable values. + * but should modify it to pass on unparsable values. */ static std::set date_time_parsing_functions = { "toDate", From c0f162c5b67b112691b8ef805bf2a56060441a0b Mon Sep 17 00:00:00 2001 From: johanngan Date: Fri, 26 May 2023 15:56:40 -0500 Subject: [PATCH 0403/1072] Add dictGetAll function for RegExpTreeDictionary This function outputs an array of attribute values from all regexp nodes that matched in a regexp tree dictionary. An optional final argument can be passed to limit the array size. --- docs/en/sql-reference/dictionaries/index.md | 63 ++++++++- .../functions/ext-dict-functions.md | 78 +++++++++++ src/Dictionaries/IDictionary.h | 45 +++++++ src/Dictionaries/RegExpTreeDictionary.cpp | 122 +++++++++++++++--- src/Dictionaries/RegExpTreeDictionary.h | 49 ++++++- .../FunctionsExternalDictionaries.cpp | 15 +++ src/Functions/FunctionsExternalDictionaries.h | 77 ++++++++--- ...04_regexp_dictionary_yaml_source.reference | 6 + .../02504_regexp_dictionary_yaml_source.sh | 48 +++++++ 9 files changed, 459 insertions(+), 44 deletions(-) diff --git a/docs/en/sql-reference/dictionaries/index.md b/docs/en/sql-reference/dictionaries/index.md index 43e9300c1ae..6c3d80683db 100644 --- a/docs/en/sql-reference/dictionaries/index.md +++ b/docs/en/sql-reference/dictionaries/index.md @@ -2280,7 +2280,7 @@ This config consists of a list of regular expression tree nodes. Each node has t - The value of an attribute may contain **back references**, referring to capture groups of the matched regular expression. In the example, the value of attribute `version` in the first node consists of a back-reference `\1` to capture group `(\d+[\.\d]*)` in the regular expression. Back-reference numbers range from 1 to 9 and are written as `$1` or `\1` (for number 1). The back reference is replaced by the matched capture group during query execution. - **child nodes**: a list of children of a regexp tree node, each of which has its own attributes and (potentially) children nodes. String matching proceeds in a depth-first fashion. If a string matches a regexp node, the dictionary checks if it also matches the nodes' child nodes. If that is the case, the attributes of the deepest matching node are assigned. Attributes of a child node overwrite equally named attributes of parent nodes. The name of child nodes in YAML files can be arbitrary, e.g. `versions` in above example. -Regexp tree dictionaries only allow access using the functions `dictGet` and `dictGetOrDefault`. +Regexp tree dictionaries only allow access using the functions `dictGet`, `dictGetOrDefault`, and `dictGetAll`. Example: @@ -2300,6 +2300,67 @@ In this case, we first match the regular expression `\d+/tclwebkit(?:\d+[\.\d]*) With a powerful YAML configure file, we can use a regexp tree dictionaries as a user agent string parser. We support [uap-core](https://github.com/ua-parser/uap-core) and demonstrate how to use it in the functional test [02504_regexp_dictionary_ua_parser](https://github.com/ClickHouse/ClickHouse/blob/master/tests/queries/0_stateless/02504_regexp_dictionary_ua_parser.sh) +#### Collecting Attribute Values + +Sometimes it is useful to return values from multiple regular expressions that matched, rather than just the value of a leaf node. In these cases, the specialized [`dictGetAll`](../../sql-reference/functions/ext-dict-functions.md#dictgetall) function can be used. If a node has an attribute value of type `T`, `dictGetAll` will return an `Array(T)` containing zero or more values. + +By default, the number of matches returned per key is unbounded. A bound can be passed as an optional fourth argument to `dictGetAll`. The array is populated in _topological order_, meaning that child nodes come before parent nodes, and sibling nodes follow the ordering in the source. + +Example: + +```sql +CREATE DICTIONARY regexp_dict +( + regexp String, + tag String, + topological_index Int64, + captured Nullable(String), + parent String +) +PRIMARY KEY(regexp) +SOURCE(YAMLRegExpTree(PATH '/var/lib/clickhouse/user_files/regexp_tree.yaml')) +LAYOUT(regexp_tree) +LIFETIME(0) +``` + +```yaml +# /var/lib/clickhouse/user_files/regexp_tree.yaml +- regexp: 'clickhouse\.com' + tag: 'ClickHouse' + topological_index: 1 + paths: + - regexp: 'clickhouse\.com/docs(.*)' + tag: 'ClickHouse Documentation' + topological_index: 0 + captured: '\1' + parent: 'ClickHouse' + +- regexp: '/docs(/|$)' + tag: 'Documentation' + topological_index: 2 + +- regexp: 'github.com' + tag: 'GitHub' + topological_index: 3 + captured: 'NULL' +``` + +```sql +CREATE TABLE urls (url String) ENGINE=MergeTree ORDER BY url; +INSERT INTO urls VALUES ('clickhouse.com'), ('clickhouse.com/docs/en'), ('github.com/clickhouse/tree/master/docs'); +SELECT url, dictGetAll('regexp_dict', ('tag', 'topological_index', 'captured', 'parent'), url, 2) FROM urls; +``` + +Result: + +```text +┌─url────────────────────────────────────┬─dictGetAll('regexp_dict', ('tag', 'topological_index', 'captured', 'parent'), url, 2)─┐ +│ clickhouse.com │ (['ClickHouse'],[1],[],[]) │ +│ clickhouse.com/docs/en │ (['ClickHouse Documentation','ClickHouse'],[0,1],['/en'],['ClickHouse']) │ +│ github.com/clickhouse/tree/master/docs │ (['Documentation','GitHub'],[2,3],[NULL],[]) │ +└────────────────────────────────────────┴───────────────────────────────────────────────────────────────────────────────────────┘ +``` + ### Use Regular Expression Tree Dictionary in ClickHouse Cloud Above used `YAMLRegExpTree` source works in ClickHouse Open Source but not in ClickHouse Cloud. To use regexp tree dictionaries in ClickHouse could, first create a regexp tree dictionary from a YAML file locally in ClickHouse Open Source, then dump this dictionary into a CSV file using the `dictionary` table function and the [INTO OUTFILE](../statements/select/into-outfile.md) clause. diff --git a/docs/en/sql-reference/functions/ext-dict-functions.md b/docs/en/sql-reference/functions/ext-dict-functions.md index 7d8aa2c0390..284d6d80405 100644 --- a/docs/en/sql-reference/functions/ext-dict-functions.md +++ b/docs/en/sql-reference/functions/ext-dict-functions.md @@ -403,6 +403,84 @@ SELECT dictGetDescendants('hierarchy_flat_dictionary', number, 1) FROM system.nu └────────────────────────────────────────────────────────────┘ ``` + +## dictGetAll + +Retrieves the attribute values of all nodes that matched each key in a [regular expression tree dictionary](../../sql-reference/dictionaries/index.md#regexp-tree-dictionary). + +Besides returning values of type `Array(T)` instead of `T`, this function behaves similarly to [`dictGet`](#dictget-dictgetordefault-dictgetornull). + +**Syntax** + +``` sql +dictGetAll('dict_name', attr_names, id_expr[, limit]) +``` + +**Arguments** + +- `dict_name` — Name of the dictionary. [String literal](../../sql-reference/syntax.md#syntax-string-literal). +- `attr_names` — Name of the column of the dictionary, [String literal](../../sql-reference/syntax.md#syntax-string-literal), or tuple of column names, [Tuple](../../sql-reference/data-types/tuple.md)([String literal](../../sql-reference/syntax.md#syntax-string-literal)). +- `id_expr` — Key value. [Expression](../../sql-reference/syntax.md#syntax-expressions) returning array of dictionary key-type value or [Tuple](../../sql-reference/data-types/tuple.md)-type value depending on the dictionary configuration. +- `limit` - Maximum length for each value array returned. When truncating, child nodes are given precedence over parent nodes, and otherwise the defined list order for the regexp tree dictionary is respected. If unspecified, array length is unlimited. + +**Returned value** + +- If ClickHouse parses the attribute successfully in the attribute’s data type as defined in the dictionary, returns an array of dictionary attribute values that correspond to `id_expr` for each attribute specified by `attr_names`. + +- If there is no key corresponding to `id_expr` in the dictionary, then an empty array is returned. + +ClickHouse throws an exception if it cannot parse the value of the attribute or the value does not match the attribute data type. + +**Example** + +Consider the following regexp tree dictionary: + +```sql +CREATE DICTIONARY regexp_dict +( + regexp String, + tag String +) +PRIMARY KEY(regexp) +SOURCE(YAMLRegExpTree(PATH '/var/lib/clickhouse/user_files/regexp_tree.yaml')) +LAYOUT(regexp_tree) +... +``` + +```yaml +# /var/lib/clickhouse/user_files/regexp_tree.yaml +- regexp: 'foo' + tag: 'foo_attr' +- regexp: 'bar' + tag: 'bar_attr' +- regexp: 'baz' + tag: 'baz_attr' +``` + +Get all matching values: + +```sql +SELECT dictGetAll('regexp_dict', 'tag', 'foobarbaz'); +``` + +```text +┌─dictGetAll('regexp_dict', 'tag', 'foobarbaz')─┐ +│ ['foo_attr','bar_attr','baz_attr'] │ +└───────────────────────────────────────────────┘ +``` + +Get up to 2 matching values: + +```sql +SELECT dictGetAll('regexp_dict', 'tag', 'foobarbaz', 2); +``` + +```text +┌─dictGetAll('regexp_dict', 'tag', 'foobarbaz', 2)─┐ +│ ['foo_attr','bar_attr'] │ +└──────────────────────────────────────────────────┘ +``` + ## Other Functions ClickHouse supports specialized functions that convert dictionary attribute values to a specific data type regardless of the dictionary configuration. diff --git a/src/Dictionaries/IDictionary.h b/src/Dictionaries/IDictionary.h index ee18e8b9a7e..f1834b4b129 100644 --- a/src/Dictionaries/IDictionary.h +++ b/src/Dictionaries/IDictionary.h @@ -207,6 +207,51 @@ public: return result; } + /** + * Analogous to getColumn, but for dictGetAll + */ + virtual ColumnPtr getColumnAllValues( + const std::string & attribute_name [[maybe_unused]], + const DataTypePtr & result_type [[maybe_unused]], + const Columns & key_columns [[maybe_unused]], + const DataTypes & key_types [[maybe_unused]], + const ColumnPtr & default_values_column [[maybe_unused]], + size_t limit [[maybe_unused]]) const + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, + "Method getColumnAllValues is not supported for {} dictionary.", + getDictionaryID().getNameForLogs()); + } + + /** + * Analogous to getColumns, but for dictGetAll + */ + virtual Columns getColumnsAllValues( + const Strings & attribute_names, + const DataTypes & result_types, + const Columns & key_columns, + const DataTypes & key_types, + const Columns & default_values_columns, + size_t limit) const + { + size_t attribute_names_size = attribute_names.size(); + + Columns result; + result.reserve(attribute_names_size); + + for (size_t i = 0; i < attribute_names_size; ++i) + { + const auto & attribute_name = attribute_names[i]; + const auto & result_type = result_types[i]; + const auto & default_values_column = default_values_columns[i]; + + result.emplace_back(getColumnAllValues( + attribute_name, result_type, key_columns, key_types, default_values_column, limit)); + } + + return result; + } + /** Subclass must validate key columns and key types and return ColumnUInt8 that * is bitmask representation of is key in dictionary or not. * If key is in dictionary then value of associated row will be 1, otherwise 0. diff --git a/src/Dictionaries/RegExpTreeDictionary.cpp b/src/Dictionaries/RegExpTreeDictionary.cpp index 9841cadcdca..8d0af9b0abf 100644 --- a/src/Dictionaries/RegExpTreeDictionary.cpp +++ b/src/Dictionaries/RegExpTreeDictionary.cpp @@ -70,7 +70,7 @@ namespace explicit StringPiece(int ref_) : ref_num(ref_) {} }; - Field parseStringToField(const String & raw, DataTypePtr data_type) + Field parseStringToField(const String & raw, const DataTypePtr data_type) try { ReadBufferFromString buffer(raw); @@ -419,6 +419,65 @@ RegExpTreeDictionary::RegExpTreeDictionary( calculateBytesAllocated(); } +// Thin wrapper around unordered_map that manages the collection of attribute values subject to the +// behavior specified by collect_values_limit +class RegExpTreeDictionary::AttributeCollector : public std::unordered_map +{ +private: + std::optional collect_values_limit; // std::nullopt means single-value mode, i.e. don't collect + size_t n_full_attributes; + +public: + explicit AttributeCollector(std::optional collect_values_limit_) + : collect_values_limit(collect_values_limit_), n_full_attributes(0) + { + } + + constexpr bool collecting() const { return collect_values_limit != std::nullopt; } + + // Add a name-value pair to the collection if there's space + void add(const String & attr_name, Field field) + { + if (collect_values_limit) + { + if (!this->contains(attr_name)) + (*this)[attr_name] = Array(); + + Array & values = (*this)[attr_name].safeGet(); + if (values.size() < *collect_values_limit) + { + values.push_back(std::move(field)); + if (values.size() == *collect_values_limit) + n_full_attributes++; + } + } + else if (!this->contains(attr_name)) + { + (*this)[attr_name] = std::move(field); + n_full_attributes++; + } + } + + // Checks if no more values can be added for a given attribute + inline bool full(const String & attr_name) const + { + if (collect_values_limit) + { + auto it = this->find(attr_name); + if (it == this->end()) + return false; + return it->second.safeGet().size() >= *collect_values_limit; + } + else + { + return this->contains(attr_name); + } + } + + // Returns the number of full attributes + inline size_t attributesFull() const { return n_full_attributes; } +}; + std::pair processBackRefs(const String & data, const re2_st::RE2 & searcher, const std::vector & pieces) { re2_st::StringPiece haystack(data.data(), data.size()); @@ -442,7 +501,7 @@ std::pair processBackRefs(const String & data, const re2_st::RE2 & // The return value means whether we finish collecting. bool RegExpTreeDictionary::setAttributes( UInt64 id, - std::unordered_map & attributes_to_set, + AttributeCollector & attributes_to_set, const String & data, std::unordered_set & visited_nodes, const std::unordered_map & attributes, @@ -451,34 +510,43 @@ bool RegExpTreeDictionary::setAttributes( { if (visited_nodes.contains(id)) - return attributes_to_set.size() == attributes.size(); + return attributes_to_set.attributesFull() == attributes.size(); visited_nodes.emplace(id); const auto & node_attributes = regex_nodes.at(id)->attributes; for (const auto & [name_, value] : node_attributes) { - if (!attributes.contains(name_) || attributes_to_set.contains(name_)) + if (!attributes.contains(name_) || attributes_to_set.full(name_)) continue; + if (value.containsBackRefs()) { auto [updated_str, use_default] = processBackRefs(data, regex_nodes.at(id)->searcher, value.pieces); if (use_default) { - DefaultValueProvider default_value(attributes.at(name_).null_value, defaults.at(name_)); - attributes_to_set[name_] = default_value.getDefaultValue(key_index); + // Back-ref processing failed. + // - If not collecting values, set the default value immediately while we're still on this node. + // Otherwise, a value from a different node could take its place before we set it to the default value post-walk. + // - If collecting values, don't add anything. If we find no other matches for this attribute, + // then we'll set its value to the default Array value later. + if (!attributes_to_set.collecting()) + { + DefaultValueProvider default_value(attributes.at(name_).null_value, defaults.at(name_)); + attributes_to_set.add(name_, default_value.getDefaultValue(key_index)); + } } else - attributes_to_set[name_] = parseStringToField(updated_str, attributes.at(name_).type); + attributes_to_set.add(name_, parseStringToField(updated_str, attributes.at(name_).type)); } else - attributes_to_set[name_] = value.field; + attributes_to_set.add(name_, value.field); } auto parent_id = regex_nodes.at(id)->parent_id; if (parent_id > 0) setAttributes(parent_id, attributes_to_set, data, visited_nodes, attributes, defaults, key_index); - /// if all the attributes have set, the walking through can be stopped. - return attributes_to_set.size() == attributes.size(); + /// if all attributes are full, we can stop walking the tree + return attributes_to_set.attributesFull() == attributes.size(); } /// a temp struct to store all the matched result. @@ -550,7 +618,8 @@ std::unordered_map RegExpTreeDictionary::match( const ColumnString::Chars & keys_data, const ColumnString::Offsets & keys_offsets, const std::unordered_map & attributes, - const std::unordered_map & defaults) const + const std::unordered_map & defaults, + std::optional collect_values_limit) const { #if USE_VECTORSCAN @@ -573,7 +642,7 @@ std::unordered_map RegExpTreeDictionary::match( /// initialize columns for (const auto & [name_, attr] : attributes) { - auto col_ptr = attr.type->createColumn(); + auto col_ptr = (collect_values_limit ? std::make_shared(attr.type) : attr.type)->createColumn(); col_ptr->reserve(keys_offsets.size()); columns[name_] = std::move(col_ptr); } @@ -630,11 +699,11 @@ std::unordered_map RegExpTreeDictionary::match( match_result.sort(); /// Walk through the regex tree util all attributes are set; - std::unordered_map attributes_to_set; + AttributeCollector attributes_to_set{collect_values_limit}; std::unordered_set visited_nodes; /// Some node matches but its parents cannot match. In this case we must regard this node unmatched. - auto is_invalid = [&](UInt64 id) + auto is_valid = [&](UInt64 id) { while (id) { @@ -650,7 +719,7 @@ std::unordered_map RegExpTreeDictionary::match( for (auto item : match_result.matched_idx_sorted_list) { UInt64 id = item.second; - if (!is_invalid(id)) + if (!is_valid(id)) continue; if (visited_nodes.contains(id)) continue; @@ -663,7 +732,8 @@ std::unordered_map RegExpTreeDictionary::match( if (attributes_to_set.contains(name_)) continue; - DefaultValueProvider default_value(attr.null_value, defaults.at(name_)); + DefaultValueProvider default_value( + collect_values_limit ? DataTypeArray(attr.type).getDefault() : attr.null_value, defaults.at(name_)); columns[name_]->insert(default_value.getDefaultValue(key_idx)); } @@ -727,12 +797,13 @@ Pipe RegExpTreeDictionary::read(const Names & , size_t max_block_size, size_t) c return Pipe(std::make_shared(std::move(result))); } -Columns RegExpTreeDictionary::getColumns( +Columns RegExpTreeDictionary::getColumnsImpl( const Strings & attribute_names, const DataTypes & result_types, const Columns & key_columns, const DataTypes & key_types, - const Columns & default_values_columns) const + const Columns & default_values_columns, + std::optional collect_values_limit) const { /// valid check if (key_columns.size() != 1) @@ -746,7 +817,17 @@ Columns RegExpTreeDictionary::getColumns( for (size_t i = 0; i < attribute_names.size(); i++) { - const auto & attribute = structure.getAttribute(attribute_names[i], result_types[i]); + DataTypePtr attribute_type = result_types[i]; + if (collect_values_limit) + { + if (!WhichDataType(attribute_type).isArray()) + throw Exception( + ErrorCodes::LOGICAL_ERROR, "Expected Array result type for attribute `{}`, got `{}`", + attribute_names[i], + attribute_type->getName()); + attribute_type = assert_cast(*attribute_type).getNestedType(); + } + const auto & attribute = structure.getAttribute(attribute_names[i], attribute_type); attributes.emplace(attribute.name, attribute); defaults[attribute.name] = default_values_columns[i]; } @@ -757,7 +838,8 @@ Columns RegExpTreeDictionary::getColumns( key_column->getChars(), key_column->getOffsets(), attributes, - defaults); + defaults, + collect_values_limit); Columns result; for (const String & name_ : attribute_names) diff --git a/src/Dictionaries/RegExpTreeDictionary.h b/src/Dictionaries/RegExpTreeDictionary.h index 683588e688f..30966184eb6 100644 --- a/src/Dictionaries/RegExpTreeDictionary.h +++ b/src/Dictionaries/RegExpTreeDictionary.h @@ -101,16 +101,50 @@ public: const Columns & key_columns, const DataTypes & key_types, const ColumnPtr & default_values_column) const override - { - return getColumns(Strings({attribute_name}), DataTypes({result_type}), key_columns, key_types, Columns({default_values_column}))[0]; - } + { + return getColumns(Strings({attribute_name}), DataTypes({result_type}), key_columns, key_types, Columns({default_values_column}))[0]; + } Columns getColumns( const Strings & attribute_names, const DataTypes & result_types, const Columns & key_columns, const DataTypes & key_types, - const Columns & default_values_columns) const override; + const Columns & default_values_columns) const override + { + return getColumnsImpl(attribute_names, result_types, key_columns, key_types, default_values_columns, std::nullopt); + } + + ColumnPtr getColumnAllValues( + const std::string & attribute_name, + const DataTypePtr & result_type, + const Columns & key_columns, + const DataTypes & key_types, + const ColumnPtr & default_values_column, + size_t limit) const override + { + return getColumnsAllValues( + Strings({attribute_name}), DataTypes({result_type}), key_columns, key_types, Columns({default_values_column}), limit)[0]; + } + + Columns getColumnsAllValues( + const Strings & attribute_names, + const DataTypes & result_types, + const Columns & key_columns, + const DataTypes & key_types, + const Columns & default_values_columns, + size_t limit) const override + { + return getColumnsImpl(attribute_names, result_types, key_columns, key_types, default_values_columns, limit); + } + + Columns getColumnsImpl( + const Strings & attribute_names, + const DataTypes & result_types, + const Columns & key_columns, + const DataTypes & key_types, + const Columns & default_values_columns, + std::optional collect_values_limit) const; private: const DictionaryStructure structure; @@ -137,11 +171,14 @@ private: const ColumnString::Chars & keys_data, const ColumnString::Offsets & keys_offsets, const std::unordered_map & attributes, - const std::unordered_map & defaults) const; + const std::unordered_map & defaults, + std::optional collect_values_limit) const; + + class AttributeCollector; bool setAttributes( UInt64 id, - std::unordered_map & attributes_to_set, + AttributeCollector & attributes_to_set, const String & data, std::unordered_set & visited_nodes, const std::unordered_map & attributes, diff --git a/src/Functions/FunctionsExternalDictionaries.cpp b/src/Functions/FunctionsExternalDictionaries.cpp index 70b1e3cc861..9fa08c82d41 100644 --- a/src/Functions/FunctionsExternalDictionaries.cpp +++ b/src/Functions/FunctionsExternalDictionaries.cpp @@ -45,11 +45,26 @@ Accepts 3 parameters: Returned value: value of the dictionary attribute parsed in the attribute’s data type if key is found, otherwise NULL. Throws an exception if cannot parse the value of the attribute or the value does not match the attribute data type. +)" }; + + constexpr auto dict_get_all_description { R"( +Retrieves all values from a dictionary corresponding to the given key values. + +Accepts 3 or 4 parameters: +-- name of the dictionary; +-- name of the column of the dictionary or tuple of column names; +-- key value - expression returning dictionary key-type value or tuple-type value - depending on the dictionary configuration; +-- [optional] maximum number of values to return for each attribute; + +Returned value: array of dictionary attribute values parsed in the attribute's data type if key is found, otherwise empty array. + +Throws an exception if cannot parse the value of the attribute, the value does not match the attribute data type, or the dictionary doesn't support this function. )" }; factory.registerFunction>(FunctionDocumentation{ .description=fmt::format(dict_get_description, "attribute’s data type") }); factory.registerFunction>(FunctionDocumentation{ .description=fmt::format(dict_get_or_default_description, "attribute’s data type") }); factory.registerFunction(FunctionDocumentation{ .description=dict_get_or_null_description }); + factory.registerFunction>(FunctionDocumentation{ .description=dict_get_all_description }); factory.registerFunction(FunctionDocumentation{ .description=fmt::format(dict_get_description, "UInt8") }); factory.registerFunction(FunctionDocumentation{ .description=fmt::format(dict_get_description, "UInt16") }); diff --git a/src/Functions/FunctionsExternalDictionaries.h b/src/Functions/FunctionsExternalDictionaries.h index 97d85f384bc..e4529ff1765 100644 --- a/src/Functions/FunctionsExternalDictionaries.h +++ b/src/Functions/FunctionsExternalDictionaries.h @@ -296,7 +296,8 @@ private: enum class DictionaryGetFunctionType { get, - getOrDefault + getOrDefault, + getAll }; /// This variant of function derives the result type automatically. @@ -304,7 +305,10 @@ template class FunctionDictGetNoType final : public IFunction { public: - static constexpr auto name = dictionary_get_function_type == DictionaryGetFunctionType::get ? "dictGet" : "dictGetOrDefault"; + // Kind of gross but we need a static field called "name" for FunctionFactory::registerFunction, and this is the easiest way + static constexpr auto name = (dictionary_get_function_type == DictionaryGetFunctionType::get) + ? "dictGet" + : ((dictionary_get_function_type == DictionaryGetFunctionType::getOrDefault) ? "dictGetOrDefault" : "dictGetAll"); static FunctionPtr create(ContextPtr context) { @@ -321,7 +325,13 @@ public: bool useDefaultImplementationForConstants() const final { return true; } bool useDefaultImplementationForNulls() const final { return false; } - ColumnNumbers getArgumentsThatAreAlwaysConstant() const final { return {0, 1}; } + ColumnNumbers getArgumentsThatAreAlwaysConstant() const final + { + if constexpr (dictionary_get_function_type == DictionaryGetFunctionType::getAll) + return {0, 1, 3}; + else + return {0, 1}; + } bool isDeterministic() const override { return false; } @@ -360,6 +370,15 @@ public: } bool key_is_nullable = arguments[2].type->isNullable(); + if (dictionary_get_function_type == DictionaryGetFunctionType::getAll) + { + if (key_is_nullable) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Function {} does not support nullable keys", getName()); + + // Wrap all the attribute types in Array() + for (auto it = attribute_types.begin(); it != attribute_types.end(); ++it) + *it = std::make_shared(*it); + } if (attribute_types.size() > 1) { if (key_is_nullable) @@ -424,6 +443,7 @@ public: } Columns default_cols; + size_t collect_values_limit = std::numeric_limits::max(); if (dictionary_get_function_type == DictionaryGetFunctionType::getOrDefault) { @@ -464,6 +484,19 @@ public: } else { + if (dictionary_get_function_type == DictionaryGetFunctionType::getAll && current_arguments_index < arguments.size()) + { + auto limit_col = arguments[current_arguments_index].column; + if (!limit_col || !isColumnConst(*limit_col)) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type {} of fourth argument of function {}. Expected const unsigned integer.", + arguments[current_arguments_index].type->getName(), + getName()); + + collect_values_limit = limit_col->getUInt(0); + ++current_arguments_index; + } + for (size_t i = 0; i < attribute_names.size(); ++i) default_cols.emplace_back(nullptr); } @@ -549,7 +582,8 @@ public: attribute_type = attribute_types.front(); } - auto result_column = executeDictionaryRequest(dictionary, attribute_names, key_columns, key_types, attribute_type, default_cols); + auto result_column = executeDictionaryRequest( + dictionary, attribute_names, key_columns, key_types, attribute_type, default_cols, collect_values_limit); if (key_is_nullable) result_column = wrapInNullable(result_column, {arguments[2]}, result_type, input_rows_count); @@ -565,7 +599,8 @@ private: const Columns & key_columns, const DataTypes & key_types, const DataTypePtr & result_type, - const Columns & default_cols) const + const Columns & default_cols, + size_t collect_values_limit) const { ColumnPtr result; @@ -573,23 +608,31 @@ private: { const auto & result_tuple_type = assert_cast(*result_type); - Columns result_columns = dictionary->getColumns( - attribute_names, - result_tuple_type.getElements(), - key_columns, - key_types, - default_cols); + Columns result_columns; + if (dictionary_get_function_type == DictionaryGetFunctionType::getAll) + { + result_columns = dictionary->getColumnsAllValues( + attribute_names, result_tuple_type.getElements(), key_columns, key_types, default_cols, collect_values_limit); + } + else + { + result_columns + = dictionary->getColumns(attribute_names, result_tuple_type.getElements(), key_columns, key_types, default_cols); + } result = ColumnTuple::create(std::move(result_columns)); } else { - result = dictionary->getColumn( - attribute_names[0], - result_type, - key_columns, - key_types, - default_cols.front()); + if (dictionary_get_function_type == DictionaryGetFunctionType::getAll) + { + result = dictionary->getColumnAllValues( + attribute_names[0], result_type, key_columns, key_types, default_cols.front(), collect_values_limit); + } + else + { + result = dictionary->getColumn(attribute_names[0], result_type, key_columns, key_types, default_cols.front()); + } } return result; diff --git a/tests/queries/0_stateless/02504_regexp_dictionary_yaml_source.reference b/tests/queries/0_stateless/02504_regexp_dictionary_yaml_source.reference index dfcd170e8f4..437012dd516 100644 --- a/tests/queries/0_stateless/02504_regexp_dictionary_yaml_source.reference +++ b/tests/queries/0_stateless/02504_regexp_dictionary_yaml_source.reference @@ -5,3 +5,9 @@ ('BlackBerry WebKit','10.0') ('BlackBerry WebKit','1.0') (true,'61f0c404-5cb3-11e7-907b-a6006ad3dba0','2023-01-01','2023-01-01 01:01:01',[1,2,3,-1,-2,-3]) +(['ClickHouse'],[1],[],[]) +(['ClickHouse'],[1],[],[]) +(['ClickHouse Documentation','ClickHouse','Documentation'],[0,1,2],['/en'],['ClickHouse']) +(['ClickHouse Documentation','ClickHouse'],[0,1],['/en'],['ClickHouse']) +(['Documentation','GitHub'],[2,3],[NULL],[]) +(['Documentation','GitHub'],[2,3],[NULL],[]) diff --git a/tests/queries/0_stateless/02504_regexp_dictionary_yaml_source.sh b/tests/queries/0_stateless/02504_regexp_dictionary_yaml_source.sh index 1b5a9cdeea4..ac0793460a9 100755 --- a/tests/queries/0_stateless/02504_regexp_dictionary_yaml_source.sh +++ b/tests/queries/0_stateless/02504_regexp_dictionary_yaml_source.sh @@ -128,9 +128,57 @@ LAYOUT(regexp_tree); select dictGet('regexp_dict2', ('col_bool','col_uuid', 'col_date', 'col_datetime', 'col_array'), 'abc'); " +cat > "$yaml" < Date: Sun, 4 Jun 2023 19:48:14 -0500 Subject: [PATCH 0404/1072] Review comments: Use constexpr-if in more places Also add a comment about the apparent lack of type checking on the limit column. --- src/Functions/FunctionsExternalDictionaries.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/Functions/FunctionsExternalDictionaries.h b/src/Functions/FunctionsExternalDictionaries.h index e4529ff1765..db6529da73c 100644 --- a/src/Functions/FunctionsExternalDictionaries.h +++ b/src/Functions/FunctionsExternalDictionaries.h @@ -370,7 +370,7 @@ public: } bool key_is_nullable = arguments[2].type->isNullable(); - if (dictionary_get_function_type == DictionaryGetFunctionType::getAll) + if constexpr (dictionary_get_function_type == DictionaryGetFunctionType::getAll) { if (key_is_nullable) throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Function {} does not support nullable keys", getName()); @@ -487,6 +487,7 @@ public: if (dictionary_get_function_type == DictionaryGetFunctionType::getAll && current_arguments_index < arguments.size()) { auto limit_col = arguments[current_arguments_index].column; + // The getUInt later attempts to cast and throws on a type mismatch, so skip actual type checking here if (!limit_col || !isColumnConst(*limit_col)) throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of fourth argument of function {}. Expected const unsigned integer.", @@ -609,7 +610,7 @@ private: const auto & result_tuple_type = assert_cast(*result_type); Columns result_columns; - if (dictionary_get_function_type == DictionaryGetFunctionType::getAll) + if constexpr (dictionary_get_function_type == DictionaryGetFunctionType::getAll) { result_columns = dictionary->getColumnsAllValues( attribute_names, result_tuple_type.getElements(), key_columns, key_types, default_cols, collect_values_limit); @@ -624,7 +625,7 @@ private: } else { - if (dictionary_get_function_type == DictionaryGetFunctionType::getAll) + if constexpr (dictionary_get_function_type == DictionaryGetFunctionType::getAll) { result = dictionary->getColumnAllValues( attribute_names[0], result_type, key_columns, key_types, default_cols.front(), collect_values_limit); From 5e1c93c9c819a1d5819ab742fe4981199c621462 Mon Sep 17 00:00:00 2001 From: johanngan Date: Sun, 4 Jun 2023 21:09:41 -0500 Subject: [PATCH 0405/1072] Add dictGetAll to spell-check dictionary --- utils/check-style/aspell-ignore/en/aspell-dict.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt index 0455556ae96..d6cef1883f4 100644 --- a/utils/check-style/aspell-ignore/en/aspell-dict.txt +++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt @@ -1342,6 +1342,7 @@ detectLanguageUnknown determinator deterministically dictGet +dictGetAll dictGetChildren dictGetDescendant dictGetHierarchy From f1058d2d9d2201f21882b487499ea4f4212fec0b Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Mon, 5 Jun 2023 09:51:16 +0300 Subject: [PATCH 0406/1072] Revert "Disable skim (Rust library) under memory sanitizer" --- rust/skim/CMakeLists.txt | 5 ----- 1 file changed, 5 deletions(-) diff --git a/rust/skim/CMakeLists.txt b/rust/skim/CMakeLists.txt index c2e406ec12f..1e7a43aba7c 100644 --- a/rust/skim/CMakeLists.txt +++ b/rust/skim/CMakeLists.txt @@ -14,11 +14,6 @@ if (OS_FREEBSD) return() endif() -if (SANITIZE STREQUAL "memory") - message(STATUS "skim is disabled under memory sanitizer, because the interop is not instrumented properly") - return() -endif() - clickhouse_import_crate(MANIFEST_PATH Cargo.toml) # -Wno-dollar-in-identifier-extension: cxx bridge complies names with '$' From c860db0fb77ad247e39dec10b3419ab1cb2b05e3 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Mon, 5 Jun 2023 10:32:46 +0300 Subject: [PATCH 0407/1072] Fixed tests --- src/Functions/if.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/Functions/if.cpp b/src/Functions/if.cpp index 8d43b3a4ca3..65e2212e894 100644 --- a/src/Functions/if.cpp +++ b/src/Functions/if.cpp @@ -1124,6 +1124,9 @@ public: return {}; const ColumnConst * cond_const_col = checkAndGetColumnConst>(arg_cond.column.get()); + if (!cond_const_col) + return {}; + bool condition_value = cond_const_col->getValue(); const ColumnWithTypeAndName & arg_then = arguments[1]; From 4225cab2e8203b45e17188da28c3f6a1a330878c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A1nos=20Benjamin=20Antal?= Date: Mon, 5 Jun 2023 08:34:25 +0000 Subject: [PATCH 0408/1072] Rewrite bugprone shell script command --- tests/queries/0_stateless/02771_system_user_processes.sh | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02771_system_user_processes.sh b/tests/queries/0_stateless/02771_system_user_processes.sh index 910af4be9e2..f0e5b2a6987 100755 --- a/tests/queries/0_stateless/02771_system_user_processes.sh +++ b/tests/queries/0_stateless/02771_system_user_processes.sh @@ -7,7 +7,13 @@ CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) USER_POSTFIX=`random_str 10` USER="test_user_02771_$USER_POSTFIX" -$CLICKHOUSE_CLIENT -q "SHOW USER PROCESSES" &>"${CLICKHOUSE_TMP}/test_output" && echo "SHOW USER PROCESSES query succeeded!" || cat "${CLICKHOUSE_TMP}/test_output" +if $CLICKHOUSE_CLIENT -q "SHOW USER PROCESSES" &>"${CLICKHOUSE_TMP}/test_output" +then + echo "SHOW USER PROCESSES query succeeded!" +else + cat "${CLICKHOUSE_TMP}/test_output" +fi + $CLICKHOUSE_CLIENT -q "DROP USER IF EXISTS $USER" $CLICKHOUSE_CLIENT -q "CREATE USER $USER" $CLICKHOUSE_CLIENT -u "$USER" -q "SELECT * FROM system.numbers LIMIT 1" From 3657ef05fffa722115becb7f7e8937a3a472625d Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 25 May 2023 19:26:41 +0000 Subject: [PATCH 0409/1072] Cosmetics: Fix indentation --- src/Storages/MergeTree/MergeTreeIndexAnnoy.h | 30 +++++++++++--------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.h b/src/Storages/MergeTree/MergeTreeIndexAnnoy.h index 3b1a41eb85d..b349c0567ef 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.h +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.h @@ -14,19 +14,23 @@ namespace DB // mainly for serialization and deserialization of the index namespace ApproximateNearestNeighbour { - using AnnoyIndexThreadedBuildPolicy = ::Annoy::AnnoyIndexMultiThreadedBuildPolicy; - // TODO: Support different metrics. List of available metrics can be taken from here: - // https://github.com/spotify/annoy/blob/master/src/annoymodule.cc#L151-L171 - template - class AnnoyIndex : public ::Annoy::AnnoyIndex - { - using Base = ::Annoy::AnnoyIndex; - public: - explicit AnnoyIndex(const uint64_t dim) : Base::AnnoyIndex(dim) {} - void serialize(WriteBuffer& ostr) const; - void deserialize(ReadBuffer& istr); - uint64_t getNumOfDimensions() const; - }; + +using AnnoyIndexThreadedBuildPolicy = ::Annoy::AnnoyIndexMultiThreadedBuildPolicy; + +// TODO: Support different metrics. List of available metrics can be taken from here: +// https://github.com/spotify/annoy/blob/master/src/annoymodule.cc#L151-L171 +template +class AnnoyIndex : public ::Annoy::AnnoyIndex +{ + using Base = ::Annoy::AnnoyIndex; + +public: + explicit AnnoyIndex(const uint64_t dim) : Base::AnnoyIndex(dim) {} + void serialize(WriteBuffer& ostr) const; + void deserialize(ReadBuffer& istr); + uint64_t getNumOfDimensions() const; +}; + } template From 32756292309182bfa2ddf59213c1628a22651e26 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 25 May 2023 19:31:34 +0000 Subject: [PATCH 0410/1072] Cosmetics: Remove parentheses in single statement if/for/while --- src/Storages/MergeTree/CommonANNIndexes.cpp | 94 ------------------- .../MergeTree/MergeTreeIndexAnnoy.cpp | 31 +----- 2 files changed, 4 insertions(+), 121 deletions(-) diff --git a/src/Storages/MergeTree/CommonANNIndexes.cpp b/src/Storages/MergeTree/CommonANNIndexes.cpp index 4b360e029e5..f0c6f256f73 100644 --- a/src/Storages/MergeTree/CommonANNIndexes.cpp +++ b/src/Storages/MergeTree/CommonANNIndexes.cpp @@ -35,17 +35,11 @@ void extractTargetVectorFromLiteral(ANN::ANNQueryInformation::Embedding & target for (const auto & value : literal.value()) { if (value.tryGet(float_element_of_target_vector)) - { target.emplace_back(float_element_of_target_vector); - } else if (value.tryGet(int_element_of_target_vector)) - { target.emplace_back(static_cast(int_element_of_target_vector)); - } else - { throw Exception(ErrorCodes::INCORRECT_QUERY, "Wrong type of elements in target vector. Only float or int are supported."); - } } } @@ -74,9 +68,7 @@ ANNCondition::ANNCondition(const SelectQueryInfo & query_info, bool ANNCondition::alwaysUnknownOrTrue(String metric_name) const { if (!index_is_useful) - { return true; // Query isn't supported - } // If query is supported, check metrics for match return !(castMetricFromStringToType(metric_name) == query_information->metric); } @@ -85,72 +77,56 @@ float ANNCondition::getComparisonDistanceForWhereQuery() const { if (index_is_useful && query_information.has_value() && query_information->query_type == ANNQueryInformation::Type::Where) - { return query_information->distance; - } throw Exception(ErrorCodes::LOGICAL_ERROR, "Not supported method for this query type"); } UInt64 ANNCondition::getLimit() const { if (index_is_useful && query_information.has_value()) - { return query_information->limit; - } throw Exception(ErrorCodes::LOGICAL_ERROR, "No LIMIT section in query, not supported"); } std::vector ANNCondition::getTargetVector() const { if (index_is_useful && query_information.has_value()) - { return query_information->target; - } throw Exception(ErrorCodes::LOGICAL_ERROR, "Target vector was requested for useless or uninitialized index."); } size_t ANNCondition::getNumOfDimensions() const { if (index_is_useful && query_information.has_value()) - { return query_information->target.size(); - } throw Exception(ErrorCodes::LOGICAL_ERROR, "Number of dimensions was requested for useless or uninitialized index."); } String ANNCondition::getColumnName() const { if (index_is_useful && query_information.has_value()) - { return query_information->column_name; - } throw Exception(ErrorCodes::LOGICAL_ERROR, "Column name was requested for useless or uninitialized index."); } ANNQueryInformation::Metric ANNCondition::getMetricType() const { if (index_is_useful && query_information.has_value()) - { return query_information->metric; - } throw Exception(ErrorCodes::LOGICAL_ERROR, "Metric name was requested for useless or uninitialized index."); } float ANNCondition::getPValueForLpDistance() const { if (index_is_useful && query_information.has_value()) - { return query_information->p_for_lp_dist; - } throw Exception(ErrorCodes::LOGICAL_ERROR, "P from LPDistance was requested for useless or uninitialized index."); } ANNQueryInformation::Type ANNCondition::getQueryType() const { if (index_is_useful && query_information.has_value()) - { return query_information->query_type; - } throw Exception(ErrorCodes::LOGICAL_ERROR, "Query type was requested for useless or uninitialized index."); } @@ -171,24 +147,16 @@ bool ANNCondition::checkQueryStructure(const SelectQueryInfo & query) const auto & select = query.query->as(); if (select.prewhere()) // If query has PREWHERE clause - { traverseAST(select.prewhere(), rpn_prewhere_clause); - } if (select.where()) // If query has WHERE clause - { traverseAST(select.where(), rpn_where_clause); - } if (select.limitLength()) // If query has LIMIT clause - { traverseAtomAST(select.limitLength(), rpn_limit); - } if (select.orderBy()) // If query has ORDERBY clause - { traverseOrderByAST(select.orderBy(), rpn_order_by_clause); - } // Reverse RPNs for conveniences during parsing std::reverse(rpn_prewhere_clause.begin(), rpn_prewhere_clause.end()); @@ -203,29 +171,21 @@ bool ANNCondition::checkQueryStructure(const SelectQueryInfo & query) // Query without a LIMIT clause or with a limit greater than a restriction is not supported if (!limit_is_valid || limit_restriction < limit) - { return false; - } // Search type query in both sections isn't supported if (prewhere_is_valid && where_is_valid) - { return false; - } // Search type should be in WHERE or PREWHERE clause if (prewhere_is_valid || where_is_valid) - { query_information = std::move(prewhere_is_valid ? prewhere_info : where_info); - } if (order_by_is_valid) { // Query with valid where and order by type is not supported if (query_information.has_value()) - { return false; - } query_information = std::move(order_by_info); } @@ -244,17 +204,13 @@ void ANNCondition::traverseAST(const ASTPtr & node, RPN & rpn) const ASTs & children = func->arguments->children; // Traverse children nodes for (const auto& child : children) - { traverseAST(child, rpn); - } } RPNElement element; // Get the data behind node if (!traverseAtomAST(node, element)) - { element.function = RPNElement::FUNCTION_UNKNOWN; - } rpn.emplace_back(std::move(element)); } @@ -273,32 +229,20 @@ bool ANNCondition::traverseAtomAST(const ASTPtr & node, RPNElement & out) function->name == "cosineDistance" || function->name == "dotProduct" || function->name == "LpDistance") - { out.function = RPNElement::FUNCTION_DISTANCE; - } else if (function->name == "tuple") - { out.function = RPNElement::FUNCTION_TUPLE; - } else if (function->name == "array") - { out.function = RPNElement::FUNCTION_ARRAY; - } else if (function->name == "less" || function->name == "greater" || function->name == "lessOrEquals" || function->name == "greaterOrEquals") - { out.function = RPNElement::FUNCTION_COMPARISON; - } else if (function->name == "_CAST") - { out.function = RPNElement::FUNCTION_CAST; - } else - { return false; - } return true; } @@ -378,12 +322,8 @@ bool ANNCondition::tryCastToConstType(const ASTPtr & node, RPNElement & out) void ANNCondition::traverseOrderByAST(const ASTPtr & node, RPN & rpn) { if (const auto * expr_list = node->as()) - { if (const auto * order_by_element = expr_list->children.front()->as()) - { traverseAST(order_by_element->children.front(), rpn); - } - } } // Returns true and stores ANNQueryInformation if the query has valid WHERE clause @@ -395,17 +335,13 @@ bool ANNCondition::matchRPNWhere(RPN & rpn, ANNQueryInformation & expr) // WHERE section must have at least 5 expressions // Operator->Distance(float)->DistanceFunc->Column->Tuple(Array)Func(TargetVector(floats)) if (rpn.size() < 5) - { return false; - } auto iter = rpn.begin(); // Query starts from operator less if (iter->function != RPNElement::FUNCTION_COMPARISON) - { return false; - } const bool greater_case = iter->func_name == "greater" || iter->func_name == "greaterOrEquals"; const bool less_case = iter->func_name == "less" || iter->func_name == "lessOrEquals"; @@ -415,9 +351,7 @@ bool ANNCondition::matchRPNWhere(RPN & rpn, ANNQueryInformation & expr) if (less_case) { if (iter->function != RPNElement::FUNCTION_FLOAT_LITERAL) - { return false; - } expr.distance = getFloatOrIntLiteralOrPanic(iter); if (expr.distance < 0) @@ -427,22 +361,16 @@ bool ANNCondition::matchRPNWhere(RPN & rpn, ANNQueryInformation & expr) } else if (!greater_case) - { return false; - } auto end = rpn.end(); if (!matchMainParts(iter, end, expr)) - { return false; - } if (greater_case) { if (expr.target.size() < 2) - { return false; - } expr.distance = expr.target.back(); if (expr.distance < 0) throw Exception(ErrorCodes::INCORRECT_QUERY, "Distance can't be negative. Got {}", expr.distance); @@ -461,9 +389,7 @@ bool ANNCondition::matchRPNOrderBy(RPN & rpn, ANNQueryInformation & expr) // ORDER BY clause must have at least 3 expressions if (rpn.size() < 3) - { return false; - } auto iter = rpn.begin(); auto end = rpn.end(); @@ -490,9 +416,7 @@ bool ANNCondition::matchMainParts(RPN::iterator & iter, const RPN::iterator & en // Matches DistanceFunc->[Column]->[Tuple(array)Func]->TargetVector(floats)->[Column] if (iter->function != RPNElement::FUNCTION_DISTANCE) - { return false; - } expr.metric = castMetricFromStringToType(iter->func_name); ++iter; @@ -501,9 +425,7 @@ bool ANNCondition::matchMainParts(RPN::iterator & iter, const RPN::iterator & en { if (iter->function != RPNElement::FUNCTION_FLOAT_LITERAL && iter->function != RPNElement::FUNCTION_INT_LITERAL) - { return false; - } expr.p_for_lp_dist = getFloatOrIntLiteralOrPanic(iter); ++iter; } @@ -516,9 +438,7 @@ bool ANNCondition::matchMainParts(RPN::iterator & iter, const RPN::iterator & en } if (iter->function == RPNElement::FUNCTION_TUPLE || iter->function == RPNElement::FUNCTION_ARRAY) - { ++iter; - } if (iter->function == RPNElement::FUNCTION_LITERAL_TUPLE) { @@ -539,9 +459,7 @@ bool ANNCondition::matchMainParts(RPN::iterator & iter, const RPN::iterator & en ++iter; /// Cast should be made to array or tuple if (!iter->func_name.starts_with("Array") && !iter->func_name.starts_with("Tuple")) - { return false; - } ++iter; if (iter->function == RPNElement::FUNCTION_LITERAL_TUPLE) { @@ -554,31 +472,23 @@ bool ANNCondition::matchMainParts(RPN::iterator & iter, const RPN::iterator & en ++iter; } else - { return false; - } } while (iter != end) { if (iter->function == RPNElement::FUNCTION_FLOAT_LITERAL || iter->function == RPNElement::FUNCTION_INT_LITERAL) - { expr.target.emplace_back(getFloatOrIntLiteralOrPanic(iter)); - } else if (iter->function == RPNElement::FUNCTION_IDENTIFIER) { if (identifier_found) - { return false; - } expr.column_name = std::move(iter->identifier.value()); identifier_found = true; } else - { return false; - } ++iter; } @@ -591,13 +501,9 @@ bool ANNCondition::matchMainParts(RPN::iterator & iter, const RPN::iterator & en float ANNCondition::getFloatOrIntLiteralOrPanic(const RPN::iterator& iter) { if (iter->float_literal.has_value()) - { return iter->float_literal.value(); - } if (iter->int_literal.has_value()) - { return static_cast(iter->int_literal.value()); - } throw Exception(ErrorCodes::INCORRECT_QUERY, "Wrong parsed AST in buildRPN\n"); } diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp index f64d6104ac6..0b7e1f29f03 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp @@ -219,17 +219,11 @@ bool MergeTreeIndexConditionAnnoy::alwaysUnknownOrTrue() const std::vector MergeTreeIndexConditionAnnoy::getUsefulRanges(MergeTreeIndexGranulePtr idx_granule) const { if (distance_name == "L2Distance") - { return getUsefulRangesImpl<::Annoy::Euclidean>(idx_granule); - } else if (distance_name == "cosineDistance") - { return getUsefulRangesImpl<::Annoy::Angular>(idx_granule); - } else - { throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown distance name. Must be 'L2Distance' or 'cosineDistance'. Got {}", distance_name); - } } @@ -297,26 +291,18 @@ std::vector MergeTreeIndexConditionAnnoy::getUsefulRangesImpl(MergeTreeI MergeTreeIndexGranulePtr MergeTreeIndexAnnoy::createIndexGranule() const { if (distance_name == "L2Distance") - { return std::make_shared >(index.name, index.sample_block); - } - if (distance_name == "cosineDistance") - { + else if (distance_name == "cosineDistance") return std::make_shared >(index.name, index.sample_block); - } throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown distance name. Must be 'L2Distance' or 'cosineDistance'. Got {}", distance_name); } MergeTreeIndexAggregatorPtr MergeTreeIndexAnnoy::createIndexAggregator() const { if (distance_name == "L2Distance") - { return std::make_shared >(index.name, index.sample_block, number_of_trees); - } if (distance_name == "cosineDistance") - { return std::make_shared >(index.name, index.sample_block, number_of_trees); - } throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown distance name. Must be 'L2Distance' or 'cosineDistance'. Got {}", distance_name); } @@ -331,16 +317,10 @@ MergeTreeIndexPtr annoyIndexCreator(const IndexDescription & index) uint64_t param = 100; String distance_name = "L2Distance"; if (!index.arguments.empty() && !index.arguments[0].tryGet(param)) - { if (!index.arguments[0].tryGet(distance_name)) - { throw Exception(ErrorCodes::INCORRECT_DATA, "Can't parse first argument"); - } - } if (index.arguments.size() > 1 && !index.arguments[1].tryGet(distance_name)) - { throw Exception(ErrorCodes::INCORRECT_DATA, "Can't parse second argument"); - } return std::make_shared(index, param, distance_name); } @@ -381,18 +361,14 @@ static void assertIndexColumnsType(const Block & header) void annoyIndexValidator(const IndexDescription & index, bool /* attach */) { if (index.arguments.size() > 2) - { throw Exception(ErrorCodes::INCORRECT_QUERY, "Annoy index must not have more than two parameters"); - } + if (!index.arguments.empty() && index.arguments[0].getType() != Field::Types::UInt64 && index.arguments[0].getType() != Field::Types::String) - { throw Exception(ErrorCodes::INCORRECT_QUERY, "Annoy index first argument must be UInt64 or String."); - } + if (index.arguments.size() > 1 && index.arguments[1].getType() != Field::Types::String) - { throw Exception(ErrorCodes::INCORRECT_QUERY, "Annoy index second argument must be String."); - } if (index.column_names.size() != 1 || index.data_types.size() != 1) throw Exception(ErrorCodes::INCORRECT_NUMBER_OF_COLUMNS, "Annoy indexes must be created on a single column"); @@ -401,4 +377,5 @@ void annoyIndexValidator(const IndexDescription & index, bool /* attach */) } } + #endif // ENABLE_ANNOY From edad92a7f224732c52f9d57d062e46c23396d19a Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 25 May 2023 19:36:27 +0000 Subject: [PATCH 0411/1072] Cosmetics: Minor aesthetic fixes --- .../MergeTree/MergeTreeIndexAnnoy.cpp | 45 ++++++++++--------- src/Storages/MergeTree/MergeTreeIndexAnnoy.h | 20 +++------ 2 files changed, 30 insertions(+), 35 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp index 0b7e1f29f03..b31779ff71c 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp @@ -18,10 +18,10 @@ namespace DB namespace ApproximateNearestNeighbour { -template -void AnnoyIndex::serialize(WriteBuffer& ostr) const +template +void AnnoyIndex::serialize(WriteBuffer& ostr) const { - assert(Base::_built); + chassert(Base::_built); writeIntBinary(Base::_s, ostr); writeIntBinary(Base::_n_items, ostr); writeIntBinary(Base::_n_nodes, ostr); @@ -32,10 +32,10 @@ void AnnoyIndex::serialize(WriteBuffer& ostr) const ostr.write(reinterpret_cast(Base::_nodes), Base::_s * Base::_n_nodes); } -template -void AnnoyIndex::deserialize(ReadBuffer& istr) +template +void AnnoyIndex::deserialize(ReadBuffer& istr) { - assert(!Base::_built); + chassert(!Base::_built); readIntBinary(Base::_s, istr); readIntBinary(Base::_n_items, istr); readIntBinary(Base::_n_nodes, istr); @@ -54,8 +54,8 @@ void AnnoyIndex::deserialize(ReadBuffer& istr) Base::_built = true; } -template -uint64_t AnnoyIndex::getNumOfDimensions() const +template +uint64_t AnnoyIndex::getNumOfDimensions() const { return Base::get_f(); } @@ -84,16 +84,16 @@ template MergeTreeIndexGranuleAnnoy::MergeTreeIndexGranuleAnnoy( const String & index_name_, const Block & index_sample_block_, - AnnoyIndexPtr index_base_) + AnnoyIndexPtr index_) : index_name(index_name_) , index_sample_block(index_sample_block_) - , index(std::move(index_base_)) + , index(std::move(index_)) {} template void MergeTreeIndexGranuleAnnoy::serializeBinary(WriteBuffer & ostr) const { - /// number of dimensions is required in the constructor, + /// Number of dimensions is required in the index constructor, /// so it must be written and read separately from the other part writeIntBinary(index->getNumOfDimensions(), ostr); // write dimension index->serialize(ostr); @@ -123,7 +123,7 @@ MergeTreeIndexGranulePtr MergeTreeIndexAggregatorAnnoy::getGranuleAndR { // NOLINTNEXTLINE(*) index->build(static_cast(number_of_trees), /*number_of_threads=*/1); - auto granule = std::make_shared >(index_name, index_sample_block, index); + auto granule = std::make_shared>(index_name, index_sample_block, index); index = nullptr; return granule; } @@ -202,7 +202,8 @@ MergeTreeIndexConditionAnnoy::MergeTreeIndexConditionAnnoy( const SelectQueryInfo & query, ContextPtr context, const String& distance_name_) - : condition(query, context), distance_name(distance_name_) + : condition(query, context) + , distance_name(distance_name_) {} @@ -232,15 +233,16 @@ std::vector MergeTreeIndexConditionAnnoy::getUsefulRangesImpl(MergeTreeI { UInt64 limit = condition.getLimit(); UInt64 index_granularity = condition.getIndexGranularity(); - std::optional comp_dist = condition.getQueryType() == ApproximateNearestNeighbour::ANNQueryInformation::Type::Where ? - std::optional(condition.getComparisonDistanceForWhereQuery()) : std::nullopt; + std::optional comp_dist = condition.getQueryType() == ApproximateNearestNeighbour::ANNQueryInformation::Type::Where + ? std::optional(condition.getComparisonDistanceForWhereQuery()) + : std::nullopt; if (comp_dist && comp_dist.value() < 0) throw Exception(ErrorCodes::LOGICAL_ERROR, "Attempt to optimize query with where without distance"); std::vector target_vec = condition.getTargetVector(); - auto granule = std::dynamic_pointer_cast >(idx_granule); + auto granule = std::dynamic_pointer_cast>(idx_granule); if (granule == nullptr) throw Exception(ErrorCodes::LOGICAL_ERROR, "Granule has the wrong type"); @@ -291,18 +293,19 @@ std::vector MergeTreeIndexConditionAnnoy::getUsefulRangesImpl(MergeTreeI MergeTreeIndexGranulePtr MergeTreeIndexAnnoy::createIndexGranule() const { if (distance_name == "L2Distance") - return std::make_shared >(index.name, index.sample_block); + return std::make_shared>(index.name, index.sample_block); else if (distance_name == "cosineDistance") - return std::make_shared >(index.name, index.sample_block); + return std::make_shared>(index.name, index.sample_block); throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown distance name. Must be 'L2Distance' or 'cosineDistance'. Got {}", distance_name); } MergeTreeIndexAggregatorPtr MergeTreeIndexAnnoy::createIndexAggregator() const { + /// TODO: Support more metrics. Available metrics: https://github.com/spotify/annoy/blob/master/src/annoymodule.cc#L151-L171 if (distance_name == "L2Distance") - return std::make_shared >(index.name, index.sample_block, number_of_trees); + return std::make_shared>(index.name, index.sample_block, number_of_trees); if (distance_name == "cosineDistance") - return std::make_shared >(index.name, index.sample_block, number_of_trees); + return std::make_shared>(index.name, index.sample_block, number_of_trees); throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown distance name. Must be 'L2Distance' or 'cosineDistance'. Got {}", distance_name); } @@ -378,4 +381,4 @@ void annoyIndexValidator(const IndexDescription & index, bool /* attach */) } -#endif // ENABLE_ANNOY +#endif diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.h b/src/Storages/MergeTree/MergeTreeIndexAnnoy.h index b349c0567ef..d591187fc64 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.h +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.h @@ -15,17 +15,13 @@ namespace DB namespace ApproximateNearestNeighbour { -using AnnoyIndexThreadedBuildPolicy = ::Annoy::AnnoyIndexMultiThreadedBuildPolicy; - -// TODO: Support different metrics. List of available metrics can be taken from here: -// https://github.com/spotify/annoy/blob/master/src/annoymodule.cc#L151-L171 template -class AnnoyIndex : public ::Annoy::AnnoyIndex +class AnnoyIndex : public ::Annoy::AnnoyIndex { - using Base = ::Annoy::AnnoyIndex; + using Base = ::Annoy::AnnoyIndex; public: - explicit AnnoyIndex(const uint64_t dim) : Base::AnnoyIndex(dim) {} + explicit AnnoyIndex(uint64_t dim) : Base::AnnoyIndex(dim) {} void serialize(WriteBuffer& ostr) const; void deserialize(ReadBuffer& istr); uint64_t getNumOfDimensions() const; @@ -40,10 +36,7 @@ struct MergeTreeIndexGranuleAnnoy final : public IMergeTreeIndexGranule using AnnoyIndexPtr = std::shared_ptr; MergeTreeIndexGranuleAnnoy(const String & index_name_, const Block & index_sample_block_); - MergeTreeIndexGranuleAnnoy( - const String & index_name_, - const Block & index_sample_block_, - AnnoyIndexPtr index_base_); + MergeTreeIndexGranuleAnnoy(const String & index_name_, const Block & index_sample_block_, AnnoyIndexPtr index_); ~MergeTreeIndexGranuleAnnoy() override = default; @@ -118,8 +111,7 @@ public: MergeTreeIndexGranulePtr createIndexGranule() const override; MergeTreeIndexAggregatorPtr createIndexAggregator() const override; - MergeTreeIndexConditionPtr createIndexCondition( - const SelectQueryInfo & query, ContextPtr context) const override; + MergeTreeIndexConditionPtr createIndexCondition(const SelectQueryInfo & query, ContextPtr context) const override; bool mayBenefitFromIndexForIn(const ASTPtr & /*node*/) const override { return false; } @@ -131,4 +123,4 @@ private: } -#endif // ENABLE_ANNOY +#endif From 1018677f464756f0fcb4a1c26e7143e07d30dfe7 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 25 May 2023 19:39:04 +0000 Subject: [PATCH 0412/1072] Cosmetics: Move ctors into cpp file --- src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp | 12 ++++++++++++ src/Storages/MergeTree/MergeTreeIndexAnnoy.h | 8 ++------ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp index b31779ff71c..e82f279c412 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp @@ -18,6 +18,12 @@ namespace DB namespace ApproximateNearestNeighbour { +template +AnnoyIndex::AnnoyIndex(uint64_t dim) + : Base::AnnoyIndex(dim) +{ +} + template void AnnoyIndex::serialize(WriteBuffer& ostr) const { @@ -290,6 +296,12 @@ std::vector MergeTreeIndexConditionAnnoy::getUsefulRangesImpl(MergeTreeI return result_vector; } +MergeTreeIndexAnnoy::MergeTreeIndexAnnoy(const IndexDescription & index_, uint64_t number_of_trees_, const String& distance_name_) + : IMergeTreeIndex(index_) + , number_of_trees(number_of_trees_) + , distance_name(distance_name_) +{} + MergeTreeIndexGranulePtr MergeTreeIndexAnnoy::createIndexGranule() const { if (distance_name == "L2Distance") diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.h b/src/Storages/MergeTree/MergeTreeIndexAnnoy.h index d591187fc64..2ccb3527b18 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.h +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.h @@ -21,7 +21,7 @@ class AnnoyIndex : public ::Annoy::AnnoyIndex; public: - explicit AnnoyIndex(uint64_t dim) : Base::AnnoyIndex(dim) {} + explicit AnnoyIndex(uint64_t dim); void serialize(WriteBuffer& ostr) const; void deserialize(ReadBuffer& istr); uint64_t getNumOfDimensions() const; @@ -100,11 +100,7 @@ class MergeTreeIndexAnnoy : public IMergeTreeIndex { public: - MergeTreeIndexAnnoy(const IndexDescription & index_, uint64_t number_of_trees_, const String& distance_name_) - : IMergeTreeIndex(index_) - , number_of_trees(number_of_trees_) - , distance_name(distance_name_) - {} + MergeTreeIndexAnnoy(const IndexDescription & index_, uint64_t number_of_trees_, const String& distance_name_); ~MergeTreeIndexAnnoy() override = default; From 4631595cf658ae007e70b563a4631b2a95fd2439 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 25 May 2023 19:44:20 +0000 Subject: [PATCH 0413/1072] Cosmetics: number_of_trees --> trees --- src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp | 14 +++++++------- src/Storages/MergeTree/MergeTreeIndexAnnoy.h | 8 ++++---- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp index e82f279c412..946889066b1 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp @@ -118,17 +118,17 @@ template MergeTreeIndexAggregatorAnnoy::MergeTreeIndexAggregatorAnnoy( const String & index_name_, const Block & index_sample_block_, - uint64_t number_of_trees_) + uint64_t trees_) : index_name(index_name_) , index_sample_block(index_sample_block_) - , number_of_trees(number_of_trees_) + , trees(trees_) {} template MergeTreeIndexGranulePtr MergeTreeIndexAggregatorAnnoy::getGranuleAndReset() { // NOLINTNEXTLINE(*) - index->build(static_cast(number_of_trees), /*number_of_threads=*/1); + index->build(static_cast(trees), /*number_of_threads=*/1); auto granule = std::make_shared>(index_name, index_sample_block, index); index = nullptr; return granule; @@ -296,9 +296,9 @@ std::vector MergeTreeIndexConditionAnnoy::getUsefulRangesImpl(MergeTreeI return result_vector; } -MergeTreeIndexAnnoy::MergeTreeIndexAnnoy(const IndexDescription & index_, uint64_t number_of_trees_, const String& distance_name_) +MergeTreeIndexAnnoy::MergeTreeIndexAnnoy(const IndexDescription & index_, uint64_t trees_, const String& distance_name_) : IMergeTreeIndex(index_) - , number_of_trees(number_of_trees_) + , trees(trees_) , distance_name(distance_name_) {} @@ -315,9 +315,9 @@ MergeTreeIndexAggregatorPtr MergeTreeIndexAnnoy::createIndexAggregator() const { /// TODO: Support more metrics. Available metrics: https://github.com/spotify/annoy/blob/master/src/annoymodule.cc#L151-L171 if (distance_name == "L2Distance") - return std::make_shared>(index.name, index.sample_block, number_of_trees); + return std::make_shared>(index.name, index.sample_block, trees); if (distance_name == "cosineDistance") - return std::make_shared>(index.name, index.sample_block, number_of_trees); + return std::make_shared>(index.name, index.sample_block, trees); throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown distance name. Must be 'L2Distance' or 'cosineDistance'. Got {}", distance_name); } diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.h b/src/Storages/MergeTree/MergeTreeIndexAnnoy.h index 2ccb3527b18..2c41cf457ce 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.h +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.h @@ -56,7 +56,7 @@ struct MergeTreeIndexAggregatorAnnoy final : IMergeTreeIndexAggregator using AnnoyIndex = ApproximateNearestNeighbour::AnnoyIndex; using AnnoyIndexPtr = std::shared_ptr; - MergeTreeIndexAggregatorAnnoy(const String & index_name_, const Block & index_sample_block, uint64_t number_of_trees); + MergeTreeIndexAggregatorAnnoy(const String & index_name_, const Block & index_sample_block, uint64_t trees); ~MergeTreeIndexAggregatorAnnoy() override = default; bool empty() const override { return !index || index->get_n_items() == 0; } @@ -65,7 +65,7 @@ struct MergeTreeIndexAggregatorAnnoy final : IMergeTreeIndexAggregator String index_name; Block index_sample_block; - const uint64_t number_of_trees; + const uint64_t trees; AnnoyIndexPtr index; }; @@ -100,7 +100,7 @@ class MergeTreeIndexAnnoy : public IMergeTreeIndex { public: - MergeTreeIndexAnnoy(const IndexDescription & index_, uint64_t number_of_trees_, const String& distance_name_); + MergeTreeIndexAnnoy(const IndexDescription & index_, uint64_t trees_, const String& distance_name_); ~MergeTreeIndexAnnoy() override = default; @@ -112,7 +112,7 @@ public: bool mayBenefitFromIndexForIn(const ASTPtr & /*node*/) const override { return false; } private: - const uint64_t number_of_trees; + const uint64_t trees; const String distance_name; }; From a8bf7af2918bf3101600cec4465c20e3a3faec0b Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 25 May 2023 19:47:05 +0000 Subject: [PATCH 0414/1072] Cosmetics: Move ErrorCodes to top --- .../MergeTree/MergeTreeIndexAnnoy.cpp | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp index 946889066b1..54df6f46ef2 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp @@ -15,6 +15,16 @@ namespace DB { +namespace ErrorCodes +{ + extern const int ILLEGAL_COLUMN; + extern const int INCORRECT_DATA; + extern const int INCORRECT_NUMBER_OF_COLUMNS; + extern const int INCORRECT_QUERY; + extern const int LOGICAL_ERROR; + extern const int BAD_ARGUMENTS; +} + namespace ApproximateNearestNeighbour { @@ -69,16 +79,6 @@ uint64_t AnnoyIndex::getNumOfDimensions() const } -namespace ErrorCodes -{ - extern const int ILLEGAL_COLUMN; - extern const int INCORRECT_DATA; - extern const int INCORRECT_NUMBER_OF_COLUMNS; - extern const int INCORRECT_QUERY; - extern const int LOGICAL_ERROR; - extern const int BAD_ARGUMENTS; -} - template MergeTreeIndexGranuleAnnoy::MergeTreeIndexGranuleAnnoy(const String & index_name_, const Block & index_sample_block_) : index_name(index_name_) From f8c1f2bd33898e2fbc1bb4a963458a781778f722 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 25 May 2023 19:48:12 +0000 Subject: [PATCH 0415/1072] Cosmetics: Remove absolute namespace qualification of Annoy library internals --- src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp index 54df6f46ef2..716624b7453 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp @@ -226,9 +226,9 @@ bool MergeTreeIndexConditionAnnoy::alwaysUnknownOrTrue() const std::vector MergeTreeIndexConditionAnnoy::getUsefulRanges(MergeTreeIndexGranulePtr idx_granule) const { if (distance_name == "L2Distance") - return getUsefulRangesImpl<::Annoy::Euclidean>(idx_granule); + return getUsefulRangesImpl(idx_granule); else if (distance_name == "cosineDistance") - return getUsefulRangesImpl<::Annoy::Angular>(idx_granule); + return getUsefulRangesImpl(idx_granule); else throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown distance name. Must be 'L2Distance' or 'cosineDistance'. Got {}", distance_name); } @@ -305,9 +305,9 @@ MergeTreeIndexAnnoy::MergeTreeIndexAnnoy(const IndexDescription & index_, uint64 MergeTreeIndexGranulePtr MergeTreeIndexAnnoy::createIndexGranule() const { if (distance_name == "L2Distance") - return std::make_shared>(index.name, index.sample_block); + return std::make_shared>(index.name, index.sample_block); else if (distance_name == "cosineDistance") - return std::make_shared>(index.name, index.sample_block); + return std::make_shared>(index.name, index.sample_block); throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown distance name. Must be 'L2Distance' or 'cosineDistance'. Got {}", distance_name); } @@ -315,9 +315,9 @@ MergeTreeIndexAggregatorPtr MergeTreeIndexAnnoy::createIndexAggregator() const { /// TODO: Support more metrics. Available metrics: https://github.com/spotify/annoy/blob/master/src/annoymodule.cc#L151-L171 if (distance_name == "L2Distance") - return std::make_shared>(index.name, index.sample_block, trees); + return std::make_shared>(index.name, index.sample_block, trees); if (distance_name == "cosineDistance") - return std::make_shared>(index.name, index.sample_block, trees); + return std::make_shared>(index.name, index.sample_block, trees); throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown distance name. Must be 'L2Distance' or 'cosineDistance'. Got {}", distance_name); } From e373cf682ade101bcc2fd288263e547690834ff8 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 25 May 2023 19:50:59 +0000 Subject: [PATCH 0416/1072] Cosmetics: Unwrap Annoy index from nested namespace --- src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp | 4 ---- src/Storages/MergeTree/MergeTreeIndexAnnoy.h | 11 ++--------- 2 files changed, 2 insertions(+), 13 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp index 716624b7453..cbeb6540721 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp @@ -25,8 +25,6 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; } -namespace ApproximateNearestNeighbour -{ template AnnoyIndex::AnnoyIndex(uint64_t dim) @@ -76,8 +74,6 @@ uint64_t AnnoyIndex::getNumOfDimensions() const return Base::get_f(); } -} - template MergeTreeIndexGranuleAnnoy::MergeTreeIndexGranuleAnnoy(const String & index_name_, const Block & index_sample_block_) diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.h b/src/Storages/MergeTree/MergeTreeIndexAnnoy.h index 2c41cf457ce..44f5f8d8eb7 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.h +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.h @@ -10,11 +10,6 @@ namespace DB { -// auxiliary namespace for working with spotify-annoy library -// mainly for serialization and deserialization of the index -namespace ApproximateNearestNeighbour -{ - template class AnnoyIndex : public ::Annoy::AnnoyIndex { @@ -27,12 +22,10 @@ public: uint64_t getNumOfDimensions() const; }; -} - template struct MergeTreeIndexGranuleAnnoy final : public IMergeTreeIndexGranule { - using AnnoyIndex = ApproximateNearestNeighbour::AnnoyIndex; + using AnnoyIndex = AnnoyIndex; using AnnoyIndexPtr = std::shared_ptr; MergeTreeIndexGranuleAnnoy(const String & index_name_, const Block & index_sample_block_); @@ -53,7 +46,7 @@ struct MergeTreeIndexGranuleAnnoy final : public IMergeTreeIndexGranule template struct MergeTreeIndexAggregatorAnnoy final : IMergeTreeIndexAggregator { - using AnnoyIndex = ApproximateNearestNeighbour::AnnoyIndex; + using AnnoyIndex = AnnoyIndex; using AnnoyIndexPtr = std::shared_ptr; MergeTreeIndexAggregatorAnnoy(const String & index_name_, const Block & index_sample_block, uint64_t trees); From dc88d4e7422e9b0c19873297a9b7e2d3107ed052 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 25 May 2023 19:53:07 +0000 Subject: [PATCH 0417/1072] Cosmetics: Factorize repeated typedefs into a single typedef --- src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp | 8 ++++---- src/Storages/MergeTree/MergeTreeIndexAnnoy.h | 15 ++++++--------- 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp index cbeb6540721..58c59028075 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp @@ -86,7 +86,7 @@ template MergeTreeIndexGranuleAnnoy::MergeTreeIndexGranuleAnnoy( const String & index_name_, const Block & index_sample_block_, - AnnoyIndexPtr index_) + AnnoyIndexPtr index_) : index_name(index_name_) , index_sample_block(index_sample_block_) , index(std::move(index_)) @@ -106,7 +106,7 @@ void MergeTreeIndexGranuleAnnoy::deserializeBinary(ReadBuffer & istr, { uint64_t dimension; readIntBinary(dimension, istr); - index = std::make_shared(dimension); + index = std::make_shared>(dimension); index->deserialize(istr); } @@ -164,7 +164,7 @@ void MergeTreeIndexAggregatorAnnoy::update(const Block & block, size_t if (offsets[i + 1] - offsets[i] != size) throw Exception(ErrorCodes::INCORRECT_DATA, "Arrays should have same length"); - index = std::make_shared(size); + index = std::make_shared>(size); index->add_item(index->get_n_items(), array.data()); /// add all rows from 1 to num_rows - 1 (this is the same as the beginning of the last element) @@ -190,7 +190,7 @@ void MergeTreeIndexAggregatorAnnoy::update(const Block & block, size_t } assert(!data.empty()); if (!index) - index = std::make_shared(data[0].size()); + index = std::make_shared>(data[0].size()); for (const auto& item : data) index->add_item(index->get_n_items(), item.data()); } diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.h b/src/Storages/MergeTree/MergeTreeIndexAnnoy.h index 44f5f8d8eb7..bb0d1883fc2 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.h +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.h @@ -22,14 +22,14 @@ public: uint64_t getNumOfDimensions() const; }; +template +using AnnoyIndexPtr = std::shared_ptr>; + template struct MergeTreeIndexGranuleAnnoy final : public IMergeTreeIndexGranule { - using AnnoyIndex = AnnoyIndex; - using AnnoyIndexPtr = std::shared_ptr; - MergeTreeIndexGranuleAnnoy(const String & index_name_, const Block & index_sample_block_); - MergeTreeIndexGranuleAnnoy(const String & index_name_, const Block & index_sample_block_, AnnoyIndexPtr index_); + MergeTreeIndexGranuleAnnoy(const String & index_name_, const Block & index_sample_block_, AnnoyIndexPtr index_); ~MergeTreeIndexGranuleAnnoy() override = default; @@ -40,15 +40,12 @@ struct MergeTreeIndexGranuleAnnoy final : public IMergeTreeIndexGranule String index_name; Block index_sample_block; - AnnoyIndexPtr index; + AnnoyIndexPtr index; }; template struct MergeTreeIndexAggregatorAnnoy final : IMergeTreeIndexAggregator { - using AnnoyIndex = AnnoyIndex; - using AnnoyIndexPtr = std::shared_ptr; - MergeTreeIndexAggregatorAnnoy(const String & index_name_, const Block & index_sample_block, uint64_t trees); ~MergeTreeIndexAggregatorAnnoy() override = default; @@ -59,7 +56,7 @@ struct MergeTreeIndexAggregatorAnnoy final : IMergeTreeIndexAggregator String index_name; Block index_sample_block; const uint64_t trees; - AnnoyIndexPtr index; + AnnoyIndexPtr index; }; From 594572b0de48f7bd3be39f3abcb057f708b7fcf9 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 25 May 2023 19:55:01 +0000 Subject: [PATCH 0418/1072] Cosmetics: AnnoyIndex --> AnnoyIndexWithSerialization --- src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp | 16 ++++++++-------- src/Storages/MergeTree/MergeTreeIndexAnnoy.h | 12 ++++++------ 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp index 58c59028075..60b9efcaf67 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp @@ -27,13 +27,13 @@ namespace ErrorCodes template -AnnoyIndex::AnnoyIndex(uint64_t dim) +AnnoyIndexWithSerialization::AnnoyIndexWithSerialization(uint64_t dim) : Base::AnnoyIndex(dim) { } template -void AnnoyIndex::serialize(WriteBuffer& ostr) const +void AnnoyIndexWithSerialization::serialize(WriteBuffer& ostr) const { chassert(Base::_built); writeIntBinary(Base::_s, ostr); @@ -47,7 +47,7 @@ void AnnoyIndex::serialize(WriteBuffer& ostr) const } template -void AnnoyIndex::deserialize(ReadBuffer& istr) +void AnnoyIndexWithSerialization::deserialize(ReadBuffer& istr) { chassert(!Base::_built); readIntBinary(Base::_s, istr); @@ -69,7 +69,7 @@ void AnnoyIndex::deserialize(ReadBuffer& istr) } template -uint64_t AnnoyIndex::getNumOfDimensions() const +uint64_t AnnoyIndexWithSerialization::getNumOfDimensions() const { return Base::get_f(); } @@ -86,7 +86,7 @@ template MergeTreeIndexGranuleAnnoy::MergeTreeIndexGranuleAnnoy( const String & index_name_, const Block & index_sample_block_, - AnnoyIndexPtr index_) + AnnoyIndexWithSerializationPtr index_) : index_name(index_name_) , index_sample_block(index_sample_block_) , index(std::move(index_)) @@ -106,7 +106,7 @@ void MergeTreeIndexGranuleAnnoy::deserializeBinary(ReadBuffer & istr, { uint64_t dimension; readIntBinary(dimension, istr); - index = std::make_shared>(dimension); + index = std::make_shared>(dimension); index->deserialize(istr); } @@ -164,7 +164,7 @@ void MergeTreeIndexAggregatorAnnoy::update(const Block & block, size_t if (offsets[i + 1] - offsets[i] != size) throw Exception(ErrorCodes::INCORRECT_DATA, "Arrays should have same length"); - index = std::make_shared>(size); + index = std::make_shared>(size); index->add_item(index->get_n_items(), array.data()); /// add all rows from 1 to num_rows - 1 (this is the same as the beginning of the last element) @@ -190,7 +190,7 @@ void MergeTreeIndexAggregatorAnnoy::update(const Block & block, size_t } assert(!data.empty()); if (!index) - index = std::make_shared>(data[0].size()); + index = std::make_shared>(data[0].size()); for (const auto& item : data) index->add_item(index->get_n_items(), item.data()); } diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.h b/src/Storages/MergeTree/MergeTreeIndexAnnoy.h index bb0d1883fc2..c5520ab5673 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.h +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.h @@ -11,25 +11,25 @@ namespace DB { template -class AnnoyIndex : public ::Annoy::AnnoyIndex +class AnnoyIndexWithSerialization : public ::Annoy::AnnoyIndex { using Base = ::Annoy::AnnoyIndex; public: - explicit AnnoyIndex(uint64_t dim); + explicit AnnoyIndexWithSerialization(uint64_t dim); void serialize(WriteBuffer& ostr) const; void deserialize(ReadBuffer& istr); uint64_t getNumOfDimensions() const; }; template -using AnnoyIndexPtr = std::shared_ptr>; +using AnnoyIndexWithSerializationPtr = std::shared_ptr>; template struct MergeTreeIndexGranuleAnnoy final : public IMergeTreeIndexGranule { MergeTreeIndexGranuleAnnoy(const String & index_name_, const Block & index_sample_block_); - MergeTreeIndexGranuleAnnoy(const String & index_name_, const Block & index_sample_block_, AnnoyIndexPtr index_); + MergeTreeIndexGranuleAnnoy(const String & index_name_, const Block & index_sample_block_, AnnoyIndexWithSerializationPtr index_); ~MergeTreeIndexGranuleAnnoy() override = default; @@ -40,7 +40,7 @@ struct MergeTreeIndexGranuleAnnoy final : public IMergeTreeIndexGranule String index_name; Block index_sample_block; - AnnoyIndexPtr index; + AnnoyIndexWithSerializationPtr index; }; template @@ -56,7 +56,7 @@ struct MergeTreeIndexAggregatorAnnoy final : IMergeTreeIndexAggregator String index_name; Block index_sample_block; const uint64_t trees; - AnnoyIndexPtr index; + AnnoyIndexWithSerializationPtr index; }; From c5ededdc5badab8fd18c6a321008157f2d1bbad5 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 25 May 2023 20:03:51 +0000 Subject: [PATCH 0419/1072] Cosmetics: Switch arguments in MTIConditionAnnoy ctor --- src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp | 6 +++--- src/Storages/MergeTree/MergeTreeIndexAnnoy.h | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp index 60b9efcaf67..b7346e540d2 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp @@ -202,8 +202,8 @@ void MergeTreeIndexAggregatorAnnoy::update(const Block & block, size_t MergeTreeIndexConditionAnnoy::MergeTreeIndexConditionAnnoy( const IndexDescription & /*index*/, const SelectQueryInfo & query, - ContextPtr context, - const String& distance_name_) + const String& distance_name_, + ContextPtr context) : condition(query, context) , distance_name(distance_name_) {} @@ -320,7 +320,7 @@ MergeTreeIndexAggregatorPtr MergeTreeIndexAnnoy::createIndexAggregator() const MergeTreeIndexConditionPtr MergeTreeIndexAnnoy::createIndexCondition( const SelectQueryInfo & query, ContextPtr context) const { - return std::make_shared(index, query, context, distance_name); + return std::make_shared(index, query, distance_name, context); }; MergeTreeIndexPtr annoyIndexCreator(const IndexDescription & index) diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.h b/src/Storages/MergeTree/MergeTreeIndexAnnoy.h index c5520ab5673..9ebaa335542 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.h +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.h @@ -66,8 +66,8 @@ public: MergeTreeIndexConditionAnnoy( const IndexDescription & index, const SelectQueryInfo & query, - ContextPtr context, - const String& distance_name); + const String& distance_name, + ContextPtr context); bool alwaysUnknownOrTrue() const override; From 3b77e4090221b9977fc98bef1e7fac5cf035eb6a Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 25 May 2023 20:05:25 +0000 Subject: [PATCH 0420/1072] Cosmetics: Remove dots from exception messages --- src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp index b7346e540d2..e13ea91e56c 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp @@ -345,7 +345,7 @@ static void assertIndexColumnsType(const Block & header) if (!WhichDataType(nested_type_index).isFloat32()) throw Exception( ErrorCodes::ILLEGAL_COLUMN, - "Unexpected type {} of Annoy index. Only Array(Float32) and Tuple(Float32) are supported.", + "Unexpected type {} of Annoy index. Only Array(Float32) and Tuple(Float32) are supported", column_data_type_ptr->getName()); } else if (const auto * tuple_type = typeid_cast(column_data_type_ptr.get())) @@ -357,14 +357,14 @@ static void assertIndexColumnsType(const Block & header) if (!WhichDataType(nested_type_index).isFloat32()) throw Exception( ErrorCodes::ILLEGAL_COLUMN, - "Unexpected type {} of Annoy index. Only Array(Float32) and Tuple(Float32) are supported.", + "Unexpected type {} of Annoy index. Only Array(Float32) and Tuple(Float32) are supported", column_data_type_ptr->getName()); } } else throw Exception( ErrorCodes::ILLEGAL_COLUMN, - "Unexpected type {} of Annoy index. Only Array(Float32) and Tuple(Float32) are supported.", + "Unexpected type {} of Annoy index. Only Array(Float32) and Tuple(Float32) are supported", column_data_type_ptr->getName()); } @@ -376,10 +376,10 @@ void annoyIndexValidator(const IndexDescription & index, bool /* attach */) if (!index.arguments.empty() && index.arguments[0].getType() != Field::Types::UInt64 && index.arguments[0].getType() != Field::Types::String) - throw Exception(ErrorCodes::INCORRECT_QUERY, "Annoy index first argument must be UInt64 or String."); + throw Exception(ErrorCodes::INCORRECT_QUERY, "Annoy index first argument must be UInt64 or String"); if (index.arguments.size() > 1 && index.arguments[1].getType() != Field::Types::String) - throw Exception(ErrorCodes::INCORRECT_QUERY, "Annoy index second argument must be String."); + throw Exception(ErrorCodes::INCORRECT_QUERY, "Annoy index second argument must be String"); if (index.column_names.size() != 1 || index.data_types.size() != 1) throw Exception(ErrorCodes::INCORRECT_NUMBER_OF_COLUMNS, "Annoy indexes must be created on a single column"); From 423f69228239503b420153054a7c878b14aa2f47 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 25 May 2023 20:07:43 +0000 Subject: [PATCH 0421/1072] Cosmetics: Remove unnecessary toString() --- src/Storages/MergeTree/MergeTreeIndexAggregatorBloomFilter.cpp | 2 +- src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp | 2 +- src/Storages/MergeTree/MergeTreeIndexFullText.cpp | 2 +- src/Storages/MergeTree/MergeTreeIndexInverted.cpp | 2 +- src/Storages/MergeTree/MergeTreeIndexMinMax.cpp | 2 +- src/Storages/MergeTree/MergeTreeIndexSet.cpp | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeIndexAggregatorBloomFilter.cpp b/src/Storages/MergeTree/MergeTreeIndexAggregatorBloomFilter.cpp index fe5a2a861f6..ef98accfbc6 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAggregatorBloomFilter.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexAggregatorBloomFilter.cpp @@ -42,7 +42,7 @@ void MergeTreeIndexAggregatorBloomFilter::update(const Block & block, size_t * p { if (*pos >= block.rows()) throw Exception(ErrorCodes::LOGICAL_ERROR, "The provided position is not less than the number of block rows. " - "Position: {}, Block rows: {}.", toString(*pos), toString(block.rows())); + "Position: {}, Block rows: {}.", *pos, block.rows()); Block granule_index_block; size_t max_read_rows = std::min(block.rows() - *pos, limit); diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp index e13ea91e56c..133c0a9a58a 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp @@ -137,7 +137,7 @@ void MergeTreeIndexAggregatorAnnoy::update(const Block & block, size_t throw Exception( ErrorCodes::LOGICAL_ERROR, "The provided position is not less than the number of block rows. Position: {}, Block rows: {}.", - toString(*pos), toString(block.rows())); + *pos, block.rows()); size_t rows_read = std::min(limit, block.rows() - *pos); if (rows_read == 0) diff --git a/src/Storages/MergeTree/MergeTreeIndexFullText.cpp b/src/Storages/MergeTree/MergeTreeIndexFullText.cpp index 06fddd51cb8..b15bf4d6811 100644 --- a/src/Storages/MergeTree/MergeTreeIndexFullText.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexFullText.cpp @@ -92,7 +92,7 @@ void MergeTreeIndexAggregatorFullText::update(const Block & block, size_t * pos, { if (*pos >= block.rows()) throw Exception(ErrorCodes::LOGICAL_ERROR, "The provided position is not less than the number of block rows. " - "Position: {}, Block rows: {}.", toString(*pos), toString(block.rows())); + "Position: {}, Block rows: {}.", *pos, block.rows()); size_t rows_read = std::min(limit, block.rows() - *pos); diff --git a/src/Storages/MergeTree/MergeTreeIndexInverted.cpp b/src/Storages/MergeTree/MergeTreeIndexInverted.cpp index baa11368c8b..e19187646cd 100644 --- a/src/Storages/MergeTree/MergeTreeIndexInverted.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexInverted.cpp @@ -123,7 +123,7 @@ void MergeTreeIndexAggregatorInverted::update(const Block & block, size_t * pos, { if (*pos >= block.rows()) throw Exception(ErrorCodes::LOGICAL_ERROR, "The provided position is not less than the number of block rows. " - "Position: {}, Block rows: {}.", toString(*pos), toString(block.rows())); + "Position: {}, Block rows: {}.", *pos, block.rows()); size_t rows_read = std::min(limit, block.rows() - *pos); auto row_id = store->getNextRowIDRange(rows_read); diff --git a/src/Storages/MergeTree/MergeTreeIndexMinMax.cpp b/src/Storages/MergeTree/MergeTreeIndexMinMax.cpp index d80f7521430..3b011837cb3 100644 --- a/src/Storages/MergeTree/MergeTreeIndexMinMax.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexMinMax.cpp @@ -122,7 +122,7 @@ void MergeTreeIndexAggregatorMinMax::update(const Block & block, size_t * pos, s { if (*pos >= block.rows()) throw Exception(ErrorCodes::LOGICAL_ERROR, "The provided position is not less than the number of block rows. " - "Position: {}, Block rows: {}.", toString(*pos), toString(block.rows())); + "Position: {}, Block rows: {}.", *pos, block.rows()); size_t rows_read = std::min(limit, block.rows() - *pos); diff --git a/src/Storages/MergeTree/MergeTreeIndexSet.cpp b/src/Storages/MergeTree/MergeTreeIndexSet.cpp index 901636a2de9..120b3e43472 100644 --- a/src/Storages/MergeTree/MergeTreeIndexSet.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexSet.cpp @@ -146,7 +146,7 @@ void MergeTreeIndexAggregatorSet::update(const Block & block, size_t * pos, size { if (*pos >= block.rows()) throw Exception(ErrorCodes::LOGICAL_ERROR, "The provided position is not less than the number of block rows. " - "Position: {}, Block rows: {}.", toString(*pos), toString(block.rows())); + "Position: {}, Block rows: {}.", *pos, block.rows()); size_t rows_read = std::min(limit, block.rows() - *pos); From 7608e08eed5cb80f6be097ca572827a5090a0469 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 25 May 2023 20:09:03 +0000 Subject: [PATCH 0422/1072] Cosmetics: more constness --- src/Storages/MergeTree/MergeTreeIndexAnnoy.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.h b/src/Storages/MergeTree/MergeTreeIndexAnnoy.h index 9ebaa335542..cde61af2891 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.h +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.h @@ -38,8 +38,8 @@ struct MergeTreeIndexGranuleAnnoy final : public IMergeTreeIndexGranule bool empty() const override { return !index.get(); } - String index_name; - Block index_sample_block; + const String index_name; + const Block index_sample_block; AnnoyIndexWithSerializationPtr index; }; @@ -53,8 +53,8 @@ struct MergeTreeIndexAggregatorAnnoy final : IMergeTreeIndexAggregator MergeTreeIndexGranulePtr getGranuleAndReset() override; void update(const Block & block, size_t * pos, size_t limit) override; - String index_name; - Block index_sample_block; + const String index_name; + const Block index_sample_block; const uint64_t trees; AnnoyIndexWithSerializationPtr index; }; @@ -81,7 +81,7 @@ private: template std::vector getUsefulRangesImpl(MergeTreeIndexGranulePtr idx_granule) const; - ApproximateNearestNeighbour::ANNCondition condition; + const ApproximateNearestNeighbour::ANNCondition condition; const String distance_name; }; From a3d4ede26cc57b80a74061072a8b4e7fbe2832bf Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 25 May 2023 20:11:10 +0000 Subject: [PATCH 0423/1072] Cosmetics: Update exception messages --- src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp index 133c0a9a58a..cd094bbeeac 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp @@ -144,7 +144,7 @@ void MergeTreeIndexAggregatorAnnoy::update(const Block & block, size_t return; if (index_sample_block.columns() > 1) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Only one column is supported"); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected block with single column"); auto index_column_name = index_sample_block.getByPosition(0).name; const auto & column_cut = block.getByName(index_column_name).column->cut(*pos, rows_read); @@ -162,7 +162,7 @@ void MergeTreeIndexAggregatorAnnoy::update(const Block & block, size_t size_t size = offsets[0]; for (size_t i = 0; i < num_rows - 1; ++i) if (offsets[i + 1] - offsets[i] != size) - throw Exception(ErrorCodes::INCORRECT_DATA, "Arrays should have same length"); + throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column {} must have equal length", index_column_name); index = std::make_shared>(size); From 2b74daaa1744588c44414f5e87578107fd0e1b84 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 25 May 2023 20:12:29 +0000 Subject: [PATCH 0424/1072] Cosmetics: make input switch a bit more idiomatic --- src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp index cd094bbeeac..048200eb57f 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp @@ -148,8 +148,8 @@ void MergeTreeIndexAggregatorAnnoy::update(const Block & block, size_t auto index_column_name = index_sample_block.getByPosition(0).name; const auto & column_cut = block.getByName(index_column_name).column->cut(*pos, rows_read); - const auto & column_array = typeid_cast(column_cut.get()); - if (column_array) + + if (const auto & column_array = typeid_cast(column_cut.get())) { const auto & data = column_array->getData(); const auto & array = typeid_cast(data).getData(); @@ -171,14 +171,8 @@ void MergeTreeIndexAggregatorAnnoy::update(const Block & block, size_t for (size_t current_row = 1; current_row < num_rows; ++current_row) index->add_item(index->get_n_items(), &array[offsets[current_row - 1]]); } - else + else if (const auto & column_tuple = typeid_cast(column_cut.get())) { - /// Other possible type of column is Tuple - const auto & column_tuple = typeid_cast(column_cut.get()); - - if (!column_tuple) - throw Exception(ErrorCodes::INCORRECT_QUERY, "Wrong type was given to index."); - const auto & columns = column_tuple->getColumns(); std::vector> data{column_tuple->size(), std::vector()}; @@ -194,6 +188,8 @@ void MergeTreeIndexAggregatorAnnoy::update(const Block & block, size_t for (const auto& item : data) index->add_item(index->get_n_items(), item.data()); } + else + throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected Array or Tuple column"); *pos += rows_read; } From 68ad903f4c9a6bc04f6847d586dd6e8ff08c4aae Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 25 May 2023 20:13:22 +0000 Subject: [PATCH 0425/1072] Cosmetics: unglue * and & --- src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp index 048200eb57f..80c56a299be 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp @@ -149,10 +149,10 @@ void MergeTreeIndexAggregatorAnnoy::update(const Block & block, size_t auto index_column_name = index_sample_block.getByPosition(0).name; const auto & column_cut = block.getByName(index_column_name).column->cut(*pos, rows_read); - if (const auto & column_array = typeid_cast(column_cut.get())) + if (const auto & column_array = typeid_cast(column_cut.get())) { const auto & data = column_array->getData(); - const auto & array = typeid_cast(data).getData(); + const auto & array = typeid_cast(data).getData(); if (array.empty()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Array has 0 rows, {} rows expected", rows_read); const auto & offsets = column_array->getOffsets(); @@ -171,21 +171,21 @@ void MergeTreeIndexAggregatorAnnoy::update(const Block & block, size_t for (size_t current_row = 1; current_row < num_rows; ++current_row) index->add_item(index->get_n_items(), &array[offsets[current_row - 1]]); } - else if (const auto & column_tuple = typeid_cast(column_cut.get())) + else if (const auto & column_tuple = typeid_cast(column_cut.get())) { const auto & columns = column_tuple->getColumns(); std::vector> data{column_tuple->size(), std::vector()}; - for (const auto& column : columns) + for (const auto & column : columns) { - const auto& pod_array = typeid_cast(column.get())->getData(); + const auto & pod_array = typeid_cast(column.get())->getData(); for (size_t i = 0; i < pod_array.size(); ++i) data[i].push_back(pod_array[i]); } assert(!data.empty()); if (!index) index = std::make_shared>(data[0].size()); - for (const auto& item : data) + for (const auto & item : data) index->add_item(index->get_n_items(), item.data()); } else From 15c9e235c44785a8dc67544728d607271fe10436 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 25 May 2023 20:19:44 +0000 Subject: [PATCH 0426/1072] Cosmetics: add some comments + minor changes --- src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp index 80c56a299be..ea284892754 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp @@ -140,6 +140,7 @@ void MergeTreeIndexAggregatorAnnoy::update(const Block & block, size_t *pos, block.rows()); size_t rows_read = std::min(limit, block.rows() - *pos); + if (rows_read == 0) return; @@ -153,10 +154,12 @@ void MergeTreeIndexAggregatorAnnoy::update(const Block & block, size_t { const auto & data = column_array->getData(); const auto & array = typeid_cast(data).getData(); + if (array.empty()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Array has 0 rows, {} rows expected", rows_read); + const auto & offsets = column_array->getOffsets(); - size_t num_rows = offsets.size(); + const size_t num_rows = offsets.size(); /// Check all sizes are the same size_t size = offsets[0]; @@ -166,8 +169,8 @@ void MergeTreeIndexAggregatorAnnoy::update(const Block & block, size_t index = std::make_shared>(size); + /// Add all rows of block index->add_item(index->get_n_items(), array.data()); - /// add all rows from 1 to num_rows - 1 (this is the same as the beginning of the last element) for (size_t current_row = 1; current_row < num_rows; ++current_row) index->add_item(index->get_n_items(), &array[offsets[current_row - 1]]); } @@ -175,6 +178,7 @@ void MergeTreeIndexAggregatorAnnoy::update(const Block & block, size_t { const auto & columns = column_tuple->getColumns(); + /// TODO check if calling index->add_item() directly on the block's tuples is faster than materializing everything std::vector> data{column_tuple->size(), std::vector()}; for (const auto & column : columns) { @@ -182,9 +186,12 @@ void MergeTreeIndexAggregatorAnnoy::update(const Block & block, size_t for (size_t i = 0; i < pod_array.size(); ++i) data[i].push_back(pod_array[i]); } - assert(!data.empty()); - if (!index) - index = std::make_shared>(data[0].size()); + + if (data.empty()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Tuple has 0 rows, {} rows expected", rows_read); + + index = std::make_shared>(data[0].size()); + for (const auto & item : data) index->add_item(index->get_n_items(), item.data()); } From 828155ebefde62b610e9866a45a6d7fd71eb14b7 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 25 May 2023 20:21:47 +0000 Subject: [PATCH 0427/1072] Cosmetics: Move assertIndexColumnsType() into annoyIndexValidator() --- .../MergeTree/MergeTreeIndexAnnoy.cpp | 36 ++++++++----------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp index ea284892754..636fd384248 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp @@ -338,9 +338,22 @@ MergeTreeIndexPtr annoyIndexCreator(const IndexDescription & index) return std::make_shared(index, param, distance_name); } -static void assertIndexColumnsType(const Block & header) +void annoyIndexValidator(const IndexDescription & index, bool /* attach */) { - DataTypePtr column_data_type_ptr = header.getDataTypes()[0]; + if (index.arguments.size() > 2) + throw Exception(ErrorCodes::INCORRECT_QUERY, "Annoy index must not have more than two parameters"); + + if (!index.arguments.empty() && index.arguments[0].getType() != Field::Types::UInt64 + && index.arguments[0].getType() != Field::Types::String) + throw Exception(ErrorCodes::INCORRECT_QUERY, "Annoy index first argument must be UInt64 or String"); + + if (index.arguments.size() > 1 && index.arguments[1].getType() != Field::Types::String) + throw Exception(ErrorCodes::INCORRECT_QUERY, "Annoy index second argument must be String"); + + if (index.column_names.size() != 1 || index.data_types.size() != 1) + throw Exception(ErrorCodes::INCORRECT_NUMBER_OF_COLUMNS, "Annoy indexes must be created on a single column"); + + DataTypePtr column_data_type_ptr = index.sample_block.getDataTypes()[0]; if (const auto * array_type = typeid_cast(column_data_type_ptr.get())) { @@ -369,25 +382,6 @@ static void assertIndexColumnsType(const Block & header) ErrorCodes::ILLEGAL_COLUMN, "Unexpected type {} of Annoy index. Only Array(Float32) and Tuple(Float32) are supported", column_data_type_ptr->getName()); - -} - -void annoyIndexValidator(const IndexDescription & index, bool /* attach */) -{ - if (index.arguments.size() > 2) - throw Exception(ErrorCodes::INCORRECT_QUERY, "Annoy index must not have more than two parameters"); - - if (!index.arguments.empty() && index.arguments[0].getType() != Field::Types::UInt64 - && index.arguments[0].getType() != Field::Types::String) - throw Exception(ErrorCodes::INCORRECT_QUERY, "Annoy index first argument must be UInt64 or String"); - - if (index.arguments.size() > 1 && index.arguments[1].getType() != Field::Types::String) - throw Exception(ErrorCodes::INCORRECT_QUERY, "Annoy index second argument must be String"); - - if (index.column_names.size() != 1 || index.data_types.size() != 1) - throw Exception(ErrorCodes::INCORRECT_NUMBER_OF_COLUMNS, "Annoy indexes must be created on a single column"); - - assertIndexColumnsType(index.sample_block); } } From 0854d913723d967e07105857924f788817ac48c5 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 25 May 2023 20:27:01 +0000 Subject: [PATCH 0428/1072] Cosmetics: Rename variable --- .../MergeTree/MergeTreeIndexAnnoy.cpp | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp index 636fd384248..d75f4978f08 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp @@ -353,35 +353,35 @@ void annoyIndexValidator(const IndexDescription & index, bool /* attach */) if (index.column_names.size() != 1 || index.data_types.size() != 1) throw Exception(ErrorCodes::INCORRECT_NUMBER_OF_COLUMNS, "Annoy indexes must be created on a single column"); - DataTypePtr column_data_type_ptr = index.sample_block.getDataTypes()[0]; + DataTypePtr data_type = index.sample_block.getDataTypes()[0]; - if (const auto * array_type = typeid_cast(column_data_type_ptr.get())) + if (const auto * data_type_array = typeid_cast(data_type.get())) { - TypeIndex nested_type_index = array_type->getNestedType()->getTypeId(); + TypeIndex nested_type_index = data_type_array->getNestedType()->getTypeId(); if (!WhichDataType(nested_type_index).isFloat32()) throw Exception( ErrorCodes::ILLEGAL_COLUMN, "Unexpected type {} of Annoy index. Only Array(Float32) and Tuple(Float32) are supported", - column_data_type_ptr->getName()); + data_type->getName()); } - else if (const auto * tuple_type = typeid_cast(column_data_type_ptr.get())) + else if (const auto * data_type_tuple = typeid_cast(data_type.get())) { - const DataTypes & nested_types = tuple_type->getElements(); - for (const auto & type : nested_types) + const DataTypes & inner_types = data_type_tuple->getElements(); + for (const auto & inner_type : inner_types) { - TypeIndex nested_type_index = type->getTypeId(); + TypeIndex nested_type_index = inner_type->getTypeId(); if (!WhichDataType(nested_type_index).isFloat32()) throw Exception( ErrorCodes::ILLEGAL_COLUMN, - "Unexpected type {} of Annoy index. Only Array(Float32) and Tuple(Float32) are supported", - column_data_type_ptr->getName()); + "Unexpected inner_type {} of Annoy index. Only Array(Float32) and Tuple(Float32) are supported", + data_type->getName()); } } else throw Exception( ErrorCodes::ILLEGAL_COLUMN, "Unexpected type {} of Annoy index. Only Array(Float32) and Tuple(Float32) are supported", - column_data_type_ptr->getName()); + data_type->getName()); } } From 62c8b9a7a11c0a3aae75a10a2a0ad61db5d63b55 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 25 May 2023 20:28:20 +0000 Subject: [PATCH 0429/1072] Cosmetics: Factorize throw into lambda --- .../MergeTree/MergeTreeIndexAnnoy.cpp | 23 +++++++++---------- 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp index d75f4978f08..d35f435c391 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp @@ -353,16 +353,21 @@ void annoyIndexValidator(const IndexDescription & index, bool /* attach */) if (index.column_names.size() != 1 || index.data_types.size() != 1) throw Exception(ErrorCodes::INCORRECT_NUMBER_OF_COLUMNS, "Annoy indexes must be created on a single column"); + auto throw_unsupported_underlying_column_exception = [](DataTypePtr data_type) + { + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, + "Annoy indexes can only be created on columns of type Array(Float32) and Tuple(Float32). Given type: {}", + data_type->getName()); + }; + DataTypePtr data_type = index.sample_block.getDataTypes()[0]; if (const auto * data_type_array = typeid_cast(data_type.get())) { TypeIndex nested_type_index = data_type_array->getNestedType()->getTypeId(); if (!WhichDataType(nested_type_index).isFloat32()) - throw Exception( - ErrorCodes::ILLEGAL_COLUMN, - "Unexpected type {} of Annoy index. Only Array(Float32) and Tuple(Float32) are supported", - data_type->getName()); + throw_unsupported_underlying_column_exception(data_type); } else if (const auto * data_type_tuple = typeid_cast(data_type.get())) { @@ -371,17 +376,11 @@ void annoyIndexValidator(const IndexDescription & index, bool /* attach */) { TypeIndex nested_type_index = inner_type->getTypeId(); if (!WhichDataType(nested_type_index).isFloat32()) - throw Exception( - ErrorCodes::ILLEGAL_COLUMN, - "Unexpected inner_type {} of Annoy index. Only Array(Float32) and Tuple(Float32) are supported", - data_type->getName()); + throw_unsupported_underlying_column_exception(data_type); } } else - throw Exception( - ErrorCodes::ILLEGAL_COLUMN, - "Unexpected type {} of Annoy index. Only Array(Float32) and Tuple(Float32) are supported", - data_type->getName()); + throw_unsupported_underlying_column_exception(data_type); } } From 5d871c7fa09637eb89b69680ba5e5d256bddbdd7 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 25 May 2023 20:29:51 +0000 Subject: [PATCH 0430/1072] Cosmetics: +comments --- src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp index d35f435c391..8f0cad48dc0 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp @@ -340,6 +340,8 @@ MergeTreeIndexPtr annoyIndexCreator(const IndexDescription & index) void annoyIndexValidator(const IndexDescription & index, bool /* attach */) { + /// Check number and type of Annoy index arguments: + if (index.arguments.size() > 2) throw Exception(ErrorCodes::INCORRECT_QUERY, "Annoy index must not have more than two parameters"); @@ -350,9 +352,13 @@ void annoyIndexValidator(const IndexDescription & index, bool /* attach */) if (index.arguments.size() > 1 && index.arguments[1].getType() != Field::Types::String) throw Exception(ErrorCodes::INCORRECT_QUERY, "Annoy index second argument must be String"); + /// Check that the index is created on a single column + if (index.column_names.size() != 1 || index.data_types.size() != 1) throw Exception(ErrorCodes::INCORRECT_NUMBER_OF_COLUMNS, "Annoy indexes must be created on a single column"); + /// Check data type of indexed column: + auto throw_unsupported_underlying_column_exception = [](DataTypePtr data_type) { throw Exception( From f577bf35fc53262290e3c6e18352d1a446cbb642 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 25 May 2023 20:35:38 +0000 Subject: [PATCH 0431/1072] Simplify Annoy parameterization --- .../MergeTree/MergeTreeIndexAnnoy.cpp | 25 ++++--- tests/queries/0_stateless/02354_annoy.sh | 73 +------------------ .../0_stateless/02354_annoy_index.reference | 1 + .../queries/0_stateless/02354_annoy_index.sql | 26 +++++++ 4 files changed, 42 insertions(+), 83 deletions(-) create mode 100644 tests/queries/0_stateless/02354_annoy_index.reference create mode 100644 tests/queries/0_stateless/02354_annoy_index.sql diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp index 8f0cad48dc0..dc353c97143 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp @@ -328,14 +328,16 @@ MergeTreeIndexConditionPtr MergeTreeIndexAnnoy::createIndexCondition( MergeTreeIndexPtr annoyIndexCreator(const IndexDescription & index) { - uint64_t param = 100; + uint64_t trees = 100; String distance_name = "L2Distance"; - if (!index.arguments.empty() && !index.arguments[0].tryGet(param)) - if (!index.arguments[0].tryGet(distance_name)) - throw Exception(ErrorCodes::INCORRECT_DATA, "Can't parse first argument"); - if (index.arguments.size() > 1 && !index.arguments[1].tryGet(distance_name)) - throw Exception(ErrorCodes::INCORRECT_DATA, "Can't parse second argument"); - return std::make_shared(index, param, distance_name); + + if (!index.arguments.empty()) + distance_name = index.arguments[0].get(); + + if (index.arguments.size() > 1) + trees = index.arguments[1].get(); + + return std::make_shared(index, trees, distance_name); } void annoyIndexValidator(const IndexDescription & index, bool /* attach */) @@ -345,12 +347,11 @@ void annoyIndexValidator(const IndexDescription & index, bool /* attach */) if (index.arguments.size() > 2) throw Exception(ErrorCodes::INCORRECT_QUERY, "Annoy index must not have more than two parameters"); - if (!index.arguments.empty() && index.arguments[0].getType() != Field::Types::UInt64 - && index.arguments[0].getType() != Field::Types::String) - throw Exception(ErrorCodes::INCORRECT_QUERY, "Annoy index first argument must be UInt64 or String"); + if (!index.arguments.empty() && index.arguments[0].getType() != Field::Types::String) + throw Exception(ErrorCodes::INCORRECT_QUERY, "Distance function argument of Annoy index must be of type String"); - if (index.arguments.size() > 1 && index.arguments[1].getType() != Field::Types::String) - throw Exception(ErrorCodes::INCORRECT_QUERY, "Annoy index second argument must be String"); + if (index.arguments.size() > 1 && index.arguments[1].getType() != Field::Types::UInt64) + throw Exception(ErrorCodes::INCORRECT_QUERY, "Number of trees argument of Annoy index must be UInt64"); /// Check that the index is created on a single column diff --git a/tests/queries/0_stateless/02354_annoy.sh b/tests/queries/0_stateless/02354_annoy.sh index 87258debf0f..1031ea81946 100755 --- a/tests/queries/0_stateless/02354_annoy.sh +++ b/tests/queries/0_stateless/02354_annoy.sh @@ -91,7 +91,7 @@ CREATE TABLE 02354_annoy_cosine ( id Int32, embedding Array(Float32), - INDEX annoy_index embedding TYPE annoy(100, 'cosineDistance') GRANULARITY 1 + INDEX annoy_index embedding TYPE annoy('cosineDistance', 100) GRANULARITY 1 ) ENGINE = MergeTree ORDER BY id @@ -120,7 +120,7 @@ CREATE TABLE 02354_annoy_cosine ( id Int32, embedding Array(Float32), - INDEX annoy_index embedding TYPE annoy(100, 'cosineDistance') GRANULARITY 1 + INDEX annoy_index embedding TYPE annoy('cosineDistance', 100) GRANULARITY 1 ) ENGINE = MergeTree ORDER BY id @@ -141,72 +141,3 @@ ORDER BY cosineDistance(embedding, [0.0, 0.0, 10.0]) LIMIT 3; DROP TABLE IF EXISTS 02354_annoy_cosine; " | grep "annoy_index" - -# # Check that weird base columns are rejected -$CLICKHOUSE_CLIENT -nm --allow_experimental_annoy_index=1 -q " -DROP TABLE IF EXISTS 02354_annoy; - --- Index spans >1 column - -CREATE TABLE 02354_annoy -( - id Int32, - embedding Array(Float32), - INDEX annoy_index (embedding, id) TYPE annoy(100) GRANULARITY 1 -) -ENGINE = MergeTree -ORDER BY id -SETTINGS index_granularity=5, index_granularity_bytes = '10Mi'; -- {serverError 7 } - --- Index must be created on Array(Float32) or Tuple(Float32) - -CREATE TABLE 02354_annoy -( - id Int32, - embedding Float32, - INDEX annoy_index embedding TYPE annoy(100) GRANULARITY 1 -) -ENGINE = MergeTree -ORDER BY id -SETTINGS index_granularity=5, index_granularity_bytes = '10Mi'; -- {serverError 44 } - - -CREATE TABLE 02354_annoy -( - id Int32, - embedding Array(Float64), - INDEX annoy_index embedding TYPE annoy(100) GRANULARITY 1 -) -ENGINE = MergeTree -ORDER BY id -SETTINGS index_granularity=5, index_granularity_bytes = '10Mi'; -- {serverError 44 } - -CREATE TABLE 02354_annoy -( - id Int32, - embedding Tuple(Float32, Float64), - INDEX annoy_index embedding TYPE annoy(100) GRANULARITY 1 -) -ENGINE = MergeTree -ORDER BY id -SETTINGS index_granularity=5, index_granularity_bytes = '10Mi'; -- {serverError 44 } - -CREATE TABLE 02354_annoy -( - id Int32, - embedding Array(LowCardinality(Float32)), - INDEX annoy_index embedding TYPE annoy(100) GRANULARITY 1 -) -ENGINE = MergeTree -ORDER BY id -SETTINGS index_granularity=5, index_granularity_bytes = '10Mi'; -- {serverError 44 } - -CREATE TABLE 02354_annoy -( - id Int32, - embedding Array(Nullable(Float32)), - INDEX annoy_index embedding TYPE annoy(100) GRANULARITY 1 -) -ENGINE = MergeTree -ORDER BY id -SETTINGS index_granularity=5, index_granularity_bytes = '10Mi'; -- {serverError 44 }" diff --git a/tests/queries/0_stateless/02354_annoy_index.reference b/tests/queries/0_stateless/02354_annoy_index.reference new file mode 100644 index 00000000000..2d162500f67 --- /dev/null +++ b/tests/queries/0_stateless/02354_annoy_index.reference @@ -0,0 +1 @@ +Negative tests diff --git a/tests/queries/0_stateless/02354_annoy_index.sql b/tests/queries/0_stateless/02354_annoy_index.sql new file mode 100644 index 00000000000..8df9af1ee73 --- /dev/null +++ b/tests/queries/0_stateless/02354_annoy_index.sql @@ -0,0 +1,26 @@ +-- Tags: no-fasttest, no-ubsan, no-cpu-aarch64, no-upgrade-check + +SET allow_experimental_annoy_index = 1; + +DROP TABLE IF EXISTS tab; + +SELECT 'Negative tests'; + +-- must have at most 2 arguments +CREATE TABLE tab(id Int32, embedding Array(Float32), INDEX annoy_index embedding TYPE annoy('too', 'many', 'arguments')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } + +-- first argument must be UInt64 +CREATE TABLE tab(id Int32, embedding Array(Float32), INDEX annoy_index embedding TYPE annoy(3)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } + +-- 2nd argument must be String +CREATE TABLE tab(id Int32, embedding Array(Float32), INDEX annoy_index embedding TYPE annoy('L2Distance', 'not an UInt64')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } + +-- must be created on single column +CREATE TABLE tab(id Int32, embedding Array(Float32), INDEX annoy_index (embedding, id) TYPE annoy()) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_NUMBER_OF_COLUMNS } + +-- must be created on Array/Tuple(Float32) columns +SET allow_suspicious_low_cardinality_types = 1; +CREATE TABLE tab(id Int32, embedding Float32, INDEX annoy_index embedding TYPE annoy()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN } +CREATE TABLE tab(id Int32, embedding Array(Float64), INDEX annoy_index embedding TYPE annoy()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN } +CREATE TABLE tab(id Int32, embedding LowCardinality(Float32), INDEX annoy_index embedding TYPE annoy()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN } +CREATE TABLE tab(id Int32, embedding Nullable(Float32), INDEX annoy_index embedding TYPE annoy()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN } From 03b6856556c67fe1c0f7c1df0b28b19556fd3fcc Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 25 May 2023 20:39:57 +0000 Subject: [PATCH 0432/1072] Cosmetics: distance_name --> distance_function --- .../MergeTree/MergeTreeIndexAnnoy.cpp | 41 +++++++++---------- src/Storages/MergeTree/MergeTreeIndexAnnoy.h | 8 ++-- 2 files changed, 24 insertions(+), 25 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp index dc353c97143..a9679453655 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp @@ -205,31 +205,31 @@ void MergeTreeIndexAggregatorAnnoy::update(const Block & block, size_t MergeTreeIndexConditionAnnoy::MergeTreeIndexConditionAnnoy( const IndexDescription & /*index*/, const SelectQueryInfo & query, - const String& distance_name_, + const String & distance_function_, ContextPtr context) : condition(query, context) - , distance_name(distance_name_) + , distance_function(distance_function_) {} -bool MergeTreeIndexConditionAnnoy::mayBeTrueOnGranule(MergeTreeIndexGranulePtr /* idx_granule */) const +bool MergeTreeIndexConditionAnnoy::mayBeTrueOnGranule(MergeTreeIndexGranulePtr /*idx_granule*/) const { throw Exception(ErrorCodes::LOGICAL_ERROR, "mayBeTrueOnGranule is not supported for ANN skip indexes"); } bool MergeTreeIndexConditionAnnoy::alwaysUnknownOrTrue() const { - return condition.alwaysUnknownOrTrue(distance_name); + return condition.alwaysUnknownOrTrue(distance_function); } std::vector MergeTreeIndexConditionAnnoy::getUsefulRanges(MergeTreeIndexGranulePtr idx_granule) const { - if (distance_name == "L2Distance") + if (distance_function == "L2Distance") return getUsefulRangesImpl(idx_granule); - else if (distance_name == "cosineDistance") + else if (distance_function == "cosineDistance") return getUsefulRangesImpl(idx_granule); else - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown distance name. Must be 'L2Distance' or 'cosineDistance'. Got {}", distance_name); + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown distance name. Must be 'L2Distance' or 'cosineDistance'. Got {}", distance_function); } @@ -295,49 +295,48 @@ std::vector MergeTreeIndexConditionAnnoy::getUsefulRangesImpl(MergeTreeI return result_vector; } -MergeTreeIndexAnnoy::MergeTreeIndexAnnoy(const IndexDescription & index_, uint64_t trees_, const String& distance_name_) +MergeTreeIndexAnnoy::MergeTreeIndexAnnoy(const IndexDescription & index_, uint64_t trees_, const String & distance_function_) : IMergeTreeIndex(index_) , trees(trees_) - , distance_name(distance_name_) + , distance_function(distance_function_) {} MergeTreeIndexGranulePtr MergeTreeIndexAnnoy::createIndexGranule() const { - if (distance_name == "L2Distance") + if (distance_function == "L2Distance") return std::make_shared>(index.name, index.sample_block); - else if (distance_name == "cosineDistance") + else if (distance_function == "cosineDistance") return std::make_shared>(index.name, index.sample_block); - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown distance name. Must be 'L2Distance' or 'cosineDistance'. Got {}", distance_name); + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown distance name. Must be 'L2Distance' or 'cosineDistance'. Got {}", distance_function); } MergeTreeIndexAggregatorPtr MergeTreeIndexAnnoy::createIndexAggregator() const { /// TODO: Support more metrics. Available metrics: https://github.com/spotify/annoy/blob/master/src/annoymodule.cc#L151-L171 - if (distance_name == "L2Distance") + if (distance_function == "L2Distance") return std::make_shared>(index.name, index.sample_block, trees); - if (distance_name == "cosineDistance") + if (distance_function == "cosineDistance") return std::make_shared>(index.name, index.sample_block, trees); - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown distance name. Must be 'L2Distance' or 'cosineDistance'. Got {}", distance_name); + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown distance name. Must be 'L2Distance' or 'cosineDistance'. Got {}", distance_function); } -MergeTreeIndexConditionPtr MergeTreeIndexAnnoy::createIndexCondition( - const SelectQueryInfo & query, ContextPtr context) const +MergeTreeIndexConditionPtr MergeTreeIndexAnnoy::createIndexCondition(const SelectQueryInfo & query, ContextPtr context) const { - return std::make_shared(index, query, distance_name, context); + return std::make_shared(index, query, distance_function, context); }; MergeTreeIndexPtr annoyIndexCreator(const IndexDescription & index) { uint64_t trees = 100; - String distance_name = "L2Distance"; + String distance_function = "L2Distance"; if (!index.arguments.empty()) - distance_name = index.arguments[0].get(); + distance_function = index.arguments[0].get(); if (index.arguments.size() > 1) trees = index.arguments[1].get(); - return std::make_shared(index, trees, distance_name); + return std::make_shared(index, trees, distance_function); } void annoyIndexValidator(const IndexDescription & index, bool /* attach */) diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.h b/src/Storages/MergeTree/MergeTreeIndexAnnoy.h index cde61af2891..9741412e3fa 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.h +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.h @@ -66,7 +66,7 @@ public: MergeTreeIndexConditionAnnoy( const IndexDescription & index, const SelectQueryInfo & query, - const String& distance_name, + const String& distance_function, ContextPtr context); bool alwaysUnknownOrTrue() const override; @@ -82,7 +82,7 @@ private: std::vector getUsefulRangesImpl(MergeTreeIndexGranulePtr idx_granule) const; const ApproximateNearestNeighbour::ANNCondition condition; - const String distance_name; + const String distance_function; }; @@ -90,7 +90,7 @@ class MergeTreeIndexAnnoy : public IMergeTreeIndex { public: - MergeTreeIndexAnnoy(const IndexDescription & index_, uint64_t trees_, const String& distance_name_); + MergeTreeIndexAnnoy(const IndexDescription & index_, uint64_t trees_, const String & distance_function_); ~MergeTreeIndexAnnoy() override = default; @@ -103,7 +103,7 @@ public: private: const uint64_t trees; - const String distance_name; + const String distance_function; }; From 18304f5aeff627b47965ffd5c07ae6c5d61bface Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 25 May 2023 20:51:21 +0000 Subject: [PATCH 0433/1072] Check distance function in CREATE TABLE instead of first INSERT --- src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp | 13 +++++++++++-- tests/queries/0_stateless/02354_annoy_index.sql | 3 +++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp index a9679453655..12446623c30 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp @@ -307,7 +307,7 @@ MergeTreeIndexGranulePtr MergeTreeIndexAnnoy::createIndexGranule() const return std::make_shared>(index.name, index.sample_block); else if (distance_function == "cosineDistance") return std::make_shared>(index.name, index.sample_block); - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown distance name. Must be 'L2Distance' or 'cosineDistance'. Got {}", distance_function); + std::unreachable(); } MergeTreeIndexAggregatorPtr MergeTreeIndexAnnoy::createIndexAggregator() const @@ -317,7 +317,7 @@ MergeTreeIndexAggregatorPtr MergeTreeIndexAnnoy::createIndexAggregator() const return std::make_shared>(index.name, index.sample_block, trees); if (distance_function == "cosineDistance") return std::make_shared>(index.name, index.sample_block, trees); - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown distance name. Must be 'L2Distance' or 'cosineDistance'. Got {}", distance_function); + std::unreachable(); } MergeTreeIndexConditionPtr MergeTreeIndexAnnoy::createIndexCondition(const SelectQueryInfo & query, ContextPtr context) const @@ -357,6 +357,15 @@ void annoyIndexValidator(const IndexDescription & index, bool /* attach */) if (index.column_names.size() != 1 || index.data_types.size() != 1) throw Exception(ErrorCodes::INCORRECT_NUMBER_OF_COLUMNS, "Annoy indexes must be created on a single column"); + /// Check that a supported metric was passed as first argument + + if (!index.arguments.empty()) + { + String distance_name = index.arguments[0].get(); + if (distance_name != "L2Distance" && distance_name != "cosineDistance") + throw Exception(ErrorCodes::INCORRECT_DATA, "Annoy index supports only distance functions 'L2Distance' and 'cosineDistance'. Given distance function: {}", distance_name); + } + /// Check data type of indexed column: auto throw_unsupported_underlying_column_exception = [](DataTypePtr data_type) diff --git a/tests/queries/0_stateless/02354_annoy_index.sql b/tests/queries/0_stateless/02354_annoy_index.sql index 8df9af1ee73..3a5fb6817ff 100644 --- a/tests/queries/0_stateless/02354_annoy_index.sql +++ b/tests/queries/0_stateless/02354_annoy_index.sql @@ -24,3 +24,6 @@ CREATE TABLE tab(id Int32, embedding Float32, INDEX annoy_index embedding TYPE a CREATE TABLE tab(id Int32, embedding Array(Float64), INDEX annoy_index embedding TYPE annoy()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN } CREATE TABLE tab(id Int32, embedding LowCardinality(Float32), INDEX annoy_index embedding TYPE annoy()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN } CREATE TABLE tab(id Int32, embedding Nullable(Float32), INDEX annoy_index embedding TYPE annoy()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN } + +-- reject unsupported distance functions +CREATE TABLE tab(id Int32, embedding Array(Float32), INDEX annoy_index embedding TYPE annoy('wormholeDistance')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_DATA } From d3158a28770339f36f9c0e69acb6d271ced351f6 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 25 May 2023 20:56:27 +0000 Subject: [PATCH 0434/1072] Cosmetics: Consolidate parameters --- src/Core/Settings.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 6a0833aef60..8055e9b4880 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -719,7 +719,6 @@ class IColumn; \ M(Bool, parallelize_output_from_storages, true, "Parallelize output for reading step from storage. It allows parallelizing query processing right after reading from storage if possible", 0) \ M(String, insert_deduplication_token, "", "If not empty, used for duplicate detection instead of data digest", 0) \ - M(String, ann_index_select_query_params, "", "Parameters passed to ANN indexes in SELECT queries, the format is 'param1=x, param2=y, ...'", 0) \ M(Bool, count_distinct_optimization, false, "Rewrite count distinct to subquery of group by", 0) \ M(Bool, throw_if_no_data_to_insert, true, "Enables or disables empty INSERTs, enabled by default", 0) \ M(Bool, compatibility_ignore_auto_increment_in_create_table, false, "Ignore AUTO_INCREMENT keyword in column declaration if true, otherwise return error. It simplifies migration from MySQL", 0) \ @@ -742,7 +741,8 @@ class IColumn; M(Bool, allow_experimental_hash_functions, false, "Enable experimental hash functions (hashid, etc)", 0) \ M(Bool, allow_experimental_object_type, false, "Allow Object and JSON data types", 0) \ M(Bool, allow_experimental_annoy_index, false, "Allows to use Annoy index. Disabled by default because this feature is experimental", 0) \ - M(UInt64, max_limit_for_ann_queries, 1000000, "Maximum limit value for using ANN indexes is used to prevent memory overflow in search queries for indexes", 0) \ + M(UInt64, max_limit_for_ann_queries, 1'000'000, "SELECT queries with LIMIT bigger than this setting cannot use ANN indexes. Helps to prevent memory overflows in ANN search indexs.", 0) \ + M(String, ann_index_select_query_params, "", "Parameters passed to ANN indexes in SELECT queries, the format is 'param1=x, param2=y, ...'", 0) \ M(Bool, throw_on_unsupported_query_inside_transaction, true, "Throw exception if unsupported query is used inside transaction", 0) \ M(TransactionsWaitCSNMode, wait_changes_become_visible_after_commit_mode, TransactionsWaitCSNMode::WAIT_UNKNOWN, "Wait for committed changes to become actually visible in the latest snapshot", 0) \ M(Bool, implicit_transaction, false, "If enabled and not already inside a transaction, wraps the query inside a full transaction (begin + commit or rollback)", 0) \ From a973ac5dbb99e7fc624742b34fc507935cd792e4 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 25 May 2023 21:03:50 +0000 Subject: [PATCH 0435/1072] Replace weird generic ANN setting by Annoy-specific parameter --- .../mergetree-family/annindexes.md | 3 ++- src/Core/Settings.h | 2 +- src/Storages/MergeTree/CommonANNIndexes.cpp | 1 - src/Storages/MergeTree/CommonANNIndexes.h | 7 ------ .../MergeTree/MergeTreeIndexAnnoy.cpp | 18 +++----------- src/Storages/MergeTree/MergeTreeIndexAnnoy.h | 5 ++-- .../0_stateless/02354_annoy_index.reference | 12 ++++++++++ .../queries/0_stateless/02354_annoy_index.sql | 24 +++++++++++++++++++ 8 files changed, 45 insertions(+), 27 deletions(-) diff --git a/docs/en/engines/table-engines/mergetree-family/annindexes.md b/docs/en/engines/table-engines/mergetree-family/annindexes.md index 03617a1a709..9b4de150235 100644 --- a/docs/en/engines/table-engines/mergetree-family/annindexes.md +++ b/docs/en/engines/table-engines/mergetree-family/annindexes.md @@ -135,7 +135,8 @@ ORDER BY id; Annoy supports `L2Distance` and `cosineDistance`. -In the `SELECT` in the settings (`ann_index_select_query_params`) you can specify the size of the internal buffer (more details in the description above or in the [original repository](https://github.com/spotify/annoy)). During the query it will inspect up to `search_k` nodes which defaults to `n_trees * n` if not provided. `search_k` gives you a run-time trade-off between better accuracy and speed. +Setting `search_k` (default `LIMIT * NumTrees`) determines how many nodes the Annoy index will inspect during SELECT queries. The setting +can be used to balance performance and accuracy at runtime. __Example__: ``` sql diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 8055e9b4880..3e10f48a2fb 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -742,7 +742,7 @@ class IColumn; M(Bool, allow_experimental_object_type, false, "Allow Object and JSON data types", 0) \ M(Bool, allow_experimental_annoy_index, false, "Allows to use Annoy index. Disabled by default because this feature is experimental", 0) \ M(UInt64, max_limit_for_ann_queries, 1'000'000, "SELECT queries with LIMIT bigger than this setting cannot use ANN indexes. Helps to prevent memory overflows in ANN search indexs.", 0) \ - M(String, ann_index_select_query_params, "", "Parameters passed to ANN indexes in SELECT queries, the format is 'param1=x, param2=y, ...'", 0) \ + M(Int64, annoy_index_search_k_nodes, -1, "SELECT queries search up to this many nodes in Annoy indexes.", 0) \ M(Bool, throw_on_unsupported_query_inside_transaction, true, "Throw exception if unsupported query is used inside transaction", 0) \ M(TransactionsWaitCSNMode, wait_changes_become_visible_after_commit_mode, TransactionsWaitCSNMode::WAIT_UNKNOWN, "Wait for committed changes to become actually visible in the latest snapshot", 0) \ M(Bool, implicit_transaction, false, "If enabled and not already inside a transaction, wraps the query inside a full transaction (begin + commit or rollback)", 0) \ diff --git a/src/Storages/MergeTree/CommonANNIndexes.cpp b/src/Storages/MergeTree/CommonANNIndexes.cpp index f0c6f256f73..4748c869f83 100644 --- a/src/Storages/MergeTree/CommonANNIndexes.cpp +++ b/src/Storages/MergeTree/CommonANNIndexes.cpp @@ -60,7 +60,6 @@ namespace ApproximateNearestNeighbour ANNCondition::ANNCondition(const SelectQueryInfo & query_info, ContextPtr context) : block_with_constants{KeyCondition::getBlockWithConstants(query_info.query, query_info.syntax_analyzer_result, context)}, - ann_index_select_query_params{context->getSettings().get("ann_index_select_query_params").get()}, index_granularity{context->getMergeTreeSettings().get("index_granularity").get()}, limit_restriction{context->getSettings().get("max_limit_for_ann_queries").get()}, index_is_useful{checkQueryStructure(query_info)} {} diff --git a/src/Storages/MergeTree/CommonANNIndexes.h b/src/Storages/MergeTree/CommonANNIndexes.h index fefb9584863..4253bce703a 100644 --- a/src/Storages/MergeTree/CommonANNIndexes.h +++ b/src/Storages/MergeTree/CommonANNIndexes.h @@ -82,8 +82,6 @@ struct ANNQueryInformation * spaceDimension(which is targetVector's components count) * column * objects count from LIMIT clause(for both queries) - * settings str, if query has settings section with new 'ann_index_select_query_params' value, - than you can get the new value(empty by default) calling method getSettingsStr * queryHasOrderByClause and queryHasWhereClause return true if query matches the type Search query type is also recognized for PREWHERE clause @@ -121,11 +119,7 @@ public: // length's value from LIMIT clause UInt64 getLimit() const; - // value of 'ann_index_select_query_params' if have in SETTINGS clause, empty string otherwise - String getParamsStr() const { return ann_index_select_query_params; } - private: - struct RPNElement { enum Function @@ -217,7 +211,6 @@ private: std::optional query_information; // Get from settings ANNIndex parameters - String ann_index_select_query_params; UInt64 index_granularity; /// only queries with a lower limit can be considered to avoid memory overflow UInt64 limit_restriction; diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp index 12446623c30..6ffb7aecb7f 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -209,6 +210,7 @@ MergeTreeIndexConditionAnnoy::MergeTreeIndexConditionAnnoy( ContextPtr context) : condition(query, context) , distance_function(distance_function_) + , search_k(context->getSettings().get("annoy_index_search_k_nodes").get()) {} @@ -264,21 +266,7 @@ std::vector MergeTreeIndexConditionAnnoy::getUsefulRangesImpl(MergeTreeI neighbors.reserve(limit); distances.reserve(limit); - int k_search = -1; - String params_str = condition.getParamsStr(); - if (!params_str.empty()) - { - try - { - /// k_search=... (algorithm will inspect up to search_k nodes which defaults to n_trees * n if not provided) - k_search = std::stoi(params_str.data() + 9); - } - catch (...) - { - throw Exception(ErrorCodes::INCORRECT_QUERY, "Setting of the annoy index should be int"); - } - } - annoy->get_nns_by_vector(target_vec.data(), limit, k_search, &neighbors, &distances); + annoy->get_nns_by_vector(target_vec.data(), limit, static_cast(search_k), &neighbors, &distances); std::unordered_set granule_numbers; for (size_t i = 0; i < neighbors.size(); ++i) { diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.h b/src/Storages/MergeTree/MergeTreeIndexAnnoy.h index 9741412e3fa..fbc6b21fa6b 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.h +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.h @@ -69,20 +69,21 @@ public: const String& distance_function, ContextPtr context); + ~MergeTreeIndexConditionAnnoy() override = default; + bool alwaysUnknownOrTrue() const override; bool mayBeTrueOnGranule(MergeTreeIndexGranulePtr idx_granule) const override; std::vector getUsefulRanges(MergeTreeIndexGranulePtr idx_granule) const override; - ~MergeTreeIndexConditionAnnoy() override = default; - private: template std::vector getUsefulRangesImpl(MergeTreeIndexGranulePtr idx_granule) const; const ApproximateNearestNeighbour::ANNCondition condition; const String distance_function; + const Int64 search_k; }; diff --git a/tests/queries/0_stateless/02354_annoy_index.reference b/tests/queries/0_stateless/02354_annoy_index.reference index 2d162500f67..7da442cb905 100644 --- a/tests/queries/0_stateless/02354_annoy_index.reference +++ b/tests/queries/0_stateless/02354_annoy_index.reference @@ -1 +1,13 @@ +parameter annoy_index_search_k_nodes +parameter max_limit_for_ann_queries +Expression (Projection) + Limit (preliminary LIMIT (without OFFSET)) + Sorting (Sorting for ORDER BY) + Expression (Before ORDER BY) + ReadFromMergeTree (default.tab) + Indexes: + PrimaryKey + Condition: true + Parts: 1/1 + Granules: 1/1 Negative tests diff --git a/tests/queries/0_stateless/02354_annoy_index.sql b/tests/queries/0_stateless/02354_annoy_index.sql index 3a5fb6817ff..3590b7d316e 100644 --- a/tests/queries/0_stateless/02354_annoy_index.sql +++ b/tests/queries/0_stateless/02354_annoy_index.sql @@ -4,6 +4,30 @@ SET allow_experimental_annoy_index = 1; DROP TABLE IF EXISTS tab; +DROP TABLE IF EXISTS tab; +CREATE TABLE tab(id Int32, embedding Array(Float32), INDEX annoy_index embedding TYPE annoy()) ENGINE = MergeTree ORDER BY id; +-- SETTINGS index_granularity=5, index_granularity_bytes = '10Mi'; +INSERT INTO tab VALUES (1, [0.0, 0.0, 10.0]), (2, [0.0, 0.0, 10.5]), (3, [0.0, 0.0, 9.5]), (4, [0.0, 0.0, 9.7]), (5, [0.0, 0.0, 10.2]), (6, [10.0, 0.0, 0.0]), (7, [9.5, 0.0, 0.0]), (8, [9.7, 0.0, 0.0]), (9, [10.2, 0.0, 0.0]), (10, [10.5, 0.0, 0.0]), (11, [0.0, 10.0, 0.0]), (12, [0.0, 9.5, 0.0]), (13, [0.0, 9.7, 0.0]), (14, [0.0, 10.2, 0.0]), (15, [0.0, 10.5, 0.0]); + +SELECT 'parameter annoy_index_search_k_nodes'; +SELECT * +FROM tab +ORDER BY L2Distance(embedding, [5.3, 7.3, 2.1]) +LIMIT 5 +SETTINGS annoy_index_search_k_nodes=0; -- searches zero nodes --> no results + +SELECT 'parameter max_limit_for_ann_queries'; +EXPLAIN indexes=1 +SELECT * +FROM tab +ORDER BY L2Distance(embedding, [5.3, 7.3, 2.1]) +LIMIT 5 +SETTINGS max_limit_for_ann_queries=2; -- doesn't use the ann index + +DROP TABLE tab; + +DROP TABLE IF EXISTS tab; + SELECT 'Negative tests'; -- must have at most 2 arguments From 6d3431d2ff0325282ea373bfeafed6a2e1946577 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 25 May 2023 21:12:40 +0000 Subject: [PATCH 0436/1072] Cosmetics: Sort includes --- src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp index 6ffb7aecb7f..fe3ea322b91 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp @@ -2,15 +2,15 @@ #include +#include #include #include -#include -#include -#include -#include -#include #include #include +#include +#include +#include +#include namespace DB From f800940639bbe490e619594db62a83f7f8f2f80d Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 25 May 2023 21:13:33 +0000 Subject: [PATCH 0437/1072] Cosmetics: Shuffle statements --- src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp index 16b27c2c820..aa340d6afc1 100644 --- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp @@ -1686,7 +1686,6 @@ MarkRanges MergeTreeDataSelectExecutor::filterMarksUsingIndex( { if (index_mark != index_range.begin || !granule || last_index_mark != index_range.begin) granule = reader.read(); - const auto * gin_filter_condition = dynamic_cast(&*condition); // Cast to Ann condition auto ann_condition = std::dynamic_pointer_cast(condition); if (ann_condition != nullptr) @@ -1714,6 +1713,7 @@ MarkRanges MergeTreeDataSelectExecutor::filterMarksUsingIndex( } bool result = false; + const auto * gin_filter_condition = dynamic_cast(&*condition); if (!gin_filter_condition) result = condition->mayBeTrueOnGranule(granule); else From 660760782ad6357c34f55113f3a9522e94b30dd3 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 25 May 2023 21:16:03 +0000 Subject: [PATCH 0438/1072] Rewrite ANN docs --- .../mergetree-family/annindexes.md | 140 ++++++++++-------- 1 file changed, 78 insertions(+), 62 deletions(-) diff --git a/docs/en/engines/table-engines/mergetree-family/annindexes.md b/docs/en/engines/table-engines/mergetree-family/annindexes.md index 9b4de150235..0cc1cff2dad 100644 --- a/docs/en/engines/table-engines/mergetree-family/annindexes.md +++ b/docs/en/engines/table-engines/mergetree-family/annindexes.md @@ -1,78 +1,89 @@ # Approximate Nearest Neighbor Search Indexes [experimental] {#table_engines-ANNIndex} -The main task that indexes achieve is to quickly find nearest neighbors for multidimensional data. An example of such a problem can be finding similar pictures (texts) for a given picture (text). That problem can be reduced to finding the nearest [embeddings](https://cloud.google.com/architecture/overview-extracting-and-serving-feature-embeddings-for-machine-learning). They can be created from data using [UDF](/docs/en/sql-reference/functions/index.md/#executable-user-defined-functions). +Nearest neighborhood search refers to the problem of finding the point(s) with the smallest distance to a given point in an n-dimensional +space. Since exact search is in practice usually typically too slow, the task is often solved with approximate algorithms. A popular use +case of of neighbor search is finding similar pictures (texts) for a given picture (text). Pictures (texts) can be decomposed into +[embeddings](https://cloud.google.com/architecture/overview-extracting-and-serving-feature-embeddings-for-machine-learning), and instead of +comparing pictures (texts) pixel-by-pixel (character-by-character), only the embeddings are compared. -The next queries find the closest neighbors in N-dimensional space using the L2 (Euclidean) distance: -``` sql -SELECT * -FROM table_name -WHERE L2Distance(Column, Point) < MaxDistance +In terms of SQL, the problem can be expressed as follows: + +``` sql +SELECT * +FROM table +WHERE L2Distance(column, Point) < MaxDistance LIMIT N ``` -``` sql -SELECT * -FROM table_name -ORDER BY L2Distance(Column, Point) +``` sql +SELECT * +FROM table +ORDER BY L2Distance(column, Point) LIMIT N ``` -But it will take some time for execution because of the long calculation of the distance between `TargetEmbedding` and all other vectors. This is where ANN indexes can help. They store a compact approximation of the search space (e.g. using clustering, search trees, etc.) and are able to compute approximate neighbors quickly. + +The queries are expensive because the L2 distance (Euclidean distance) between all points in `column` and `Point` must be computed. To speed this process up, ANN indexes store a compact representation of the search space (using clustering, search trees, etc.) which allows to compute an approximate answer quickly. ## Indexes Structure -Approximate Nearest Neighbor Search Indexes (`ANNIndexes`) are similar to skip indexes. They are constructed by some granules and determine which of them should be skipped. Compared to skip indices, ANN indices use their results not only to skip some group of granules, but also to select particular granules from a set of granules. +Approximate Nearest Neighbor Search Indexes (or `ANNIndexes`) are similar to skip indexes. They are constructed over granules and determine which granules can be skipped. Compared to skip indices, ANN indices are not only able to skip granules, they can also to select particular granules from a set of granules. -`ANNIndexes` are designed to speed up two types of queries: +`ANNIndexes` support two types of queries: -- ###### Type 1: Where - ``` sql - SELECT * - FROM table_name - WHERE DistanceFunction(Column, Point) < MaxDistance +- WHERE queries: + ``` sql + SELECT * + FROM table + WHERE DistanceFunction(column, Point) < MaxDistance LIMIT N ``` -- ###### Type 2: Order by + +- ORDER BY queries: ``` sql - SELECT * - FROM table_name [WHERE ...] - ORDER BY DistanceFunction(Column, Point) + SELECT * + FROM table [WHERE ...] + ORDER BY DistanceFunction(column, Point) LIMIT N ``` -In these queries, `DistanceFunction` is selected from [distance functions](/docs/en/sql-reference/functions/distance-functions.md). `Point` is a known vector (something like `(0.1, 0.1, ... )`). To avoid writing large vectors, use [client parameters](/docs/en//interfaces/cli.md#queries-with-parameters-cli-queries-with-parameters). `Value` - a float value that will bound the neighbourhood. +`DistanceFunction` is a [distance functions](/docs/en/sql-reference/functions/distance-functions.md), `Point` is a given vector (e.g. `(0.17, 0.33, ...)`) and `MaxDistance` is a float value which restricts the size of the neighbourhood. -:::note -ANN index can't speed up query that satisfies both types (`where + order by`, only one of them). All queries must have the limit, as algorithms are used to find nearest neighbors and need a specific number of them. -::: +To avoid writing large vectors, you can also use [query parameters](/docs/en//interfaces/cli.md#queries-with-parameters-cli-queries-with-parameters), e.g. -:::note -Indexes are applied only to queries with a limit less than the `max_limit_for_ann_queries` setting. This helps to avoid memory overflows in queries with a large limit. `max_limit_for_ann_queries` setting can be changed if you know you can provide enough memory. The default value is `1000000`. -::: +```bash +clickhouse-client --param_vec='hello' --query="SELECT * FROM table WHERE L2Distance(embedding, {vec: Array(Float32)}) < 1.0" +``` -Both types of queries are handled the same way. The indexes get `n` neighbors (where `n` is taken from the `LIMIT` clause) and work with them. In `ORDER BY` query they remember the numbers of all parts of the granule that have at least one of neighbor. In `WHERE` query they remember only those parts that satisfy the requirements. +ANN index cannot speed up query that contain both `WHERE` and `ORDER BY`. Queries must have a limit, as the approximate algorithms used to determine the nearest neighbors require a specific number of them. + +Indexes are only used for queries with a `LIMIT` value smaller than setting `max_limit_for_ann_queries` (default: 1 million rows). This helps to prevent memory overflows in queries with a large limit. + +Both types of queries are processed similarly. The indexes are passed the number of neighbors `N`. In `ORDER BY` query they remember the numbers of all parts of the granule that have at least one of neighbor. In `WHERE` query they remember only those parts that satisfy the requirements. -## Create table with ANNIndex +## Creating Tables with an ANN Index -This feature is disabled by default. To enable it, set `allow_experimental_annoy_index` to 1. Also, this feature is disabled on ARM, due to likely problems with the algorithm. +As long as ANN indexes are experimental, you first need to `SET allow_experimental_annoy_index = 1`. + +Syntax: ```sql -CREATE TABLE t +CREATE TABLE table ( `id` Int64, - `data` Tuple(Float32, Float32, Float32), - INDEX ann_index_name data TYPE ann_index_type(ann_index_parameters) GRANULARITY N + `embedding` Tuple(Float32, Float32, Float32), + INDEX embedding TYPE () GRANULARITY N ) ENGINE = MergeTree ORDER BY id; ``` ```sql -CREATE TABLE t +CREATE TABLE table ( `id` Int64, - `data` Array(Float32), - INDEX ann_index_name data TYPE ann_index_type(ann_index_parameters) GRANULARITY N + `embedding` Array(Float32), + INDEX embedding TYPE () GRANULARITY N ) ENGINE = MergeTree ORDER BY id; @@ -80,69 +91,74 @@ ORDER BY id; With greater `GRANULARITY` indexes remember the data structure better. The `GRANULARITY` indicates how many granules will be used to construct the index. The more data is provided for the index, the more of it can be handled by one index and the more chances that with the right hyper parameters the index will remember the data structure better. But some indexes can't be built if they don't have enough data, so this granule will always participate in the query. For more information, see the description of indexes. -As the indexes are built only during insertions into table, `INSERT` and `OPTIMIZE` queries are slower than for ordinary table. At this stage indexes remember all the information about the given data. ANNIndexes should be used if you have immutable or rarely changed data and many read requests. - -You can create your table with index which uses certain algorithm. Now only indices based on the following algorithms are supported: +Note that ANN indexes are built during column insertion and merge, i.e. `INSERT` and `OPTIMIZE` statements are slower than for ordinary tables. ANNIndexes are ideally used only with immutable or rarely changing data in conjunction with many read requests. # Index list + - [Annoy](/docs/en/engines/table-engines/mergetree-family/annindexes.md#annoy-annoy) # Annoy {#annoy} -Implementation of the algorithm was taken from [this repository](https://github.com/spotify/annoy). + +(currently disabled on ARM due to problems with the algorithm) + +This ANN index type implements [Annoy indexes](https://github.com/spotify/annoy). Short description of the algorithm: The algorithm recursively divides in half all space by random linear surfaces (lines in 2D, planes in 3D etc.). Thus it makes tree of polyhedrons and points that they contains. Repeating the operation several times for greater accuracy it creates a forest. To find K Nearest Neighbours it goes down through the trees and fills the buffer of closest points using the priority queue of polyhedrons. Next, it sorts buffer and return the nearest K points. -__Examples__: +Examples: + ```sql -CREATE TABLE t +CREATE TABLE table ( id Int64, - data Tuple(Float32, Float32, Float32), - INDEX ann_index_name data TYPE annoy(NumTrees, DistanceName) GRANULARITY N + embedding Tuple(Float32, Float32, Float32), + INDEX embedding TYPE annoy([DistanceName[, NumTrees]]) GRANULARITY N ) ENGINE = MergeTree ORDER BY id; ``` ```sql -CREATE TABLE t +CREATE TABLE table ( id Int64, - data Array(Float32), - INDEX ann_index_name data TYPE annoy(NumTrees, DistanceName) GRANULARITY N + embedding Array(Float32), + INDEX embedding TYPE annoy([DistanceName[, NumTrees]]) GRANULARITY N ) ENGINE = MergeTree ORDER BY id; ``` :::note -Table with array field will work faster, but all arrays **must** have same length. Use [CONSTRAINT](/docs/en/sql-reference/statements/create/table.md#constraints) to avoid errors. For example, `CONSTRAINT constraint_name_1 CHECK length(data) = 256`. +Indexes over columns of type `Array` will generally work faster than indexes on `Tuple` columns. All arrays **must** have same length. Use [CONSTRAINT](/docs/en/sql-reference/statements/create/table.md#constraints) to avoid errors. For example, `CONSTRAINT constraint_name_1 CHECK length(embedding) = 256`. ::: -Parameter `NumTrees` is the number of trees which the algorithm will create. The bigger it is, the slower (approximately linear) it works (in both `CREATE` and `SELECT` requests), but the better accuracy you get (adjusted for randomness). By default it is set to `100`. Parameter `DistanceName` is name of distance function. By default it is set to `L2Distance`. It can be set without changing first parameter, for example +Parameter `DistanceName` is name of a distance function with default `L2Distance`. Parameter `NumTrees` (default: 100) is the number of trees which the algorithm will create. Higher values of `NumTree` mean slower `CREATE` and `SELECT` statements (approximately linearly), but increase the accuracy of search results. + ```sql -CREATE TABLE t +CREATE TABLE table ( id Int64, - data Array(Float32), - INDEX ann_index_name data TYPE annoy('cosineDistance') GRANULARITY N + embedding Array(Float32), + INDEX ann_index_name embedding TYPE annoy('cosineDistance') GRANULARITY N ) ENGINE = MergeTree ORDER BY id; ``` -Annoy supports `L2Distance` and `cosineDistance`. +Annoy currently supports `L2Distance` and `cosineDistance` as distance functions. -Setting `search_k` (default `LIMIT * NumTrees`) determines how many nodes the Annoy index will inspect during SELECT queries. The setting -can be used to balance performance and accuracy at runtime. +Setting `annoy_index_search_k_nodes` (default: `NumTrees * LIMIT`) determines how many nodes are inspected during SELECTs. It can be used to +balance runtime and accuracy at runtime. + +Example: -__Example__: ``` sql -SELECT * -FROM table_name [WHERE ...] -ORDER BY L2Distance(Column, Point) +SELECT * +FROM table_name [WHERE ...] +ORDER BY L2Distance(column, Point) LIMIT N -SETTING ann_index_select_query_params=`k_search=100` +SETTINGS annoy_index_search_k_nodes=100 ``` From 662b0fb822e6e297c1e490fef8e2c09e22f4af09 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 25 May 2023 21:31:33 +0000 Subject: [PATCH 0439/1072] Cosmetics: Unwrap common ANN code from nested namespace --- src/Storages/MergeTree/CommonANNIndexes.cpp | 75 +++++++++---------- src/Storages/MergeTree/CommonANNIndexes.h | 27 +++---- .../MergeTree/MergeTreeDataSelectExecutor.cpp | 2 +- .../MergeTree/MergeTreeIndexAnnoy.cpp | 3 +- src/Storages/MergeTree/MergeTreeIndexAnnoy.h | 4 +- 5 files changed, 50 insertions(+), 61 deletions(-) diff --git a/src/Storages/MergeTree/CommonANNIndexes.cpp b/src/Storages/MergeTree/CommonANNIndexes.cpp index 4748c869f83..5c42774fb24 100644 --- a/src/Storages/MergeTree/CommonANNIndexes.cpp +++ b/src/Storages/MergeTree/CommonANNIndexes.cpp @@ -24,10 +24,8 @@ namespace ErrorCodes namespace { -namespace ANN = ApproximateNearestNeighbour; - template -void extractTargetVectorFromLiteral(ANN::ANNQueryInformation::Embedding & target, Literal literal) +void extractTargetVectorFromLiteral(ApproximateNearestNeighborInformation::Embedding & target, Literal literal) { Float64 float_element_of_target_vector; Int64 int_element_of_target_vector; @@ -43,28 +41,25 @@ void extractTargetVectorFromLiteral(ANN::ANNQueryInformation::Embedding & target } } -ANN::ANNQueryInformation::Metric castMetricFromStringToType(String metric_name) +ApproximateNearestNeighborInformation::Metric castMetricFromStringToType(String metric_name) { if (metric_name == "L2Distance") - return ANN::ANNQueryInformation::Metric::L2; + return ApproximateNearestNeighborInformation::Metric::L2; if (metric_name == "LpDistance") - return ANN::ANNQueryInformation::Metric::Lp; - return ANN::ANNQueryInformation::Metric::Unknown; + return ApproximateNearestNeighborInformation::Metric::Lp; + return ApproximateNearestNeighborInformation::Metric::Unknown; } } -namespace ApproximateNearestNeighbour -{ - -ANNCondition::ANNCondition(const SelectQueryInfo & query_info, +ApproximateNearestNeighborCondition::ApproximateNearestNeighborCondition(const SelectQueryInfo & query_info, ContextPtr context) : block_with_constants{KeyCondition::getBlockWithConstants(query_info.query, query_info.syntax_analyzer_result, context)}, index_granularity{context->getMergeTreeSettings().get("index_granularity").get()}, limit_restriction{context->getSettings().get("max_limit_for_ann_queries").get()}, index_is_useful{checkQueryStructure(query_info)} {} -bool ANNCondition::alwaysUnknownOrTrue(String metric_name) const +bool ApproximateNearestNeighborCondition::alwaysUnknownOrTrue(String metric_name) const { if (!index_is_useful) return true; // Query isn't supported @@ -72,64 +67,64 @@ bool ANNCondition::alwaysUnknownOrTrue(String metric_name) const return !(castMetricFromStringToType(metric_name) == query_information->metric); } -float ANNCondition::getComparisonDistanceForWhereQuery() const +float ApproximateNearestNeighborCondition::getComparisonDistanceForWhereQuery() const { if (index_is_useful && query_information.has_value() - && query_information->query_type == ANNQueryInformation::Type::Where) + && query_information->query_type == ApproximateNearestNeighborInformation::Type::Where) return query_information->distance; throw Exception(ErrorCodes::LOGICAL_ERROR, "Not supported method for this query type"); } -UInt64 ANNCondition::getLimit() const +UInt64 ApproximateNearestNeighborCondition::getLimit() const { if (index_is_useful && query_information.has_value()) return query_information->limit; throw Exception(ErrorCodes::LOGICAL_ERROR, "No LIMIT section in query, not supported"); } -std::vector ANNCondition::getTargetVector() const +std::vector ApproximateNearestNeighborCondition::getTargetVector() const { if (index_is_useful && query_information.has_value()) return query_information->target; throw Exception(ErrorCodes::LOGICAL_ERROR, "Target vector was requested for useless or uninitialized index."); } -size_t ANNCondition::getNumOfDimensions() const +size_t ApproximateNearestNeighborCondition::getNumOfDimensions() const { if (index_is_useful && query_information.has_value()) return query_information->target.size(); throw Exception(ErrorCodes::LOGICAL_ERROR, "Number of dimensions was requested for useless or uninitialized index."); } -String ANNCondition::getColumnName() const +String ApproximateNearestNeighborCondition::getColumnName() const { if (index_is_useful && query_information.has_value()) return query_information->column_name; throw Exception(ErrorCodes::LOGICAL_ERROR, "Column name was requested for useless or uninitialized index."); } -ANNQueryInformation::Metric ANNCondition::getMetricType() const +ApproximateNearestNeighborInformation::Metric ApproximateNearestNeighborCondition::getMetricType() const { if (index_is_useful && query_information.has_value()) return query_information->metric; throw Exception(ErrorCodes::LOGICAL_ERROR, "Metric name was requested for useless or uninitialized index."); } -float ANNCondition::getPValueForLpDistance() const +float ApproximateNearestNeighborCondition::getPValueForLpDistance() const { if (index_is_useful && query_information.has_value()) return query_information->p_for_lp_dist; throw Exception(ErrorCodes::LOGICAL_ERROR, "P from LPDistance was requested for useless or uninitialized index."); } -ANNQueryInformation::Type ANNCondition::getQueryType() const +ApproximateNearestNeighborInformation::Type ApproximateNearestNeighborCondition::getQueryType() const { if (index_is_useful && query_information.has_value()) return query_information->query_type; throw Exception(ErrorCodes::LOGICAL_ERROR, "Query type was requested for useless or uninitialized index."); } -bool ANNCondition::checkQueryStructure(const SelectQueryInfo & query) +bool ApproximateNearestNeighborCondition::checkQueryStructure(const SelectQueryInfo & query) { // RPN-s for different sections of the query RPN rpn_prewhere_clause; @@ -138,9 +133,9 @@ bool ANNCondition::checkQueryStructure(const SelectQueryInfo & query) RPNElement rpn_limit; UInt64 limit; - ANNQueryInformation prewhere_info; - ANNQueryInformation where_info; - ANNQueryInformation order_by_info; + ApproximateNearestNeighborInformation prewhere_info; + ApproximateNearestNeighborInformation where_info; + ApproximateNearestNeighborInformation order_by_info; // Build rpns for query sections const auto & select = query.query->as(); @@ -195,7 +190,7 @@ bool ANNCondition::checkQueryStructure(const SelectQueryInfo & query) return query_information.has_value(); } -void ANNCondition::traverseAST(const ASTPtr & node, RPN & rpn) +void ApproximateNearestNeighborCondition::traverseAST(const ASTPtr & node, RPN & rpn) { // If the node is ASTFunction, it may have children nodes if (const auto * func = node->as()) @@ -214,7 +209,7 @@ void ANNCondition::traverseAST(const ASTPtr & node, RPN & rpn) rpn.emplace_back(std::move(element)); } -bool ANNCondition::traverseAtomAST(const ASTPtr & node, RPNElement & out) +bool ApproximateNearestNeighborCondition::traverseAtomAST(const ASTPtr & node, RPNElement & out) { // Match Functions if (const auto * function = node->as()) @@ -259,7 +254,7 @@ bool ANNCondition::traverseAtomAST(const ASTPtr & node, RPNElement & out) return tryCastToConstType(node, out); } -bool ANNCondition::tryCastToConstType(const ASTPtr & node, RPNElement & out) +bool ApproximateNearestNeighborCondition::tryCastToConstType(const ASTPtr & node, RPNElement & out) { Field const_value; DataTypePtr const_type; @@ -318,18 +313,18 @@ bool ANNCondition::tryCastToConstType(const ASTPtr & node, RPNElement & out) return false; } -void ANNCondition::traverseOrderByAST(const ASTPtr & node, RPN & rpn) +void ApproximateNearestNeighborCondition::traverseOrderByAST(const ASTPtr & node, RPN & rpn) { if (const auto * expr_list = node->as()) if (const auto * order_by_element = expr_list->children.front()->as()) traverseAST(order_by_element->children.front(), rpn); } -// Returns true and stores ANNQueryInformation if the query has valid WHERE clause -bool ANNCondition::matchRPNWhere(RPN & rpn, ANNQueryInformation & expr) +// Returns true and stores ApproximateNearestNeighborInformation if the query has valid WHERE clause +bool ApproximateNearestNeighborCondition::matchRPNWhere(RPN & rpn, ApproximateNearestNeighborInformation & expr) { /// Fill query type field - expr.query_type = ANNQueryInformation::Type::Where; + expr.query_type = ApproximateNearestNeighborInformation::Type::Where; // WHERE section must have at least 5 expressions // Operator->Distance(float)->DistanceFunc->Column->Tuple(Array)Func(TargetVector(floats)) @@ -381,10 +376,10 @@ bool ANNCondition::matchRPNWhere(RPN & rpn, ANNQueryInformation & expr) } // Returns true and stores ANNExpr if the query has valid ORDERBY clause -bool ANNCondition::matchRPNOrderBy(RPN & rpn, ANNQueryInformation & expr) +bool ApproximateNearestNeighborCondition::matchRPNOrderBy(RPN & rpn, ApproximateNearestNeighborInformation & expr) { /// Fill query type field - expr.query_type = ANNQueryInformation::Type::OrderBy; + expr.query_type = ApproximateNearestNeighborInformation::Type::OrderBy; // ORDER BY clause must have at least 3 expressions if (rpn.size() < 3) @@ -393,11 +388,11 @@ bool ANNCondition::matchRPNOrderBy(RPN & rpn, ANNQueryInformation & expr) auto iter = rpn.begin(); auto end = rpn.end(); - return ANNCondition::matchMainParts(iter, end, expr); + return ApproximateNearestNeighborCondition::matchMainParts(iter, end, expr); } // Returns true and stores Length if we have valid LIMIT clause in query -bool ANNCondition::matchRPNLimit(RPNElement & rpn, UInt64 & limit) +bool ApproximateNearestNeighborCondition::matchRPNLimit(RPNElement & rpn, UInt64 & limit) { if (rpn.function == RPNElement::FUNCTION_INT_LITERAL) { @@ -409,7 +404,7 @@ bool ANNCondition::matchRPNLimit(RPNElement & rpn, UInt64 & limit) } /* Matches dist function, target vector, column name */ -bool ANNCondition::matchMainParts(RPN::iterator & iter, const RPN::iterator & end, ANNQueryInformation & expr) +bool ApproximateNearestNeighborCondition::matchMainParts(RPN::iterator & iter, const RPN::iterator & end, ApproximateNearestNeighborInformation & expr) { bool identifier_found = false; @@ -420,7 +415,7 @@ bool ANNCondition::matchMainParts(RPN::iterator & iter, const RPN::iterator & en expr.metric = castMetricFromStringToType(iter->func_name); ++iter; - if (expr.metric == ANN::ANNQueryInformation::Metric::Lp) + if (expr.metric == ApproximateNearestNeighborInformation::Metric::Lp) { if (iter->function != RPNElement::FUNCTION_FLOAT_LITERAL && iter->function != RPNElement::FUNCTION_INT_LITERAL) @@ -497,7 +492,7 @@ bool ANNCondition::matchMainParts(RPN::iterator & iter, const RPN::iterator & en } // Gets float or int from AST node -float ANNCondition::getFloatOrIntLiteralOrPanic(const RPN::iterator& iter) +float ApproximateNearestNeighborCondition::getFloatOrIntLiteralOrPanic(const RPN::iterator& iter) { if (iter->float_literal.has_value()) return iter->float_literal.value(); @@ -507,5 +502,3 @@ float ANNCondition::getFloatOrIntLiteralOrPanic(const RPN::iterator& iter) } } - -} diff --git a/src/Storages/MergeTree/CommonANNIndexes.h b/src/Storages/MergeTree/CommonANNIndexes.h index 4253bce703a..37695586515 100644 --- a/src/Storages/MergeTree/CommonANNIndexes.h +++ b/src/Storages/MergeTree/CommonANNIndexes.h @@ -9,9 +9,6 @@ namespace DB { -namespace ApproximateNearestNeighbour -{ - /** * Queries for Approximate Nearest Neighbour Search * have similar structure: @@ -25,7 +22,7 @@ namespace ApproximateNearestNeighbour * 1) p for LpDistance function * 2) distance to compare with (only for where queries) */ -struct ANNQueryInformation +struct ApproximateNearestNeighborInformation { using Embedding = std::vector; @@ -51,7 +48,7 @@ struct ANNQueryInformation }; /** - Class ANNCondition, is responsible for recognizing special query types which + Class ApproximateNearestNeighborCondition, is responsible for recognizing special query types which can be speeded up by ANN Indexes. It parses the SQL query and checks if it matches ANNIndexes. The recognizing method - alwaysUnknownOrTrue returns false if we can speed up the query, and true otherwise. @@ -87,10 +84,10 @@ struct ANNQueryInformation Search query type is also recognized for PREWHERE clause */ -class ANNCondition +class ApproximateNearestNeighborCondition { public: - ANNCondition(const SelectQueryInfo & query_info, + ApproximateNearestNeighborCondition(const SelectQueryInfo & query_info, ContextPtr context); // false if query can be speeded up, true otherwise @@ -107,12 +104,12 @@ public: String getColumnName() const; - ANNQueryInformation::Metric getMetricType() const; + ApproximateNearestNeighborInformation::Metric getMetricType() const; // the P- value if the metric is 'LpDistance' float getPValueForLpDistance() const; - ANNQueryInformation::Type getQueryType() const; + ApproximateNearestNeighborInformation::Type getQueryType() const; UInt64 getIndexGranularity() const { return index_granularity; } @@ -191,16 +188,16 @@ private: void traverseOrderByAST(const ASTPtr & node, RPN & rpn); // Returns true and stores ANNExpr if the query has valid WHERE section - static bool matchRPNWhere(RPN & rpn, ANNQueryInformation & expr); + static bool matchRPNWhere(RPN & rpn, ApproximateNearestNeighborInformation & expr); // Returns true and stores ANNExpr if the query has valid ORDERBY section - static bool matchRPNOrderBy(RPN & rpn, ANNQueryInformation & expr); + static bool matchRPNOrderBy(RPN & rpn, ApproximateNearestNeighborInformation & expr); // Returns true and stores Length if we have valid LIMIT clause in query static bool matchRPNLimit(RPNElement & rpn, UInt64 & limit); /* Matches dist function, target vector, column name */ - static bool matchMainParts(RPN::iterator & iter, const RPN::iterator & end, ANNQueryInformation & expr); + static bool matchMainParts(RPN::iterator & iter, const RPN::iterator & end, ApproximateNearestNeighborInformation & expr); // Gets float or int from AST node static float getFloatOrIntLiteralOrPanic(const RPN::iterator& iter); @@ -208,7 +205,7 @@ private: Block block_with_constants; // true if we have one of two supported query types - std::optional query_information; + std::optional query_information; // Get from settings ANNIndex parameters UInt64 index_granularity; @@ -218,12 +215,10 @@ private: }; // condition interface for Ann indexes. Returns vector of indexes of ranges in granule which are useful for query. -class IMergeTreeIndexConditionAnn : public IMergeTreeIndexCondition +class IMergeTreeIndexConditionApproximateNearestNeighbor : public IMergeTreeIndexCondition { public: virtual std::vector getUsefulRanges(MergeTreeIndexGranulePtr idx_granule) const = 0; }; } - -} diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp index aa340d6afc1..3d290ea12ac 100644 --- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp @@ -1687,7 +1687,7 @@ MarkRanges MergeTreeDataSelectExecutor::filterMarksUsingIndex( if (index_mark != index_range.begin || !granule || last_index_mark != index_range.begin) granule = reader.read(); // Cast to Ann condition - auto ann_condition = std::dynamic_pointer_cast(condition); + auto ann_condition = std::dynamic_pointer_cast(condition); if (ann_condition != nullptr) { // vector of indexes of useful ranges diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp index fe3ea322b91..190f76fba5e 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp @@ -240,7 +240,7 @@ std::vector MergeTreeIndexConditionAnnoy::getUsefulRangesImpl(MergeTreeI { UInt64 limit = condition.getLimit(); UInt64 index_granularity = condition.getIndexGranularity(); - std::optional comp_dist = condition.getQueryType() == ApproximateNearestNeighbour::ANNQueryInformation::Type::Where + std::optional comp_dist = condition.getQueryType() == ApproximateNearestNeighborInformation::Type::Where ? std::optional(condition.getComparisonDistanceForWhereQuery()) : std::nullopt; @@ -267,6 +267,7 @@ std::vector MergeTreeIndexConditionAnnoy::getUsefulRangesImpl(MergeTreeI distances.reserve(limit); annoy->get_nns_by_vector(target_vec.data(), limit, static_cast(search_k), &neighbors, &distances); + std::unordered_set granule_numbers; for (size_t i = 0; i < neighbors.size(); ++i) { diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.h b/src/Storages/MergeTree/MergeTreeIndexAnnoy.h index fbc6b21fa6b..95041ea31fb 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.h +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.h @@ -60,7 +60,7 @@ struct MergeTreeIndexAggregatorAnnoy final : IMergeTreeIndexAggregator }; -class MergeTreeIndexConditionAnnoy final : public ApproximateNearestNeighbour::IMergeTreeIndexConditionAnn +class MergeTreeIndexConditionAnnoy final : public IMergeTreeIndexConditionApproximateNearestNeighbor { public: MergeTreeIndexConditionAnnoy( @@ -81,7 +81,7 @@ private: template std::vector getUsefulRangesImpl(MergeTreeIndexGranulePtr idx_granule) const; - const ApproximateNearestNeighbour::ANNCondition condition; + const ApproximateNearestNeighborCondition condition; const String distance_function; const Int64 search_k; }; From 999e4c33065279f4337387ece0343da36ca03098 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 25 May 2023 21:37:20 +0000 Subject: [PATCH 0440/1072] Cosmetics: Less generic variable naming --- src/Storages/MergeTree/CommonANNIndexes.cpp | 52 ++++++++++----------- src/Storages/MergeTree/CommonANNIndexes.h | 6 +-- 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/src/Storages/MergeTree/CommonANNIndexes.cpp b/src/Storages/MergeTree/CommonANNIndexes.cpp index 5c42774fb24..669055068b8 100644 --- a/src/Storages/MergeTree/CommonANNIndexes.cpp +++ b/src/Storages/MergeTree/CommonANNIndexes.cpp @@ -321,10 +321,10 @@ void ApproximateNearestNeighborCondition::traverseOrderByAST(const ASTPtr & node } // Returns true and stores ApproximateNearestNeighborInformation if the query has valid WHERE clause -bool ApproximateNearestNeighborCondition::matchRPNWhere(RPN & rpn, ApproximateNearestNeighborInformation & expr) +bool ApproximateNearestNeighborCondition::matchRPNWhere(RPN & rpn, ApproximateNearestNeighborInformation & ann_info) { /// Fill query type field - expr.query_type = ApproximateNearestNeighborInformation::Type::Where; + ann_info.query_type = ApproximateNearestNeighborInformation::Type::Where; // WHERE section must have at least 5 expressions // Operator->Distance(float)->DistanceFunc->Column->Tuple(Array)Func(TargetVector(floats)) @@ -347,9 +347,9 @@ bool ApproximateNearestNeighborCondition::matchRPNWhere(RPN & rpn, ApproximateNe if (iter->function != RPNElement::FUNCTION_FLOAT_LITERAL) return false; - expr.distance = getFloatOrIntLiteralOrPanic(iter); - if (expr.distance < 0) - throw Exception(ErrorCodes::INCORRECT_QUERY, "Distance can't be negative. Got {}", expr.distance); + ann_info.distance = getFloatOrIntLiteralOrPanic(iter); + if (ann_info.distance < 0) + throw Exception(ErrorCodes::INCORRECT_QUERY, "Distance can't be negative. Got {}", ann_info.distance); ++iter; @@ -358,17 +358,17 @@ bool ApproximateNearestNeighborCondition::matchRPNWhere(RPN & rpn, ApproximateNe return false; auto end = rpn.end(); - if (!matchMainParts(iter, end, expr)) + if (!matchMainParts(iter, end, ann_info)) return false; if (greater_case) { - if (expr.target.size() < 2) + if (ann_info.target.size() < 2) return false; - expr.distance = expr.target.back(); - if (expr.distance < 0) - throw Exception(ErrorCodes::INCORRECT_QUERY, "Distance can't be negative. Got {}", expr.distance); - expr.target.pop_back(); + ann_info.distance = ann_info.target.back(); + if (ann_info.distance < 0) + throw Exception(ErrorCodes::INCORRECT_QUERY, "Distance can't be negative. Got {}", ann_info.distance); + ann_info.target.pop_back(); } // query is ok @@ -376,10 +376,10 @@ bool ApproximateNearestNeighborCondition::matchRPNWhere(RPN & rpn, ApproximateNe } // Returns true and stores ANNExpr if the query has valid ORDERBY clause -bool ApproximateNearestNeighborCondition::matchRPNOrderBy(RPN & rpn, ApproximateNearestNeighborInformation & expr) +bool ApproximateNearestNeighborCondition::matchRPNOrderBy(RPN & rpn, ApproximateNearestNeighborInformation & ann_info) { /// Fill query type field - expr.query_type = ApproximateNearestNeighborInformation::Type::OrderBy; + ann_info.query_type = ApproximateNearestNeighborInformation::Type::OrderBy; // ORDER BY clause must have at least 3 expressions if (rpn.size() < 3) @@ -388,7 +388,7 @@ bool ApproximateNearestNeighborCondition::matchRPNOrderBy(RPN & rpn, Approximate auto iter = rpn.begin(); auto end = rpn.end(); - return ApproximateNearestNeighborCondition::matchMainParts(iter, end, expr); + return ApproximateNearestNeighborCondition::matchMainParts(iter, end, ann_info); } // Returns true and stores Length if we have valid LIMIT clause in query @@ -404,7 +404,7 @@ bool ApproximateNearestNeighborCondition::matchRPNLimit(RPNElement & rpn, UInt64 } /* Matches dist function, target vector, column name */ -bool ApproximateNearestNeighborCondition::matchMainParts(RPN::iterator & iter, const RPN::iterator & end, ApproximateNearestNeighborInformation & expr) +bool ApproximateNearestNeighborCondition::matchMainParts(RPN::iterator & iter, const RPN::iterator & end, ApproximateNearestNeighborInformation & ann_info) { bool identifier_found = false; @@ -412,22 +412,22 @@ bool ApproximateNearestNeighborCondition::matchMainParts(RPN::iterator & iter, c if (iter->function != RPNElement::FUNCTION_DISTANCE) return false; - expr.metric = castMetricFromStringToType(iter->func_name); + ann_info.metric = castMetricFromStringToType(iter->func_name); ++iter; - if (expr.metric == ApproximateNearestNeighborInformation::Metric::Lp) + if (ann_info.metric == ApproximateNearestNeighborInformation::Metric::Lp) { if (iter->function != RPNElement::FUNCTION_FLOAT_LITERAL && iter->function != RPNElement::FUNCTION_INT_LITERAL) return false; - expr.p_for_lp_dist = getFloatOrIntLiteralOrPanic(iter); + ann_info.p_for_lp_dist = getFloatOrIntLiteralOrPanic(iter); ++iter; } if (iter->function == RPNElement::FUNCTION_IDENTIFIER) { identifier_found = true; - expr.column_name = std::move(iter->identifier.value()); + ann_info.column_name = std::move(iter->identifier.value()); ++iter; } @@ -436,13 +436,13 @@ bool ApproximateNearestNeighborCondition::matchMainParts(RPN::iterator & iter, c if (iter->function == RPNElement::FUNCTION_LITERAL_TUPLE) { - extractTargetVectorFromLiteral(expr.target, iter->tuple_literal); + extractTargetVectorFromLiteral(ann_info.target, iter->tuple_literal); ++iter; } if (iter->function == RPNElement::FUNCTION_LITERAL_ARRAY) { - extractTargetVectorFromLiteral(expr.target, iter->array_literal); + extractTargetVectorFromLiteral(ann_info.target, iter->array_literal); ++iter; } @@ -457,12 +457,12 @@ bool ApproximateNearestNeighborCondition::matchMainParts(RPN::iterator & iter, c ++iter; if (iter->function == RPNElement::FUNCTION_LITERAL_TUPLE) { - extractTargetVectorFromLiteral(expr.target, iter->tuple_literal); + extractTargetVectorFromLiteral(ann_info.target, iter->tuple_literal); ++iter; } else if (iter->function == RPNElement::FUNCTION_LITERAL_ARRAY) { - extractTargetVectorFromLiteral(expr.target, iter->array_literal); + extractTargetVectorFromLiteral(ann_info.target, iter->array_literal); ++iter; } else @@ -473,12 +473,12 @@ bool ApproximateNearestNeighborCondition::matchMainParts(RPN::iterator & iter, c { if (iter->function == RPNElement::FUNCTION_FLOAT_LITERAL || iter->function == RPNElement::FUNCTION_INT_LITERAL) - expr.target.emplace_back(getFloatOrIntLiteralOrPanic(iter)); + ann_info.target.emplace_back(getFloatOrIntLiteralOrPanic(iter)); else if (iter->function == RPNElement::FUNCTION_IDENTIFIER) { if (identifier_found) return false; - expr.column_name = std::move(iter->identifier.value()); + ann_info.column_name = std::move(iter->identifier.value()); identifier_found = true; } else @@ -488,7 +488,7 @@ bool ApproximateNearestNeighborCondition::matchMainParts(RPN::iterator & iter, c } // Final checks of correctness - return identifier_found && !expr.target.empty(); + return identifier_found && !ann_info.target.empty(); } // Gets float or int from AST node diff --git a/src/Storages/MergeTree/CommonANNIndexes.h b/src/Storages/MergeTree/CommonANNIndexes.h index 37695586515..0b207585048 100644 --- a/src/Storages/MergeTree/CommonANNIndexes.h +++ b/src/Storages/MergeTree/CommonANNIndexes.h @@ -188,16 +188,16 @@ private: void traverseOrderByAST(const ASTPtr & node, RPN & rpn); // Returns true and stores ANNExpr if the query has valid WHERE section - static bool matchRPNWhere(RPN & rpn, ApproximateNearestNeighborInformation & expr); + static bool matchRPNWhere(RPN & rpn, ApproximateNearestNeighborInformation & ann_info); // Returns true and stores ANNExpr if the query has valid ORDERBY section - static bool matchRPNOrderBy(RPN & rpn, ApproximateNearestNeighborInformation & expr); + static bool matchRPNOrderBy(RPN & rpn, ApproximateNearestNeighborInformation & ann_info); // Returns true and stores Length if we have valid LIMIT clause in query static bool matchRPNLimit(RPNElement & rpn, UInt64 & limit); /* Matches dist function, target vector, column name */ - static bool matchMainParts(RPN::iterator & iter, const RPN::iterator & end, ApproximateNearestNeighborInformation & expr); + static bool matchMainParts(RPN::iterator & iter, const RPN::iterator & end, ApproximateNearestNeighborInformation & ann_info); // Gets float or int from AST node static float getFloatOrIntLiteralOrPanic(const RPN::iterator& iter); From 567d54a26848ba0f5bf4ad38d226def02a92280b Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 25 May 2023 21:38:00 +0000 Subject: [PATCH 0441/1072] Cosmetics: more constness --- src/Storages/MergeTree/CommonANNIndexes.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Storages/MergeTree/CommonANNIndexes.h b/src/Storages/MergeTree/CommonANNIndexes.h index 0b207585048..68ed217ef45 100644 --- a/src/Storages/MergeTree/CommonANNIndexes.h +++ b/src/Storages/MergeTree/CommonANNIndexes.h @@ -208,9 +208,9 @@ private: std::optional query_information; // Get from settings ANNIndex parameters - UInt64 index_granularity; + const UInt64 index_granularity; /// only queries with a lower limit can be considered to avoid memory overflow - UInt64 limit_restriction; + const UInt64 limit_restriction; bool index_is_useful = false; }; From 6fe208832d6c3ba8340fd73d5728efad567188ba Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 25 May 2023 21:42:46 +0000 Subject: [PATCH 0442/1072] Cosmetics: target vector --> reference vector --- src/Storages/MergeTree/CommonANNIndexes.cpp | 48 +++++++++---------- src/Storages/MergeTree/CommonANNIndexes.h | 22 ++++----- .../MergeTree/MergeTreeIndexAnnoy.cpp | 6 +-- 3 files changed, 38 insertions(+), 38 deletions(-) diff --git a/src/Storages/MergeTree/CommonANNIndexes.cpp b/src/Storages/MergeTree/CommonANNIndexes.cpp index 669055068b8..20707a148ae 100644 --- a/src/Storages/MergeTree/CommonANNIndexes.cpp +++ b/src/Storages/MergeTree/CommonANNIndexes.cpp @@ -25,19 +25,19 @@ namespace { template -void extractTargetVectorFromLiteral(ApproximateNearestNeighborInformation::Embedding & target, Literal literal) +void extraceReferenceVectorFromLiteral(ApproximateNearestNeighborInformation::Embedding & reference_vector, Literal literal) { - Float64 float_element_of_target_vector; - Int64 int_element_of_target_vector; + Float64 float_element_of_reference_vector; + Int64 int_element_of_reference_vector; for (const auto & value : literal.value()) { - if (value.tryGet(float_element_of_target_vector)) - target.emplace_back(float_element_of_target_vector); - else if (value.tryGet(int_element_of_target_vector)) - target.emplace_back(static_cast(int_element_of_target_vector)); + if (value.tryGet(float_element_of_reference_vector)) + reference_vector.emplace_back(float_element_of_reference_vector); + else if (value.tryGet(int_element_of_reference_vector)) + reference_vector.emplace_back(static_cast(int_element_of_reference_vector)); else - throw Exception(ErrorCodes::INCORRECT_QUERY, "Wrong type of elements in target vector. Only float or int are supported."); + throw Exception(ErrorCodes::INCORRECT_QUERY, "Wrong type of elements in reference vector. Only float or int are supported."); } } @@ -82,17 +82,17 @@ UInt64 ApproximateNearestNeighborCondition::getLimit() const throw Exception(ErrorCodes::LOGICAL_ERROR, "No LIMIT section in query, not supported"); } -std::vector ApproximateNearestNeighborCondition::getTargetVector() const +std::vector ApproximateNearestNeighborCondition::getReferenceVector() const { if (index_is_useful && query_information.has_value()) - return query_information->target; - throw Exception(ErrorCodes::LOGICAL_ERROR, "Target vector was requested for useless or uninitialized index."); + return query_information->reference_vector; + throw Exception(ErrorCodes::LOGICAL_ERROR, "Reference vector was requested for useless or uninitialized index."); } size_t ApproximateNearestNeighborCondition::getNumOfDimensions() const { if (index_is_useful && query_information.has_value()) - return query_information->target.size(); + return query_information->reference_vector.size(); throw Exception(ErrorCodes::LOGICAL_ERROR, "Number of dimensions was requested for useless or uninitialized index."); } @@ -327,7 +327,7 @@ bool ApproximateNearestNeighborCondition::matchRPNWhere(RPN & rpn, ApproximateNe ann_info.query_type = ApproximateNearestNeighborInformation::Type::Where; // WHERE section must have at least 5 expressions - // Operator->Distance(float)->DistanceFunc->Column->Tuple(Array)Func(TargetVector(floats)) + // Operator->Distance(float)->DistanceFunc->Column->Tuple(Array)Func(ReferenceVector(floats)) if (rpn.size() < 5) return false; @@ -363,12 +363,12 @@ bool ApproximateNearestNeighborCondition::matchRPNWhere(RPN & rpn, ApproximateNe if (greater_case) { - if (ann_info.target.size() < 2) + if (ann_info.reference_vector.size() < 2) return false; - ann_info.distance = ann_info.target.back(); + ann_info.distance = ann_info.reference_vector.back(); if (ann_info.distance < 0) throw Exception(ErrorCodes::INCORRECT_QUERY, "Distance can't be negative. Got {}", ann_info.distance); - ann_info.target.pop_back(); + ann_info.reference_vector.pop_back(); } // query is ok @@ -403,12 +403,12 @@ bool ApproximateNearestNeighborCondition::matchRPNLimit(RPNElement & rpn, UInt64 return false; } -/* Matches dist function, target vector, column name */ +/* Matches dist function, referencer vector, column name */ bool ApproximateNearestNeighborCondition::matchMainParts(RPN::iterator & iter, const RPN::iterator & end, ApproximateNearestNeighborInformation & ann_info) { bool identifier_found = false; - // Matches DistanceFunc->[Column]->[Tuple(array)Func]->TargetVector(floats)->[Column] + // Matches DistanceFunc->[Column]->[Tuple(array)Func]->ReferenceVector(floats)->[Column] if (iter->function != RPNElement::FUNCTION_DISTANCE) return false; @@ -436,13 +436,13 @@ bool ApproximateNearestNeighborCondition::matchMainParts(RPN::iterator & iter, c if (iter->function == RPNElement::FUNCTION_LITERAL_TUPLE) { - extractTargetVectorFromLiteral(ann_info.target, iter->tuple_literal); + extraceReferenceVectorFromLiteral(ann_info.reference_vector, iter->tuple_literal); ++iter; } if (iter->function == RPNElement::FUNCTION_LITERAL_ARRAY) { - extractTargetVectorFromLiteral(ann_info.target, iter->array_literal); + extraceReferenceVectorFromLiteral(ann_info.reference_vector, iter->array_literal); ++iter; } @@ -457,12 +457,12 @@ bool ApproximateNearestNeighborCondition::matchMainParts(RPN::iterator & iter, c ++iter; if (iter->function == RPNElement::FUNCTION_LITERAL_TUPLE) { - extractTargetVectorFromLiteral(ann_info.target, iter->tuple_literal); + extraceReferenceVectorFromLiteral(ann_info.reference_vector, iter->tuple_literal); ++iter; } else if (iter->function == RPNElement::FUNCTION_LITERAL_ARRAY) { - extractTargetVectorFromLiteral(ann_info.target, iter->array_literal); + extraceReferenceVectorFromLiteral(ann_info.reference_vector, iter->array_literal); ++iter; } else @@ -473,7 +473,7 @@ bool ApproximateNearestNeighborCondition::matchMainParts(RPN::iterator & iter, c { if (iter->function == RPNElement::FUNCTION_FLOAT_LITERAL || iter->function == RPNElement::FUNCTION_INT_LITERAL) - ann_info.target.emplace_back(getFloatOrIntLiteralOrPanic(iter)); + ann_info.reference_vector.emplace_back(getFloatOrIntLiteralOrPanic(iter)); else if (iter->function == RPNElement::FUNCTION_IDENTIFIER) { if (identifier_found) @@ -488,7 +488,7 @@ bool ApproximateNearestNeighborCondition::matchMainParts(RPN::iterator & iter, c } // Final checks of correctness - return identifier_found && !ann_info.target.empty(); + return identifier_found && !ann_info.reference_vector.empty(); } // Gets float or int from AST node diff --git a/src/Storages/MergeTree/CommonANNIndexes.h b/src/Storages/MergeTree/CommonANNIndexes.h index 68ed217ef45..9c075139707 100644 --- a/src/Storages/MergeTree/CommonANNIndexes.h +++ b/src/Storages/MergeTree/CommonANNIndexes.h @@ -12,7 +12,7 @@ namespace DB /** * Queries for Approximate Nearest Neighbour Search * have similar structure: - * 1) target vector from which all distances are calculated + * 1) reference vector from which all distances are calculated * 2) metric name (e.g L2Distance, LpDistance, etc.) * 3) name of column with embeddings * 4) type of query @@ -27,7 +27,7 @@ struct ApproximateNearestNeighborInformation using Embedding = std::vector; // Extracted data from valid query - Embedding target; + Embedding reference_vector; enum class Metric { Unknown, @@ -56,14 +56,14 @@ struct ApproximateNearestNeighborInformation There are two main patterns of queries being supported 1) Search query type - SELECT * FROM * WHERE DistanceFunc(column, target_vector) < floatLiteral LIMIT count + SELECT * FROM * WHERE DistanceFunc(column, reference) < floatLiteral LIMIT count 2) OrderBy query type - SELECT * FROM * WHERE * ORDERBY DistanceFunc(column, target_vector) LIMIT count + SELECT * FROM * WHERE * ORDERBY DistanceFunc(column, reference) LIMIT count *Query without LIMIT count is not supported* - target_vector(should have float coordinates) examples: + reference(should have float coordinates) examples: tuple(0.1, 0.1, ...., 0.1) or (0.1, 0.1, ...., 0.1) [the word tuple is not needed] @@ -72,11 +72,11 @@ struct ApproximateNearestNeighborInformation returns true. From matching query it extracts - * targetVector + * referenceVector * metricName(DistanceFunction) * dimension size if query uses LpDistance * distance to compare(ONLY for search types, otherwise you get exception) - * spaceDimension(which is targetVector's components count) + * spaceDimension(which is reference vector's components count) * column * objects count from LIMIT clause(for both queries) * queryHasOrderByClause and queryHasWhereClause return true if query matches the type @@ -96,10 +96,10 @@ public: // returns the distance to compare with for search query float getComparisonDistanceForWhereQuery() const; - // distance should be calculated regarding to targetVector - std::vector getTargetVector() const; + // distance should be calculated regarding to reference vector + std::vector getReferenceVector() const; - // targetVector dimension size + // reference vector's dimension size size_t getNumOfDimensions() const; String getColumnName() const; @@ -196,7 +196,7 @@ private: // Returns true and stores Length if we have valid LIMIT clause in query static bool matchRPNLimit(RPNElement & rpn, UInt64 & limit); - /* Matches dist function, target vector, column name */ + /* Matches dist function, reference vector, column name */ static bool matchMainParts(RPN::iterator & iter, const RPN::iterator & end, ApproximateNearestNeighborInformation & ann_info); // Gets float or int from AST node diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp index 190f76fba5e..9dcfd421ba5 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp @@ -247,7 +247,7 @@ std::vector MergeTreeIndexConditionAnnoy::getUsefulRangesImpl(MergeTreeI if (comp_dist && comp_dist.value() < 0) throw Exception(ErrorCodes::LOGICAL_ERROR, "Attempt to optimize query with where without distance"); - std::vector target_vec = condition.getTargetVector(); + std::vector reference_vector = condition.getReferenceVector(); auto granule = std::dynamic_pointer_cast>(idx_granule); if (granule == nullptr) @@ -260,13 +260,13 @@ std::vector MergeTreeIndexConditionAnnoy::getUsefulRangesImpl(MergeTreeI "does not match with the dimension in the index ({})", toString(condition.getNumOfDimensions()), toString(annoy->getNumOfDimensions())); - /// neighbors contain indexes of dots which were closest to target vector + /// neighbors contain indexes of dots which were closest to the reference vector std::vector neighbors; std::vector distances; neighbors.reserve(limit); distances.reserve(limit); - annoy->get_nns_by_vector(target_vec.data(), limit, static_cast(search_k), &neighbors, &distances); + annoy->get_nns_by_vector(reference_vector.data(), limit, static_cast(search_k), &neighbors, &distances); std::unordered_set granule_numbers; for (size_t i = 0; i < neighbors.size(); ++i) From 8cc382121a3b7580136eaf7514957dd0e08f3283 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 25 May 2023 21:48:36 +0000 Subject: [PATCH 0443/1072] Cosmetics: Update comments --- src/Storages/MergeTree/CommonANNIndexes.cpp | 57 ++++---- src/Storages/MergeTree/CommonANNIndexes.h | 152 ++++++++++---------- 2 files changed, 103 insertions(+), 106 deletions(-) diff --git a/src/Storages/MergeTree/CommonANNIndexes.cpp b/src/Storages/MergeTree/CommonANNIndexes.cpp index 20707a148ae..f4a0e9bf728 100644 --- a/src/Storages/MergeTree/CommonANNIndexes.cpp +++ b/src/Storages/MergeTree/CommonANNIndexes.cpp @@ -126,7 +126,7 @@ ApproximateNearestNeighborInformation::Type ApproximateNearestNeighborCondition: bool ApproximateNearestNeighborCondition::checkQueryStructure(const SelectQueryInfo & query) { - // RPN-s for different sections of the query + /// RPN-s for different sections of the query RPN rpn_prewhere_clause; RPN rpn_where_clause; RPN rpn_order_by_clause; @@ -137,47 +137,50 @@ bool ApproximateNearestNeighborCondition::checkQueryStructure(const SelectQueryI ApproximateNearestNeighborInformation where_info; ApproximateNearestNeighborInformation order_by_info; - // Build rpns for query sections + /// Build rpns for query sections const auto & select = query.query->as(); - if (select.prewhere()) // If query has PREWHERE clause + /// If query has PREWHERE clause + if (select.prewhere()) traverseAST(select.prewhere(), rpn_prewhere_clause); - if (select.where()) // If query has WHERE clause + /// If query has WHERE clause + if (select.where()) traverseAST(select.where(), rpn_where_clause); - if (select.limitLength()) // If query has LIMIT clause + /// If query has LIMIT clause + if (select.limitLength()) traverseAtomAST(select.limitLength(), rpn_limit); if (select.orderBy()) // If query has ORDERBY clause traverseOrderByAST(select.orderBy(), rpn_order_by_clause); - // Reverse RPNs for conveniences during parsing + /// Reverse RPNs for conveniences during parsing std::reverse(rpn_prewhere_clause.begin(), rpn_prewhere_clause.end()); std::reverse(rpn_where_clause.begin(), rpn_where_clause.end()); std::reverse(rpn_order_by_clause.begin(), rpn_order_by_clause.end()); - // Match rpns with supported types and extract information + /// Match rpns with supported types and extract information const bool prewhere_is_valid = matchRPNWhere(rpn_prewhere_clause, prewhere_info); const bool where_is_valid = matchRPNWhere(rpn_where_clause, where_info); const bool order_by_is_valid = matchRPNOrderBy(rpn_order_by_clause, order_by_info); const bool limit_is_valid = matchRPNLimit(rpn_limit, limit); - // Query without a LIMIT clause or with a limit greater than a restriction is not supported + /// Query without a LIMIT clause or with a limit greater than a restriction is not supported if (!limit_is_valid || limit_restriction < limit) return false; - // Search type query in both sections isn't supported + /// Search type query in both sections isn't supported if (prewhere_is_valid && where_is_valid) return false; - // Search type should be in WHERE or PREWHERE clause + /// Search type should be in WHERE or PREWHERE clause if (prewhere_is_valid || where_is_valid) query_information = std::move(prewhere_is_valid ? prewhere_info : where_info); if (order_by_is_valid) { - // Query with valid where and order by type is not supported + /// Query with valid where and order by type is not supported if (query_information.has_value()) return false; @@ -202,7 +205,7 @@ void ApproximateNearestNeighborCondition::traverseAST(const ASTPtr & node, RPN & } RPNElement element; - // Get the data behind node + /// Get the data behind node if (!traverseAtomAST(node, element)) element.function = RPNElement::FUNCTION_UNKNOWN; @@ -211,10 +214,10 @@ void ApproximateNearestNeighborCondition::traverseAST(const ASTPtr & node, RPN & bool ApproximateNearestNeighborCondition::traverseAtomAST(const ASTPtr & node, RPNElement & out) { - // Match Functions + /// Match Functions if (const auto * function = node->as()) { - // Set the name + /// Set the name out.func_name = function->name; if (function->name == "L1Distance" || @@ -240,7 +243,7 @@ bool ApproximateNearestNeighborCondition::traverseAtomAST(const ASTPtr & node, R return true; } - // Match identifier + /// Match identifier else if (const auto * identifier = node->as()) { out.function = RPNElement::FUNCTION_IDENTIFIER; @@ -250,7 +253,7 @@ bool ApproximateNearestNeighborCondition::traverseAtomAST(const ASTPtr & node, R return true; } - // Check if we have constants behind the node + /// Check if we have constants behind the node return tryCastToConstType(node, out); } @@ -320,20 +323,20 @@ void ApproximateNearestNeighborCondition::traverseOrderByAST(const ASTPtr & node traverseAST(order_by_element->children.front(), rpn); } -// Returns true and stores ApproximateNearestNeighborInformation if the query has valid WHERE clause +/// Returns true and stores ApproximateNearestNeighborInformation if the query has valid WHERE clause bool ApproximateNearestNeighborCondition::matchRPNWhere(RPN & rpn, ApproximateNearestNeighborInformation & ann_info) { /// Fill query type field ann_info.query_type = ApproximateNearestNeighborInformation::Type::Where; - // WHERE section must have at least 5 expressions - // Operator->Distance(float)->DistanceFunc->Column->Tuple(Array)Func(ReferenceVector(floats)) + /// WHERE section must have at least 5 expressions + /// Operator->Distance(float)->DistanceFunc->Column->Tuple(Array)Func(ReferenceVector(floats)) if (rpn.size() < 5) return false; auto iter = rpn.begin(); - // Query starts from operator less + /// Query starts from operator less if (iter->function != RPNElement::FUNCTION_COMPARISON) return false; @@ -371,11 +374,11 @@ bool ApproximateNearestNeighborCondition::matchRPNWhere(RPN & rpn, ApproximateNe ann_info.reference_vector.pop_back(); } - // query is ok + /// query is ok return true; } -// Returns true and stores ANNExpr if the query has valid ORDERBY clause +/// Returns true and stores ANNExpr if the query has valid ORDERBY clause bool ApproximateNearestNeighborCondition::matchRPNOrderBy(RPN & rpn, ApproximateNearestNeighborInformation & ann_info) { /// Fill query type field @@ -391,7 +394,7 @@ bool ApproximateNearestNeighborCondition::matchRPNOrderBy(RPN & rpn, Approximate return ApproximateNearestNeighborCondition::matchMainParts(iter, end, ann_info); } -// Returns true and stores Length if we have valid LIMIT clause in query +/// Returns true and stores Length if we have valid LIMIT clause in query bool ApproximateNearestNeighborCondition::matchRPNLimit(RPNElement & rpn, UInt64 & limit) { if (rpn.function == RPNElement::FUNCTION_INT_LITERAL) @@ -403,12 +406,12 @@ bool ApproximateNearestNeighborCondition::matchRPNLimit(RPNElement & rpn, UInt64 return false; } -/* Matches dist function, referencer vector, column name */ +/// Matches dist function, referencer vector, column name bool ApproximateNearestNeighborCondition::matchMainParts(RPN::iterator & iter, const RPN::iterator & end, ApproximateNearestNeighborInformation & ann_info) { bool identifier_found = false; - // Matches DistanceFunc->[Column]->[Tuple(array)Func]->ReferenceVector(floats)->[Column] + /// Matches DistanceFunc->[Column]->[Tuple(array)Func]->ReferenceVector(floats)->[Column] if (iter->function != RPNElement::FUNCTION_DISTANCE) return false; @@ -487,11 +490,11 @@ bool ApproximateNearestNeighborCondition::matchMainParts(RPN::iterator & iter, c ++iter; } - // Final checks of correctness + /// Final checks of correctness return identifier_found && !ann_info.reference_vector.empty(); } -// Gets float or int from AST node +/// Gets float or int from AST node float ApproximateNearestNeighborCondition::getFloatOrIntLiteralOrPanic(const RPN::iterator& iter) { if (iter->float_literal.has_value()) diff --git a/src/Storages/MergeTree/CommonANNIndexes.h b/src/Storages/MergeTree/CommonANNIndexes.h index 9c075139707..2ac9384a884 100644 --- a/src/Storages/MergeTree/CommonANNIndexes.h +++ b/src/Storages/MergeTree/CommonANNIndexes.h @@ -9,31 +9,31 @@ namespace DB { -/** - * Queries for Approximate Nearest Neighbour Search - * have similar structure: - * 1) reference vector from which all distances are calculated - * 2) metric name (e.g L2Distance, LpDistance, etc.) - * 3) name of column with embeddings - * 4) type of query - * 5) Number of elements, that should be taken (limit) - * - * And two optional parameters: - * 1) p for LpDistance function - * 2) distance to compare with (only for where queries) - */ +/// Approximate Nearest Neighbour queries have a similar structure: +/// - reference vector from which all distances are calculated +/// - metric name (e.g L2Distance, LpDistance, etc.) +/// - name of column with embeddings +/// - type of query +/// - maximum number of returned elements (LIMIT) +/// +/// And two optional parameters: +/// - p for LpDistance function +/// - distance to compare with (only for where queries) +/// +/// This struct holds all these components. struct ApproximateNearestNeighborInformation { using Embedding = std::vector; - - // Extracted data from valid query Embedding reference_vector; + enum class Metric { Unknown, L2, Lp - } metric; + }; + Metric metric; + String column_name; UInt64 limit; @@ -41,79 +41,71 @@ struct ApproximateNearestNeighborInformation { OrderBy, Where - } query_type; + }; + Type query_type; float p_for_lp_dist = -1.0; float distance = -1.0; }; -/** - Class ApproximateNearestNeighborCondition, is responsible for recognizing special query types which - can be speeded up by ANN Indexes. It parses the SQL query and checks - if it matches ANNIndexes. The recognizing method - alwaysUnknownOrTrue - returns false if we can speed up the query, and true otherwise. - It has only one argument, name of the metric with which index was built. - There are two main patterns of queries being supported - - 1) Search query type - SELECT * FROM * WHERE DistanceFunc(column, reference) < floatLiteral LIMIT count - - 2) OrderBy query type - SELECT * FROM * WHERE * ORDERBY DistanceFunc(column, reference) LIMIT count - - *Query without LIMIT count is not supported* - - reference(should have float coordinates) examples: - tuple(0.1, 0.1, ...., 0.1) or (0.1, 0.1, ...., 0.1) - [the word tuple is not needed] - - If the query matches one of these two types, than the class extracts useful information - from the query. If the query has both 1 and 2 types, than we can't speed and alwaysUnknownOrTrue - returns true. - - From matching query it extracts - * referenceVector - * metricName(DistanceFunction) - * dimension size if query uses LpDistance - * distance to compare(ONLY for search types, otherwise you get exception) - * spaceDimension(which is reference vector's components count) - * column - * objects count from LIMIT clause(for both queries) - * queryHasOrderByClause and queryHasWhereClause return true if query matches the type - - Search query type is also recognized for PREWHERE clause -*/ +// Class ANNCondition, is responsible for recognizing if the query is an ANN queries which can utilize ANN indexes. It parses the SQL query +/// and checks if it matches ANNIndexes. Method alwaysUnknownOrTrue returns false if we can speed up the query, and true otherwise. It has +/// only one argument, the name of the metric with which index was built. Two main patterns of queries are supported +/// +/// - 1. WHERE queries: +/// SELECT * FROM * WHERE DistanceFunc(column, reference_vector) < floatLiteral LIMIT count +/// +/// - 2. ORDER BY queries: +/// SELECT * FROM * WHERE * ORDER BY DistanceFunc(column, reference_vector) LIMIT count +/// +/// Queries without LIMIT count are not supported +/// If the query is both of type 1. and 2., than we can't use the index and alwaysUnknownOrTrue returns true. +/// reference_vector should have float coordinates, e.g. (0.2, 0.1, .., 0.5) +/// +/// If the query matches one of these two types, then this class extracts the main information needed for ANN indexes from the query. +/// +/// From matching query it extracts +/// - referenceVector +/// - metricName(DistanceFunction) +/// - dimension size if query uses LpDistance +/// - distance to compare(ONLY for search types, otherwise you get exception) +/// - spaceDimension(which is referenceVector's components count) +/// - column +/// - objects count from LIMIT clause(for both queries) +/// - queryHasOrderByClause and queryHasWhereClause return true if query matches the type +/// +/// Search query type is also recognized for PREWHERE clause class ApproximateNearestNeighborCondition { public: ApproximateNearestNeighborCondition(const SelectQueryInfo & query_info, ContextPtr context); - // false if query can be speeded up, true otherwise + /// Returns false if query can be speeded up by an ANN index, true otherwise. bool alwaysUnknownOrTrue(String metric_name) const; - // returns the distance to compare with for search query + /// Returns the distance to compare with for search query float getComparisonDistanceForWhereQuery() const; - // distance should be calculated regarding to reference vector + /// Distance should be calculated regarding to referenceVector std::vector getReferenceVector() const; - // reference vector's dimension size + /// Reference vector's dimension size size_t getNumOfDimensions() const; String getColumnName() const; ApproximateNearestNeighborInformation::Metric getMetricType() const; - // the P- value if the metric is 'LpDistance' + /// The P- value if the metric is 'LpDistance' float getPValueForLpDistance() const; ApproximateNearestNeighborInformation::Type getQueryType() const; UInt64 getIndexGranularity() const { return index_granularity; } - // length's value from LIMIT clause + /// Length's value from LIMIT clause UInt64 getLimit() const; private: @@ -121,7 +113,7 @@ private: { enum Function { - // DistanceFunctions + /// DistanceFunctions FUNCTION_DISTANCE, //tuple(0.1, ..., 0.1) @@ -130,31 +122,31 @@ private: //array(0.1, ..., 0.1) FUNCTION_ARRAY, - // Operators <, >, <=, >= + /// Operators <, >, <=, >= FUNCTION_COMPARISON, - // Numeric float value + /// Numeric float value FUNCTION_FLOAT_LITERAL, - // Numeric int value + /// Numeric int value FUNCTION_INT_LITERAL, - // Column identifier + /// Column identifier FUNCTION_IDENTIFIER, - // Unknown, can be any value + /// Unknown, can be any value FUNCTION_UNKNOWN, - // (0.1, ...., 0.1) vector without word 'tuple' + /// (0.1, ...., 0.1) vector without word 'tuple' FUNCTION_LITERAL_TUPLE, - // [0.1, ...., 0.1] vector without word 'array' + /// [0.1, ...., 0.1] vector without word 'array' FUNCTION_LITERAL_ARRAY, - // if client parameters are used, cast will always be in the query + /// if client parameters are used, cast will always be in the query FUNCTION_CAST, - // name of type in cast function + /// name of type in cast function FUNCTION_STRING_LITERAL, }; @@ -178,33 +170,33 @@ private: bool checkQueryStructure(const SelectQueryInfo & query); - // Util functions for the traversal of AST, parses AST and builds rpn + /// Util functions for the traversal of AST, parses AST and builds rpn void traverseAST(const ASTPtr & node, RPN & rpn); - // Return true if we can identify our node type + /// Return true if we can identify our node type bool traverseAtomAST(const ASTPtr & node, RPNElement & out); - // Checks if the AST stores ConstType expression + /// Checks if the AST stores ConstType expression bool tryCastToConstType(const ASTPtr & node, RPNElement & out); - // Traverses the AST of ORDERBY section + /// Traverses the AST of ORDERBY section void traverseOrderByAST(const ASTPtr & node, RPN & rpn); - // Returns true and stores ANNExpr if the query has valid WHERE section + /// Returns true and stores ANNExpr if the query has valid WHERE section static bool matchRPNWhere(RPN & rpn, ApproximateNearestNeighborInformation & ann_info); - // Returns true and stores ANNExpr if the query has valid ORDERBY section + /// Returns true and stores ANNExpr if the query has valid ORDERBY section static bool matchRPNOrderBy(RPN & rpn, ApproximateNearestNeighborInformation & ann_info); - // Returns true and stores Length if we have valid LIMIT clause in query + /// Returns true and stores Length if we have valid LIMIT clause in query static bool matchRPNLimit(RPNElement & rpn, UInt64 & limit); /* Matches dist function, reference vector, column name */ static bool matchMainParts(RPN::iterator & iter, const RPN::iterator & end, ApproximateNearestNeighborInformation & ann_info); - // Gets float or int from AST node + /// Gets float or int from AST node static float getFloatOrIntLiteralOrPanic(const RPN::iterator& iter); Block block_with_constants; - // true if we have one of two supported query types + /// true if we have one of two supported query types std::optional query_information; // Get from settings ANNIndex parameters @@ -214,10 +206,12 @@ private: bool index_is_useful = false; }; -// condition interface for Ann indexes. Returns vector of indexes of ranges in granule which are useful for query. + +/// Common interface of ANN indexes. class IMergeTreeIndexConditionApproximateNearestNeighbor : public IMergeTreeIndexCondition { public: + /// Returns vector of indexes of ranges in granule which are useful for query. virtual std::vector getUsefulRanges(MergeTreeIndexGranulePtr idx_granule) const = 0; }; From 6580d2c326021cf476a274bce79b0b5d82bb92e4 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 25 May 2023 21:52:14 +0000 Subject: [PATCH 0444/1072] Cosmetics: castMetricFromStringToType --> stringToMetric --- src/Storages/MergeTree/CommonANNIndexes.cpp | 24 +-- src/Storages/MergeTree/CommonANNIndexes.h | 2 +- .../queries/0_stateless/02354_annoy.reference | 26 ---- tests/queries/0_stateless/02354_annoy.sh | 143 ------------------ 4 files changed, 14 insertions(+), 181 deletions(-) delete mode 100644 tests/queries/0_stateless/02354_annoy.reference delete mode 100755 tests/queries/0_stateless/02354_annoy.sh diff --git a/src/Storages/MergeTree/CommonANNIndexes.cpp b/src/Storages/MergeTree/CommonANNIndexes.cpp index f4a0e9bf728..2e2eb4e19ea 100644 --- a/src/Storages/MergeTree/CommonANNIndexes.cpp +++ b/src/Storages/MergeTree/CommonANNIndexes.cpp @@ -41,30 +41,32 @@ void extraceReferenceVectorFromLiteral(ApproximateNearestNeighborInformation::Em } } -ApproximateNearestNeighborInformation::Metric castMetricFromStringToType(String metric_name) +ApproximateNearestNeighborInformation::Metric stringToMetric(std::string_view metric) { - if (metric_name == "L2Distance") + if (metric == "L2Distance") return ApproximateNearestNeighborInformation::Metric::L2; - if (metric_name == "LpDistance") + else if (metric == "LpDistance") return ApproximateNearestNeighborInformation::Metric::Lp; - return ApproximateNearestNeighborInformation::Metric::Unknown; + else + return ApproximateNearestNeighborInformation::Metric::Unknown; } } ApproximateNearestNeighborCondition::ApproximateNearestNeighborCondition(const SelectQueryInfo & query_info, ContextPtr context) : - block_with_constants{KeyCondition::getBlockWithConstants(query_info.query, query_info.syntax_analyzer_result, context)}, - index_granularity{context->getMergeTreeSettings().get("index_granularity").get()}, - limit_restriction{context->getSettings().get("max_limit_for_ann_queries").get()}, - index_is_useful{checkQueryStructure(query_info)} {} + block_with_constants(KeyCondition::getBlockWithConstants(query_info.query, query_info.syntax_analyzer_result, context)), + index_granularity(context->getMergeTreeSettings().get("index_granularity").get()), + limit_restriction(context->getSettings().get("max_limit_for_ann_queries").get()), + index_is_useful(checkQueryStructure(query_info)) +{} -bool ApproximateNearestNeighborCondition::alwaysUnknownOrTrue(String metric_name) const +bool ApproximateNearestNeighborCondition::alwaysUnknownOrTrue(String metric) const { if (!index_is_useful) return true; // Query isn't supported // If query is supported, check metrics for match - return !(castMetricFromStringToType(metric_name) == query_information->metric); + return !(stringToMetric(metric) == query_information->metric); } float ApproximateNearestNeighborCondition::getComparisonDistanceForWhereQuery() const @@ -415,7 +417,7 @@ bool ApproximateNearestNeighborCondition::matchMainParts(RPN::iterator & iter, c if (iter->function != RPNElement::FUNCTION_DISTANCE) return false; - ann_info.metric = castMetricFromStringToType(iter->func_name); + ann_info.metric = stringToMetric(iter->func_name); ++iter; if (ann_info.metric == ApproximateNearestNeighborInformation::Metric::Lp) diff --git a/src/Storages/MergeTree/CommonANNIndexes.h b/src/Storages/MergeTree/CommonANNIndexes.h index 2ac9384a884..6b094a40b26 100644 --- a/src/Storages/MergeTree/CommonANNIndexes.h +++ b/src/Storages/MergeTree/CommonANNIndexes.h @@ -83,7 +83,7 @@ public: ContextPtr context); /// Returns false if query can be speeded up by an ANN index, true otherwise. - bool alwaysUnknownOrTrue(String metric_name) const; + bool alwaysUnknownOrTrue(String metric) const; /// Returns the distance to compare with for search query float getComparisonDistanceForWhereQuery() const; diff --git a/tests/queries/0_stateless/02354_annoy.reference b/tests/queries/0_stateless/02354_annoy.reference deleted file mode 100644 index 38678fb67c9..00000000000 --- a/tests/queries/0_stateless/02354_annoy.reference +++ /dev/null @@ -1,26 +0,0 @@ -1 [0,0,10] -2 [0,0,10.5] -3 [0,0,9.5] -4 [0,0,9.7] -5 [0,0,10.2] -1 [0,0,10] -5 [0,0,10.2] -4 [0,0,9.7] -1 [0,0,10] -2 [0,0,10.5] -3 [0,0,9.5] -4 [0,0,9.7] -5 [0,0,10.2] -1 [0,0,10] -5 [0,0,10.2] -4 [0,0,9.7] - Name: annoy_index - Name: annoy_index -1 [0,0,10] -2 [0.2,0,10] -3 [-0.3,0,10] -1 [0,0,10] -2 [0.2,0,10] -3 [-0.3,0,10] - Name: annoy_index - Name: annoy_index diff --git a/tests/queries/0_stateless/02354_annoy.sh b/tests/queries/0_stateless/02354_annoy.sh deleted file mode 100755 index 1031ea81946..00000000000 --- a/tests/queries/0_stateless/02354_annoy.sh +++ /dev/null @@ -1,143 +0,0 @@ -#!/usr/bin/env bash -# Tags: no-fasttest, no-ubsan, no-cpu-aarch64, no-upgrade-check - -CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) -# shellcheck source=../shell_config.sh -. "$CURDIR"/../shell_config.sh - -# Check that index works correctly for L2Distance and with client parameters -$CLICKHOUSE_CLIENT -nm --allow_experimental_annoy_index=1 -q " -DROP TABLE IF EXISTS 02354_annoy_l2; - -CREATE TABLE 02354_annoy_l2 -( - id Int32, - embedding Array(Float32), - INDEX annoy_index embedding TYPE annoy() GRANULARITY 1 -) -ENGINE = MergeTree -ORDER BY id -SETTINGS index_granularity=5, index_granularity_bytes = '10Mi'; - -INSERT INTO 02354_annoy_l2 VALUES (1, [0.0, 0.0, 10.0]), (2, [0.0, 0.0, 10.5]), (3, [0.0, 0.0, 9.5]), (4, [0.0, 0.0, 9.7]), (5, [0.0, 0.0, 10.2]), (6, [10.0, 0.0, 0.0]), (7, [9.5, 0.0, 0.0]), (8, [9.7, 0.0, 0.0]), (9, [10.2, 0.0, 0.0]), (10, [10.5, 0.0, 0.0]), (11, [0.0, 10.0, 0.0]), (12, [0.0, 9.5, 0.0]), (13, [0.0, 9.7, 0.0]), (14, [0.0, 10.2, 0.0]), (15, [0.0, 10.5, 0.0]); - -SELECT * -FROM 02354_annoy_l2 -WHERE L2Distance(embedding, [0.0, 0.0, 10.0]) < 1.0 -LIMIT 5; - -SELECT * -FROM 02354_annoy_l2 -ORDER BY L2Distance(embedding, [0.0, 0.0, 10.0]) -LIMIT 3; - -SET param_02354_target_vector='[0.0, 0.0, 10.0]'; - -SELECT * -FROM 02354_annoy_l2 -WHERE L2Distance(embedding, {02354_target_vector: Array(Float32)}) < 1.0 -LIMIT 5; - -SELECT * -FROM 02354_annoy_l2 -ORDER BY L2Distance(embedding, {02354_target_vector: Array(Float32)}) -LIMIT 3; - -SELECT * -FROM 02354_annoy_l2 -ORDER BY L2Distance(embedding, [0.0, 0.0]) -LIMIT 3; -- { serverError 80 } - - -DROP TABLE IF EXISTS 02354_annoy_l2; -" - -# Check that indexes are used -$CLICKHOUSE_CLIENT -nm --allow_experimental_annoy_index=1 -q " -DROP TABLE IF EXISTS 02354_annoy_l2; - -CREATE TABLE 02354_annoy_l2 -( - id Int32, - embedding Array(Float32), - INDEX annoy_index embedding TYPE annoy() GRANULARITY 1 -) -ENGINE = MergeTree -ORDER BY id -SETTINGS index_granularity=5, index_granularity_bytes = '10Mi'; - -INSERT INTO 02354_annoy_l2 VALUES (1, [0.0, 0.0, 10.0]), (2, [0.0, 0.0, 10.5]), (3, [0.0, 0.0, 9.5]), (4, [0.0, 0.0, 9.7]), (5, [0.0, 0.0, 10.2]), (6, [10.0, 0.0, 0.0]), (7, [9.5, 0.0, 0.0]), (8, [9.7, 0.0, 0.0]), (9, [10.2, 0.0, 0.0]), (10, [10.5, 0.0, 0.0]), (11, [0.0, 10.0, 0.0]), (12, [0.0, 9.5, 0.0]), (13, [0.0, 9.7, 0.0]), (14, [0.0, 10.2, 0.0]), (15, [0.0, 10.5, 0.0]); - -EXPLAIN indexes=1 -SELECT * -FROM 02354_annoy_l2 -WHERE L2Distance(embedding, [0.0, 0.0, 10.0]) < 1.0 -LIMIT 5; - -EXPLAIN indexes=1 -SELECT * -FROM 02354_annoy_l2 -ORDER BY L2Distance(embedding, [0.0, 0.0, 10.0]) -LIMIT 3; -DROP TABLE IF EXISTS 02354_annoy_l2; -" | grep "annoy_index" - - -# # Check that index works correctly for cosineDistance -$CLICKHOUSE_CLIENT -nm --allow_experimental_annoy_index=1 -q " -DROP TABLE IF EXISTS 02354_annoy_cosine; - -CREATE TABLE 02354_annoy_cosine -( - id Int32, - embedding Array(Float32), - INDEX annoy_index embedding TYPE annoy('cosineDistance', 100) GRANULARITY 1 -) -ENGINE = MergeTree -ORDER BY id -SETTINGS index_granularity=5, index_granularity_bytes = '10Mi'; - -INSERT INTO 02354_annoy_cosine VALUES (1, [0.0, 0.0, 10.0]), (2, [0.2, 0.0, 10.0]), (3, [-0.3, 0.0, 10.0]), (4, [0.5, 0.0, 10.1]), (5, [0.8, 0.0, 10.0]), (6, [10.0, 0.0, 0.0]), (7, [9.5, 0.0, 0.0]), (8, [9.7, 0.0, 0.0]), (9, [10.2, 0.0, 0.0]), (10, [10.5, 0.0, 0.0]), (11, [0.0, 10.0, 0.0]), (12, [0.0, 9.5, 0.0]), (13, [0.0, 9.7, 0.0]), (14, [0.0, 10.2, 0.0]), (15, [0.0, 10.5, 0.0]); - -SELECT * -FROM 02354_annoy_cosine -WHERE cosineDistance(embedding, [0.0, 0.0, 10.0]) < 1.0 -LIMIT 3; - -SELECT * -FROM 02354_annoy_cosine -ORDER BY cosineDistance(embedding, [0.0, 0.0, 10.0]) -LIMIT 3; - -DROP TABLE IF EXISTS 02354_annoy_cosine; -" - -# # Check that indexes are used -$CLICKHOUSE_CLIENT -nm --allow_experimental_annoy_index=1 -q " -DROP TABLE IF EXISTS 02354_annoy_cosine; - -CREATE TABLE 02354_annoy_cosine -( - id Int32, - embedding Array(Float32), - INDEX annoy_index embedding TYPE annoy('cosineDistance', 100) GRANULARITY 1 -) -ENGINE = MergeTree -ORDER BY id -SETTINGS index_granularity=5, index_granularity_bytes = '10Mi'; - -INSERT INTO 02354_annoy_cosine VALUES (1, [0.0, 0.0, 10.0]), (2, [0.2, 0.0, 10.0]), (3, [-0.3, 0.0, 10.0]), (4, [0.5, 0.0, 10.1]), (5, [0.8, 0.0, 10.0]), (6, [10.0, 0.0, 0.0]), (7, [9.5, 0.0, 0.0]), (8, [9.7, 0.0, 0.0]), (9, [10.2, 0.0, 0.0]), (10, [10.5, 0.0, 0.0]), (11, [0.0, 10.0, 0.0]), (12, [0.0, 9.5, 0.0]), (13, [0.0, 9.7, 0.0]), (14, [0.0, 10.2, 0.0]), (15, [0.0, 10.5, 0.0]); - -EXPLAIN indexes=1 -SELECT * -FROM 02354_annoy_cosine -WHERE cosineDistance(embedding, [0.0, 0.0, 10.0]) < 1.0 -LIMIT 3; - -EXPLAIN indexes=1 -SELECT * -FROM 02354_annoy_cosine -ORDER BY cosineDistance(embedding, [0.0, 0.0, 10.0]) -LIMIT 3; -DROP TABLE IF EXISTS 02354_annoy_cosine; -" | grep "annoy_index" From ee5b49c3fd71fddae0ca1d0e2ee96f3dadfb82f1 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 25 May 2023 21:57:17 +0000 Subject: [PATCH 0445/1072] Consolidate Annoy index tests --- .../0_stateless/02354_annoy_index.reference | 41 +++++++++++++++++++ .../queries/0_stateless/02354_annoy_index.sql | 34 +++++++++++++++ 2 files changed, 75 insertions(+) diff --git a/tests/queries/0_stateless/02354_annoy_index.reference b/tests/queries/0_stateless/02354_annoy_index.reference index 7da442cb905..5f3b523fbe4 100644 --- a/tests/queries/0_stateless/02354_annoy_index.reference +++ b/tests/queries/0_stateless/02354_annoy_index.reference @@ -1,3 +1,44 @@ +WHERE type, L2Distance +1 [0,0,10] +2 [0,0,10.5] +3 [0,0,9.5] +4 [0,0,9.7] +5 [0,0,10.2] +ORDER BY type, L2Distance +1 [0,0,10] +5 [0,0,10.2] +4 [0,0,9.7] +Reference ARRAYs with non-matching dimension are rejected +WHERE type, L2Distance, check that index is used +Expression ((Projection + Before ORDER BY)) + Limit (preliminary LIMIT (without OFFSET)) + ReadFromMergeTree (default.tab) + Indexes: + PrimaryKey + Condition: true + Parts: 1/1 + Granules: 1/1 + Skip + Name: annoy_index + Description: annoy GRANULARITY 1 + Parts: 1/1 + Granules: 1/1 +ORDER BY type, L2Distance, check that index is used +Expression (Projection) + Limit (preliminary LIMIT (without OFFSET)) + Sorting (Sorting for ORDER BY) + Expression (Before ORDER BY) + ReadFromMergeTree (default.tab) + Indexes: + PrimaryKey + Condition: true + Parts: 1/1 + Granules: 1/1 + Skip + Name: annoy_index + Description: annoy GRANULARITY 1 + Parts: 1/1 + Granules: 1/1 parameter annoy_index_search_k_nodes parameter max_limit_for_ann_queries Expression (Projection) diff --git a/tests/queries/0_stateless/02354_annoy_index.sql b/tests/queries/0_stateless/02354_annoy_index.sql index 3590b7d316e..3e1c176f3f6 100644 --- a/tests/queries/0_stateless/02354_annoy_index.sql +++ b/tests/queries/0_stateless/02354_annoy_index.sql @@ -9,6 +9,40 @@ CREATE TABLE tab(id Int32, embedding Array(Float32), INDEX annoy_index embedding -- SETTINGS index_granularity=5, index_granularity_bytes = '10Mi'; INSERT INTO tab VALUES (1, [0.0, 0.0, 10.0]), (2, [0.0, 0.0, 10.5]), (3, [0.0, 0.0, 9.5]), (4, [0.0, 0.0, 9.7]), (5, [0.0, 0.0, 10.2]), (6, [10.0, 0.0, 0.0]), (7, [9.5, 0.0, 0.0]), (8, [9.7, 0.0, 0.0]), (9, [10.2, 0.0, 0.0]), (10, [10.5, 0.0, 0.0]), (11, [0.0, 10.0, 0.0]), (12, [0.0, 9.5, 0.0]), (13, [0.0, 9.7, 0.0]), (14, [0.0, 10.2, 0.0]), (15, [0.0, 10.5, 0.0]); +SELECT 'WHERE type, L2Distance'; +SELECT * +FROM tab +WHERE L2Distance(embedding, [0.0, 0.0, 10.0]) < 1.0 +LIMIT 5; + +SELECT 'ORDER BY type, L2Distance'; +SELECT * +FROM tab +ORDER BY L2Distance(embedding, [0.0, 0.0, 10.0]) +LIMIT 3; + + +SELECT 'Reference ARRAYs with non-matching dimension are rejected'; +SELECT * +FROM tab +ORDER BY L2Distance(embedding, [0.0, 0.0]) +LIMIT 3; -- { serverError INCORRECT_QUERY } + + +SELECT 'WHERE type, L2Distance, check that index is used'; +EXPLAIN indexes=1 +SELECT * +FROM tab +WHERE L2Distance(embedding, [0.0, 0.0, 10.0]) < 1.0 +LIMIT 5; + +SELECT 'ORDER BY type, L2Distance, check that index is used'; +EXPLAIN indexes=1 +SELECT * +FROM tab +ORDER BY L2Distance(embedding, [0.0, 0.0, 10.0]) +LIMIT 3; + SELECT 'parameter annoy_index_search_k_nodes'; SELECT * FROM tab From 55256f4664c6226174fc249a652336b8d14a5251 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 25 May 2023 22:12:54 +0000 Subject: [PATCH 0446/1072] Cosmetics: Fix typo --- src/Storages/MergeTree/CommonANNIndexes.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/Storages/MergeTree/CommonANNIndexes.cpp b/src/Storages/MergeTree/CommonANNIndexes.cpp index 2e2eb4e19ea..45d86e54ea2 100644 --- a/src/Storages/MergeTree/CommonANNIndexes.cpp +++ b/src/Storages/MergeTree/CommonANNIndexes.cpp @@ -25,7 +25,7 @@ namespace { template -void extraceReferenceVectorFromLiteral(ApproximateNearestNeighborInformation::Embedding & reference_vector, Literal literal) +void extractReferenceVectorFromLiteral(ApproximateNearestNeighborInformation::Embedding & reference_vector, Literal literal) { Float64 float_element_of_reference_vector; Int64 int_element_of_reference_vector; @@ -441,13 +441,13 @@ bool ApproximateNearestNeighborCondition::matchMainParts(RPN::iterator & iter, c if (iter->function == RPNElement::FUNCTION_LITERAL_TUPLE) { - extraceReferenceVectorFromLiteral(ann_info.reference_vector, iter->tuple_literal); + extractReferenceVectorFromLiteral(ann_info.reference_vector, iter->tuple_literal); ++iter; } if (iter->function == RPNElement::FUNCTION_LITERAL_ARRAY) { - extraceReferenceVectorFromLiteral(ann_info.reference_vector, iter->array_literal); + extractReferenceVectorFromLiteral(ann_info.reference_vector, iter->array_literal); ++iter; } @@ -462,12 +462,12 @@ bool ApproximateNearestNeighborCondition::matchMainParts(RPN::iterator & iter, c ++iter; if (iter->function == RPNElement::FUNCTION_LITERAL_TUPLE) { - extraceReferenceVectorFromLiteral(ann_info.reference_vector, iter->tuple_literal); + extractReferenceVectorFromLiteral(ann_info.reference_vector, iter->tuple_literal); ++iter; } else if (iter->function == RPNElement::FUNCTION_LITERAL_ARRAY) { - extraceReferenceVectorFromLiteral(ann_info.reference_vector, iter->array_literal); + extractReferenceVectorFromLiteral(ann_info.reference_vector, iter->array_literal); ++iter; } else From 879b70a594fdc42421f04ff1f757a518b67c6956 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Fri, 26 May 2023 08:50:36 +0000 Subject: [PATCH 0447/1072] CommonANNIndexes.h/cpp --> ApproximateNearestNeighborIndexesCommon.h/cpp --- ...es.cpp => ApproximateNearestNeighborIndexesCommon.cpp} | 8 +++----- ...ndexes.h => ApproximateNearestNeighborIndexesCommon.h} | 0 src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp | 2 +- src/Storages/MergeTree/MergeTreeIndexAnnoy.h | 2 +- 4 files changed, 5 insertions(+), 7 deletions(-) rename src/Storages/MergeTree/{CommonANNIndexes.cpp => ApproximateNearestNeighborIndexesCommon.cpp} (99%) rename src/Storages/MergeTree/{CommonANNIndexes.h => ApproximateNearestNeighborIndexesCommon.h} (100%) diff --git a/src/Storages/MergeTree/CommonANNIndexes.cpp b/src/Storages/MergeTree/ApproximateNearestNeighborIndexesCommon.cpp similarity index 99% rename from src/Storages/MergeTree/CommonANNIndexes.cpp rename to src/Storages/MergeTree/ApproximateNearestNeighborIndexesCommon.cpp index 45d86e54ea2..4f0bcd7ff81 100644 --- a/src/Storages/MergeTree/CommonANNIndexes.cpp +++ b/src/Storages/MergeTree/ApproximateNearestNeighborIndexesCommon.cpp @@ -1,17 +1,15 @@ -#include -#include +#include +#include #include #include #include #include #include #include - +#include #include -#include - namespace DB { diff --git a/src/Storages/MergeTree/CommonANNIndexes.h b/src/Storages/MergeTree/ApproximateNearestNeighborIndexesCommon.h similarity index 100% rename from src/Storages/MergeTree/CommonANNIndexes.h rename to src/Storages/MergeTree/ApproximateNearestNeighborIndexesCommon.h diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp index 3d290ea12ac..fb11fabfac1 100644 --- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp @@ -45,7 +45,7 @@ #include -#include +#include namespace CurrentMetrics { diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.h b/src/Storages/MergeTree/MergeTreeIndexAnnoy.h index 95041ea31fb..0ff676cf11f 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.h +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.h @@ -2,7 +2,7 @@ #ifdef ENABLE_ANNOY -#include +#include #include #include From 8213e366fae75c7079f814944d163e2e5d9b9b54 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Fri, 26 May 2023 08:57:35 +0000 Subject: [PATCH 0448/1072] Cosmetics: Remove absolute namespace qualification of Annoy library internals, pt. II --- src/Storages/MergeTree/MergeTreeIndexAnnoy.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.h b/src/Storages/MergeTree/MergeTreeIndexAnnoy.h index 0ff676cf11f..2c78ff536ae 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.h +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.h @@ -11,9 +11,9 @@ namespace DB { template -class AnnoyIndexWithSerialization : public ::Annoy::AnnoyIndex +class AnnoyIndexWithSerialization : public Annoy::AnnoyIndex { - using Base = ::Annoy::AnnoyIndex; + using Base = Annoy::AnnoyIndex; public: explicit AnnoyIndexWithSerialization(uint64_t dim); From 0c26123fd5cf1b848a14eff86d0e5a97461d4c16 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Fri, 26 May 2023 09:39:06 +0000 Subject: [PATCH 0449/1072] Cosmetics: limit_restriction --> max_limit_for_ann_queries --- .../ApproximateNearestNeighborIndexesCommon.cpp | 13 ++++++------- .../ApproximateNearestNeighborIndexesCommon.h | 5 ++--- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/src/Storages/MergeTree/ApproximateNearestNeighborIndexesCommon.cpp b/src/Storages/MergeTree/ApproximateNearestNeighborIndexesCommon.cpp index 4f0bcd7ff81..252035f3335 100644 --- a/src/Storages/MergeTree/ApproximateNearestNeighborIndexesCommon.cpp +++ b/src/Storages/MergeTree/ApproximateNearestNeighborIndexesCommon.cpp @@ -51,12 +51,11 @@ ApproximateNearestNeighborInformation::Metric stringToMetric(std::string_view me } -ApproximateNearestNeighborCondition::ApproximateNearestNeighborCondition(const SelectQueryInfo & query_info, - ContextPtr context) : - block_with_constants(KeyCondition::getBlockWithConstants(query_info.query, query_info.syntax_analyzer_result, context)), - index_granularity(context->getMergeTreeSettings().get("index_granularity").get()), - limit_restriction(context->getSettings().get("max_limit_for_ann_queries").get()), - index_is_useful(checkQueryStructure(query_info)) +ApproximateNearestNeighborCondition::ApproximateNearestNeighborCondition(const SelectQueryInfo & query_info, ContextPtr context) + : block_with_constants(KeyCondition::getBlockWithConstants(query_info.query, query_info.syntax_analyzer_result, context)) + , index_granularity(context->getMergeTreeSettings().get("index_granularity").get()) + , max_limit_for_ann_queries(context->getSettings().get("max_limit_for_ann_queries").get()) + , index_is_useful(checkQueryStructure(query_info)) {} bool ApproximateNearestNeighborCondition::alwaysUnknownOrTrue(String metric) const @@ -167,7 +166,7 @@ bool ApproximateNearestNeighborCondition::checkQueryStructure(const SelectQueryI const bool limit_is_valid = matchRPNLimit(rpn_limit, limit); /// Query without a LIMIT clause or with a limit greater than a restriction is not supported - if (!limit_is_valid || limit_restriction < limit) + if (!limit_is_valid || max_limit_for_ann_queries < limit) return false; /// Search type query in both sections isn't supported diff --git a/src/Storages/MergeTree/ApproximateNearestNeighborIndexesCommon.h b/src/Storages/MergeTree/ApproximateNearestNeighborIndexesCommon.h index 6b094a40b26..513a86c42d8 100644 --- a/src/Storages/MergeTree/ApproximateNearestNeighborIndexesCommon.h +++ b/src/Storages/MergeTree/ApproximateNearestNeighborIndexesCommon.h @@ -79,8 +79,7 @@ struct ApproximateNearestNeighborInformation class ApproximateNearestNeighborCondition { public: - ApproximateNearestNeighborCondition(const SelectQueryInfo & query_info, - ContextPtr context); + ApproximateNearestNeighborCondition(const SelectQueryInfo & query_info, ContextPtr context); /// Returns false if query can be speeded up by an ANN index, true otherwise. bool alwaysUnknownOrTrue(String metric) const; @@ -202,7 +201,7 @@ private: // Get from settings ANNIndex parameters const UInt64 index_granularity; /// only queries with a lower limit can be considered to avoid memory overflow - const UInt64 limit_restriction; + const UInt64 max_limit_for_ann_queries; bool index_is_useful = false; }; From 146668a850234394d34c03c2d0246bbed7855549 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 29 May 2023 15:34:43 +0000 Subject: [PATCH 0450/1072] Cosmetics: query_type --> type --- .../ApproximateNearestNeighborIndexesCommon.cpp | 8 ++++---- .../ApproximateNearestNeighborIndexesCommon.h | 10 ++++++++-- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/src/Storages/MergeTree/ApproximateNearestNeighborIndexesCommon.cpp b/src/Storages/MergeTree/ApproximateNearestNeighborIndexesCommon.cpp index 252035f3335..c47e53788a7 100644 --- a/src/Storages/MergeTree/ApproximateNearestNeighborIndexesCommon.cpp +++ b/src/Storages/MergeTree/ApproximateNearestNeighborIndexesCommon.cpp @@ -69,7 +69,7 @@ bool ApproximateNearestNeighborCondition::alwaysUnknownOrTrue(String metric) con float ApproximateNearestNeighborCondition::getComparisonDistanceForWhereQuery() const { if (index_is_useful && query_information.has_value() - && query_information->query_type == ApproximateNearestNeighborInformation::Type::Where) + && query_information->type == ApproximateNearestNeighborInformation::Type::Where) return query_information->distance; throw Exception(ErrorCodes::LOGICAL_ERROR, "Not supported method for this query type"); } @@ -119,7 +119,7 @@ float ApproximateNearestNeighborCondition::getPValueForLpDistance() const ApproximateNearestNeighborInformation::Type ApproximateNearestNeighborCondition::getQueryType() const { if (index_is_useful && query_information.has_value()) - return query_information->query_type; + return query_information->type; throw Exception(ErrorCodes::LOGICAL_ERROR, "Query type was requested for useless or uninitialized index."); } @@ -326,7 +326,7 @@ void ApproximateNearestNeighborCondition::traverseOrderByAST(const ASTPtr & node bool ApproximateNearestNeighborCondition::matchRPNWhere(RPN & rpn, ApproximateNearestNeighborInformation & ann_info) { /// Fill query type field - ann_info.query_type = ApproximateNearestNeighborInformation::Type::Where; + ann_info.type = ApproximateNearestNeighborInformation::Type::Where; /// WHERE section must have at least 5 expressions /// Operator->Distance(float)->DistanceFunc->Column->Tuple(Array)Func(ReferenceVector(floats)) @@ -381,7 +381,7 @@ bool ApproximateNearestNeighborCondition::matchRPNWhere(RPN & rpn, ApproximateNe bool ApproximateNearestNeighborCondition::matchRPNOrderBy(RPN & rpn, ApproximateNearestNeighborInformation & ann_info) { /// Fill query type field - ann_info.query_type = ApproximateNearestNeighborInformation::Type::OrderBy; + ann_info.type = ApproximateNearestNeighborInformation::Type::OrderBy; // ORDER BY clause must have at least 3 expressions if (rpn.size() < 3) diff --git a/src/Storages/MergeTree/ApproximateNearestNeighborIndexesCommon.h b/src/Storages/MergeTree/ApproximateNearestNeighborIndexesCommon.h index 513a86c42d8..4fb95c3f492 100644 --- a/src/Storages/MergeTree/ApproximateNearestNeighborIndexesCommon.h +++ b/src/Storages/MergeTree/ApproximateNearestNeighborIndexesCommon.h @@ -42,7 +42,7 @@ struct ApproximateNearestNeighborInformation OrderBy, Where }; - Type query_type; + Type type; float p_for_lp_dist = -1.0; float distance = -1.0; @@ -150,7 +150,11 @@ private: }; explicit RPNElement(Function function_ = FUNCTION_UNKNOWN) - : function(function_), func_name("Unknown"), float_literal(std::nullopt), identifier(std::nullopt) {} + : function(function_) + , func_name("Unknown") + , float_literal(std::nullopt) + , identifier(std::nullopt) + {} Function function; String func_name; @@ -200,8 +204,10 @@ private: // Get from settings ANNIndex parameters const UInt64 index_granularity; + /// only queries with a lower limit can be considered to avoid memory overflow const UInt64 max_limit_for_ann_queries; + bool index_is_useful = false; }; From c94ec9f5ddb8d047c370749f3249374fb746ed06 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 29 May 2023 15:35:14 +0000 Subject: [PATCH 0451/1072] Cosmetics: Fix typo --- src/Core/Settings.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 3e10f48a2fb..df4cf5ff087 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -741,7 +741,7 @@ class IColumn; M(Bool, allow_experimental_hash_functions, false, "Enable experimental hash functions (hashid, etc)", 0) \ M(Bool, allow_experimental_object_type, false, "Allow Object and JSON data types", 0) \ M(Bool, allow_experimental_annoy_index, false, "Allows to use Annoy index. Disabled by default because this feature is experimental", 0) \ - M(UInt64, max_limit_for_ann_queries, 1'000'000, "SELECT queries with LIMIT bigger than this setting cannot use ANN indexes. Helps to prevent memory overflows in ANN search indexs.", 0) \ + M(UInt64, max_limit_for_ann_queries, 1'000'000, "SELECT queries with LIMIT bigger than this setting cannot use ANN indexes. Helps to prevent memory overflows in ANN search indexes.", 0) \ M(Int64, annoy_index_search_k_nodes, -1, "SELECT queries search up to this many nodes in Annoy indexes.", 0) \ M(Bool, throw_on_unsupported_query_inside_transaction, true, "Throw exception if unsupported query is used inside transaction", 0) \ M(TransactionsWaitCSNMode, wait_changes_become_visible_after_commit_mode, TransactionsWaitCSNMode::WAIT_UNKNOWN, "Wait for committed changes to become actually visible in the latest snapshot", 0) \ From 4e9a5331781166160feaf71ca8944a4f1b733846 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Fri, 2 Jun 2023 09:53:20 +0000 Subject: [PATCH 0452/1072] Cosmetics: condition --> ann_condition --- .../MergeTree/MergeTreeIndexAnnoy.cpp | 20 +++++++++---------- src/Storages/MergeTree/MergeTreeIndexAnnoy.h | 2 +- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp index 9dcfd421ba5..9afaba6e521 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp @@ -208,12 +208,11 @@ MergeTreeIndexConditionAnnoy::MergeTreeIndexConditionAnnoy( const SelectQueryInfo & query, const String & distance_function_, ContextPtr context) - : condition(query, context) + : ann_condition(query, context) , distance_function(distance_function_) , search_k(context->getSettings().get("annoy_index_search_k_nodes").get()) {} - bool MergeTreeIndexConditionAnnoy::mayBeTrueOnGranule(MergeTreeIndexGranulePtr /*idx_granule*/) const { throw Exception(ErrorCodes::LOGICAL_ERROR, "mayBeTrueOnGranule is not supported for ANN skip indexes"); @@ -221,7 +220,7 @@ bool MergeTreeIndexConditionAnnoy::mayBeTrueOnGranule(MergeTreeIndexGranulePtr / bool MergeTreeIndexConditionAnnoy::alwaysUnknownOrTrue() const { - return condition.alwaysUnknownOrTrue(distance_function); + return ann_condition.alwaysUnknownOrTrue(distance_function); } std::vector MergeTreeIndexConditionAnnoy::getUsefulRanges(MergeTreeIndexGranulePtr idx_granule) const @@ -234,20 +233,19 @@ std::vector MergeTreeIndexConditionAnnoy::getUsefulRanges(MergeTreeIndex throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown distance name. Must be 'L2Distance' or 'cosineDistance'. Got {}", distance_function); } - template std::vector MergeTreeIndexConditionAnnoy::getUsefulRangesImpl(MergeTreeIndexGranulePtr idx_granule) const { - UInt64 limit = condition.getLimit(); - UInt64 index_granularity = condition.getIndexGranularity(); - std::optional comp_dist = condition.getQueryType() == ApproximateNearestNeighborInformation::Type::Where - ? std::optional(condition.getComparisonDistanceForWhereQuery()) + UInt64 limit = ann_condition.getLimit(); + UInt64 index_granularity = ann_condition.getIndexGranularity(); + std::optional comp_dist = ann_condition.getQueryType() == ApproximateNearestNeighborInformation::Type::Where + ? std::optional(ann_condition.getComparisonDistanceForWhereQuery()) : std::nullopt; if (comp_dist && comp_dist.value() < 0) throw Exception(ErrorCodes::LOGICAL_ERROR, "Attempt to optimize query with where without distance"); - std::vector reference_vector = condition.getReferenceVector(); + std::vector reference_vector = ann_condition.getReferenceVector(); auto granule = std::dynamic_pointer_cast>(idx_granule); if (granule == nullptr) @@ -255,10 +253,10 @@ std::vector MergeTreeIndexConditionAnnoy::getUsefulRangesImpl(MergeTreeI auto annoy = granule->index; - if (condition.getNumOfDimensions() != annoy->getNumOfDimensions()) + if (ann_condition.getNumOfDimensions() != annoy->getNumOfDimensions()) throw Exception(ErrorCodes::INCORRECT_QUERY, "The dimension of the space in the request ({}) " "does not match with the dimension in the index ({})", - toString(condition.getNumOfDimensions()), toString(annoy->getNumOfDimensions())); + toString(ann_condition.getNumOfDimensions()), toString(annoy->getNumOfDimensions())); /// neighbors contain indexes of dots which were closest to the reference vector std::vector neighbors; diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.h b/src/Storages/MergeTree/MergeTreeIndexAnnoy.h index 2c78ff536ae..457a505d909 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.h +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.h @@ -81,7 +81,7 @@ private: template std::vector getUsefulRangesImpl(MergeTreeIndexGranulePtr idx_granule) const; - const ApproximateNearestNeighborCondition condition; + const ApproximateNearestNeighborCondition ann_condition; const String distance_function; const Int64 search_k; }; From 397715bfa501e89de741e2b8edfbafdd0cede707 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Fri, 2 Jun 2023 09:55:37 +0000 Subject: [PATCH 0453/1072] Cosmetics: comp_dist --> comparison_distance --- src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp index 9afaba6e521..7808f07511b 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp @@ -238,11 +238,11 @@ std::vector MergeTreeIndexConditionAnnoy::getUsefulRangesImpl(MergeTreeI { UInt64 limit = ann_condition.getLimit(); UInt64 index_granularity = ann_condition.getIndexGranularity(); - std::optional comp_dist = ann_condition.getQueryType() == ApproximateNearestNeighborInformation::Type::Where + std::optional comparison_distance = ann_condition.getQueryType() == ApproximateNearestNeighborInformation::Type::Where ? std::optional(ann_condition.getComparisonDistanceForWhereQuery()) : std::nullopt; - if (comp_dist && comp_dist.value() < 0) + if (comparison_distance && comparison_distance.value() < 0) throw Exception(ErrorCodes::LOGICAL_ERROR, "Attempt to optimize query with where without distance"); std::vector reference_vector = ann_condition.getReferenceVector(); @@ -269,7 +269,7 @@ std::vector MergeTreeIndexConditionAnnoy::getUsefulRangesImpl(MergeTreeI std::unordered_set granule_numbers; for (size_t i = 0; i < neighbors.size(); ++i) { - if (comp_dist && distances[i] > comp_dist) + if (comparison_distance && distances[i] > comparison_distance) continue; granule_numbers.insert(neighbors[i] / index_granularity); } From 06329fb08b8418e347d5c64a37beeb51d5504376 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 5 Jun 2023 08:35:59 +0000 Subject: [PATCH 0454/1072] Minor: Make unique by sort/erase (should be a bit faster) --- .../MergeTree/MergeTreeIndexAnnoy.cpp | 31 +++++++++---------- src/Storages/MergeTree/MergeTreeIndexAnnoy.h | 2 +- 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp index 7808f07511b..d1715b2c4c1 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp @@ -148,8 +148,8 @@ void MergeTreeIndexAggregatorAnnoy::update(const Block & block, size_t if (index_sample_block.columns() > 1) throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected block with single column"); - auto index_column_name = index_sample_block.getByPosition(0).name; - const auto & column_cut = block.getByName(index_column_name).column->cut(*pos, rows_read); + const String & index_column_name = index_sample_block.getByPosition(0).name; + ColumnPtr column_cut = block.getByName(index_column_name).column->cut(*pos, rows_read); if (const auto & column_array = typeid_cast(column_cut.get())) { @@ -204,7 +204,7 @@ void MergeTreeIndexAggregatorAnnoy::update(const Block & block, size_t MergeTreeIndexConditionAnnoy::MergeTreeIndexConditionAnnoy( - const IndexDescription & /*index*/, + const IndexDescription & /*index_description*/, const SelectQueryInfo & query, const String & distance_function_, ContextPtr context) @@ -251,35 +251,34 @@ std::vector MergeTreeIndexConditionAnnoy::getUsefulRangesImpl(MergeTreeI if (granule == nullptr) throw Exception(ErrorCodes::LOGICAL_ERROR, "Granule has the wrong type"); - auto annoy = granule->index; + const AnnoyIndexWithSerializationPtr & annoy = granule->index; if (ann_condition.getNumOfDimensions() != annoy->getNumOfDimensions()) throw Exception(ErrorCodes::INCORRECT_QUERY, "The dimension of the space in the request ({}) " - "does not match with the dimension in the index ({})", - toString(ann_condition.getNumOfDimensions()), toString(annoy->getNumOfDimensions())); + "does not match the dimension in the index ({})", + ann_condition.getNumOfDimensions(), annoy->getNumOfDimensions()); - /// neighbors contain indexes of dots which were closest to the reference vector - std::vector neighbors; + std::vector neighbors; /// indexes of dots which were closest to the reference vector std::vector distances; neighbors.reserve(limit); distances.reserve(limit); annoy->get_nns_by_vector(reference_vector.data(), limit, static_cast(search_k), &neighbors, &distances); - std::unordered_set granule_numbers; + std::vector granule_numbers; + granule_numbers.reserve(neighbors.size()); for (size_t i = 0; i < neighbors.size(); ++i) { if (comparison_distance && distances[i] > comparison_distance) continue; - granule_numbers.insert(neighbors[i] / index_granularity); + granule_numbers.push_back(neighbors[i] / index_granularity); } - std::vector result_vector; - result_vector.reserve(granule_numbers.size()); - for (auto granule_number : granule_numbers) - result_vector.push_back(granule_number); + /// make unique + std::sort(granule_numbers.begin(), granule_numbers.end()); + granule_numbers.erase(std::unique(granule_numbers.begin(), granule_numbers.end()), granule_numbers.end()); - return result_vector; + return granule_numbers; } MergeTreeIndexAnnoy::MergeTreeIndexAnnoy(const IndexDescription & index_, uint64_t trees_, const String & distance_function_) @@ -302,7 +301,7 @@ MergeTreeIndexAggregatorPtr MergeTreeIndexAnnoy::createIndexAggregator() const /// TODO: Support more metrics. Available metrics: https://github.com/spotify/annoy/blob/master/src/annoymodule.cc#L151-L171 if (distance_function == "L2Distance") return std::make_shared>(index.name, index.sample_block, trees); - if (distance_function == "cosineDistance") + else if (distance_function == "cosineDistance") return std::make_shared>(index.name, index.sample_block, trees); std::unreachable(); } diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.h b/src/Storages/MergeTree/MergeTreeIndexAnnoy.h index 457a505d909..bca06edd0f8 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.h +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.h @@ -64,7 +64,7 @@ class MergeTreeIndexConditionAnnoy final : public IMergeTreeIndexConditionApprox { public: MergeTreeIndexConditionAnnoy( - const IndexDescription & index, + const IndexDescription & index_description, const SelectQueryInfo & query, const String& distance_function, ContextPtr context); From 22110ac7427abc4feac120a1cfa5cd29850def8b Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Mon, 5 Jun 2023 11:59:49 +0200 Subject: [PATCH 0455/1072] Fix exception message --- src/Interpreters/Cache/Metadata.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Interpreters/Cache/Metadata.cpp b/src/Interpreters/Cache/Metadata.cpp index 9dff77e2af8..d0780202121 100644 --- a/src/Interpreters/Cache/Metadata.cpp +++ b/src/Interpreters/Cache/Metadata.cpp @@ -374,7 +374,7 @@ KeyMetadata::iterator LockedKey::removeFileSegment(size_t offset, const FileSegm fs::remove(path); } else if (file_segment->downloaded_size) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected path {} to exist"); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected path {} to exist", path); file_segment->detach(segment_lock, *this); return key_metadata->erase(it); From 3082029406846dd92f1d9018156fc0bf9fee8d7c Mon Sep 17 00:00:00 2001 From: ismailakpolat Date: Mon, 5 Jun 2023 13:03:17 +0300 Subject: [PATCH 0456/1072] Update rabbitmq.md Duplicate parameter name in definition --- docs/en/engines/table-engines/integrations/rabbitmq.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/en/engines/table-engines/integrations/rabbitmq.md b/docs/en/engines/table-engines/integrations/rabbitmq.md index 08062278904..7620cd22767 100644 --- a/docs/en/engines/table-engines/integrations/rabbitmq.md +++ b/docs/en/engines/table-engines/integrations/rabbitmq.md @@ -42,7 +42,6 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] [rabbitmq_queue_consume = false,] [rabbitmq_address = '',] [rabbitmq_vhost = '/',] - [rabbitmq_queue_consume = false,] [rabbitmq_username = '',] [rabbitmq_password = '',] [rabbitmq_commit_on_select = false,] From a224c8936ccebc243ab82b7338ddd93a10f3c099 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A1nos=20Benjamin=20Antal?= Date: Mon, 5 Jun 2023 10:18:07 +0000 Subject: [PATCH 0457/1072] Fix minor issues in documentation --- docs/en/operations/system-tables/processes.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/en/operations/system-tables/processes.md b/docs/en/operations/system-tables/processes.md index 2e729920ed0..ffa37357053 100644 --- a/docs/en/operations/system-tables/processes.md +++ b/docs/en/operations/system-tables/processes.md @@ -10,14 +10,14 @@ Columns: - `user` (String) – The user who made the query. Keep in mind that for distributed processing, queries are sent to remote servers under the `default` user. The field contains the username for a specific query, not for a query that this query initiated. - `address` (String) – The IP address the request was made from. The same for distributed processing. To track where a distributed query was originally made from, look at `system.processes` on the query requestor server. - `elapsed` (Float64) – The time in seconds since request execution started. -- `rows_read` (UInt64) – The number of rows read from the table. For distributed processing, on the requestor server, this is the total for all remote servers. -- `bytes_read` (UInt64) – The number of uncompressed bytes read from the table. For distributed processing, on the requestor server, this is the total for all remote servers. +- `read_rows` (UInt64) – The number of rows read from the table. For distributed processing, on the requestor server, this is the total for all remote servers. +- `read_bytes` (UInt64) – The number of uncompressed bytes read from the table. For distributed processing, on the requestor server, this is the total for all remote servers. - `total_rows_approx` (UInt64) – The approximation of the total number of rows that should be read. For distributed processing, on the requestor server, this is the total for all remote servers. It can be updated during request processing, when new sources to process become known. -- `memory_usage` (UInt64) – Amount of RAM the request uses. It might not include some types of dedicated memory. See the [max_memory_usage](../../operations/settings/query-complexity.md#settings_max_memory_usage) setting. +- `memory_usage` (Int64) – Amount of RAM the request uses. It might not include some types of dedicated memory. See the [max_memory_usage](../../operations/settings/query-complexity.md#settings_max_memory_usage) setting. - `query` (String) – The query text. For `INSERT`, it does not include the data to insert. - `query_id` (String) – Query ID, if defined. -- `is_cancelled` (Int8) – Was query cancelled. -- `is_all_data_sent` (Int8) – Was all data sent to the client (in other words query had been finished on the server). +- `is_cancelled` (UInt8) – Was query cancelled. +- `is_all_data_sent` (UInt8) – Was all data sent to the client (in other words query had been finished on the server). ```sql SELECT * FROM system.processes LIMIT 10 FORMAT Vertical; From 256f713d6b75bf971203f24bb6db0fcab6fa9aec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A1nos=20Benjamin=20Antal?= Date: Mon, 5 Jun 2023 10:18:25 +0000 Subject: [PATCH 0458/1072] Add docs for `system.user_processes` --- .../system-tables/user_processes.md | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 docs/en/operations/system-tables/user_processes.md diff --git a/docs/en/operations/system-tables/user_processes.md b/docs/en/operations/system-tables/user_processes.md new file mode 100644 index 00000000000..a9b97390ed6 --- /dev/null +++ b/docs/en/operations/system-tables/user_processes.md @@ -0,0 +1,28 @@ +--- +slug: /en/operations/system-tables/user_processes +--- +# user_processes + +This system table is used for implementing the `SHOW USER PROCESSES` query. + +Columns: + +- `user` ([String](../../sql-reference/data-types/string.md)) — User name. +- `memory_usage` ([Int64](../../sql-reference/data-types/int-uint#int-ranges)) – Sum of RAM used by all processes of the user. It might not include some types of dedicated memory. See the [max_memory_usage](../../operations/settings/query-complexity.md#settings_max_memory_usage) setting. +- `peak_memory_usage` ([Int64](../../sql-reference/data-types/int-uint#int-ranges)) — The peak of memory usage of the user. It can be reset when no queries are run for the user. +- `ProfileEvents` ([Map(String, UInt64)](../../sql-reference/data-types/map)) – Summary of ProfileEvents that measure different metrics for the user. The description of them could be found in the table [system.events](../../operations/system-tables/events.md#system_tables-events) + +```sql +SELECT * FROM system.user_processes LIMIT 10 FORMAT Vertical; +``` + +```response +Row 1: +────── +user: default +memory_usage: 9832 +peak_memory_usage: 9832 +ProfileEvents: {'Query':5,'SelectQuery':5,'QueriesWithSubqueries':38,'SelectQueriesWithSubqueries':38,'QueryTimeMicroseconds':842048,'SelectQueryTimeMicroseconds':842048,'ReadBufferFromFileDescriptorRead':6,'ReadBufferFromFileDescriptorReadBytes':234,'IOBufferAllocs':3,'IOBufferAllocBytes':98493,'ArenaAllocChunks':283,'ArenaAllocBytes':1482752,'FunctionExecute':670,'TableFunctionExecute':16,'DiskReadElapsedMicroseconds':19,'NetworkSendElapsedMicroseconds':684,'NetworkSendBytes':139498,'SelectedRows':6076,'SelectedBytes':685802,'ContextLock':1140,'RWLockAcquiredReadLocks':193,'RWLockReadersWaitMilliseconds':4,'RealTimeMicroseconds':1585163,'UserTimeMicroseconds':889767,'SystemTimeMicroseconds':13630,'SoftPageFaults':1947,'OSCPUWaitMicroseconds':6,'OSCPUVirtualTimeMicroseconds':903251,'OSReadChars':28631,'OSWriteChars':28888,'QueryProfilerRuns':3,'LogTrace':79,'LogDebug':24} + +1 row in set. Elapsed: 0.010 sec. +``` From e140cad10c0e0ab1a3a291b0ceef37a72bcb4295 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A1nos=20Benjamin=20Antal?= Date: Mon, 5 Jun 2023 10:18:36 +0000 Subject: [PATCH 0459/1072] Clean up includes --- src/Storages/System/StorageSystemUserProcesses.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/Storages/System/StorageSystemUserProcesses.cpp b/src/Storages/System/StorageSystemUserProcesses.cpp index 5973f9e2af3..de34fede0ac 100644 --- a/src/Storages/System/StorageSystemUserProcesses.cpp +++ b/src/Storages/System/StorageSystemUserProcesses.cpp @@ -2,7 +2,6 @@ #include #include #include -#include #include #include #include @@ -10,8 +9,6 @@ #include #include #include -#include -#include namespace DB From cdb5997339f20f27d6c6ad628c341579ffcf9264 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 5 Jun 2023 10:22:34 +0000 Subject: [PATCH 0460/1072] Cosmetics: add assert --- src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp | 2 -- src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp | 4 +++- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp index fb11fabfac1..c6b7232be4a 100644 --- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp @@ -1693,9 +1693,7 @@ MarkRanges MergeTreeDataSelectExecutor::filterMarksUsingIndex( // vector of indexes of useful ranges auto result = ann_condition->getUsefulRanges(granule); if (result.empty()) - { ++granules_dropped; - } for (auto range : result) { diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp index d1715b2c4c1..cf7fbb3bab3 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp @@ -251,7 +251,7 @@ std::vector MergeTreeIndexConditionAnnoy::getUsefulRangesImpl(MergeTreeI if (granule == nullptr) throw Exception(ErrorCodes::LOGICAL_ERROR, "Granule has the wrong type"); - const AnnoyIndexWithSerializationPtr & annoy = granule->index; + AnnoyIndexWithSerializationPtr annoy = granule->index; if (ann_condition.getNumOfDimensions() != annoy->getNumOfDimensions()) throw Exception(ErrorCodes::INCORRECT_QUERY, "The dimension of the space in the request ({}) " @@ -265,6 +265,8 @@ std::vector MergeTreeIndexConditionAnnoy::getUsefulRangesImpl(MergeTreeI annoy->get_nns_by_vector(reference_vector.data(), limit, static_cast(search_k), &neighbors, &distances); + chassert(neighbors.size() == distances.size()); + std::vector granule_numbers; granule_numbers.reserve(neighbors.size()); for (size_t i = 0; i < neighbors.size(); ++i) From 7d659fdca2bf80c6f045333f531a8a4f058d33d8 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 5 Jun 2023 10:26:45 +0000 Subject: [PATCH 0461/1072] Tests: Decrease granule size --- tests/queries/0_stateless/02354_annoy_index.reference | 10 +++++----- tests/queries/0_stateless/02354_annoy_index.sql | 7 +------ 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/tests/queries/0_stateless/02354_annoy_index.reference b/tests/queries/0_stateless/02354_annoy_index.reference index 5f3b523fbe4..5b37c32d914 100644 --- a/tests/queries/0_stateless/02354_annoy_index.reference +++ b/tests/queries/0_stateless/02354_annoy_index.reference @@ -17,12 +17,12 @@ Expression ((Projection + Before ORDER BY)) PrimaryKey Condition: true Parts: 1/1 - Granules: 1/1 + Granules: 3/3 Skip Name: annoy_index Description: annoy GRANULARITY 1 Parts: 1/1 - Granules: 1/1 + Granules: 1/3 ORDER BY type, L2Distance, check that index is used Expression (Projection) Limit (preliminary LIMIT (without OFFSET)) @@ -33,12 +33,12 @@ Expression (Projection) PrimaryKey Condition: true Parts: 1/1 - Granules: 1/1 + Granules: 3/3 Skip Name: annoy_index Description: annoy GRANULARITY 1 Parts: 1/1 - Granules: 1/1 + Granules: 3/3 parameter annoy_index_search_k_nodes parameter max_limit_for_ann_queries Expression (Projection) @@ -50,5 +50,5 @@ Expression (Projection) PrimaryKey Condition: true Parts: 1/1 - Granules: 1/1 + Granules: 3/3 Negative tests diff --git a/tests/queries/0_stateless/02354_annoy_index.sql b/tests/queries/0_stateless/02354_annoy_index.sql index 3e1c176f3f6..2c40653667d 100644 --- a/tests/queries/0_stateless/02354_annoy_index.sql +++ b/tests/queries/0_stateless/02354_annoy_index.sql @@ -3,10 +3,7 @@ SET allow_experimental_annoy_index = 1; DROP TABLE IF EXISTS tab; - -DROP TABLE IF EXISTS tab; -CREATE TABLE tab(id Int32, embedding Array(Float32), INDEX annoy_index embedding TYPE annoy()) ENGINE = MergeTree ORDER BY id; --- SETTINGS index_granularity=5, index_granularity_bytes = '10Mi'; +CREATE TABLE tab(id Int32, embedding Array(Float32), INDEX annoy_index embedding TYPE annoy()) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity=5; INSERT INTO tab VALUES (1, [0.0, 0.0, 10.0]), (2, [0.0, 0.0, 10.5]), (3, [0.0, 0.0, 9.5]), (4, [0.0, 0.0, 9.7]), (5, [0.0, 0.0, 10.2]), (6, [10.0, 0.0, 0.0]), (7, [9.5, 0.0, 0.0]), (8, [9.7, 0.0, 0.0]), (9, [10.2, 0.0, 0.0]), (10, [10.5, 0.0, 0.0]), (11, [0.0, 10.0, 0.0]), (12, [0.0, 9.5, 0.0]), (13, [0.0, 9.7, 0.0]), (14, [0.0, 10.2, 0.0]), (15, [0.0, 10.5, 0.0]); SELECT 'WHERE type, L2Distance'; @@ -21,14 +18,12 @@ FROM tab ORDER BY L2Distance(embedding, [0.0, 0.0, 10.0]) LIMIT 3; - SELECT 'Reference ARRAYs with non-matching dimension are rejected'; SELECT * FROM tab ORDER BY L2Distance(embedding, [0.0, 0.0]) LIMIT 3; -- { serverError INCORRECT_QUERY } - SELECT 'WHERE type, L2Distance, check that index is used'; EXPLAIN indexes=1 SELECT * From de503dc5d20bb8ddd8816c9639686c2c6089fdbe Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 5 Jun 2023 10:49:45 +0000 Subject: [PATCH 0462/1072] Add tests for tuple + add tests for custom distance metric / tree count --- .../0_stateless/02354_annoy_index.reference | 67 ++++++++++++++- .../queries/0_stateless/02354_annoy_index.sql | 81 +++++++++++++++++-- 2 files changed, 140 insertions(+), 8 deletions(-) diff --git a/tests/queries/0_stateless/02354_annoy_index.reference b/tests/queries/0_stateless/02354_annoy_index.reference index 5b37c32d914..5bd1377d6f4 100644 --- a/tests/queries/0_stateless/02354_annoy_index.reference +++ b/tests/queries/0_stateless/02354_annoy_index.reference @@ -1,3 +1,4 @@ +--- Test with Array --- WHERE type, L2Distance 1 [0,0,10] 2 [0,0,10.5] @@ -51,4 +52,68 @@ Expression (Projection) Condition: true Parts: 1/1 Granules: 3/3 -Negative tests +--- Test with Tuple --- +WHERE type, L2Distance +1 (0,0,10) +2 (0,0,10.5) +3 (0,0,9.5) +4 (0,0,9.7) +5 (0,0,10.2) +ORDER BY type, L2Distance +1 (0,0,10) +5 (0,0,10.2) +4 (0,0,9.7) +WHERE type, L2Distance, check that index is used +Expression ((Projection + Before ORDER BY)) + Limit (preliminary LIMIT (without OFFSET)) + ReadFromMergeTree (default.tab) + Indexes: + PrimaryKey + Condition: true + Parts: 1/1 + Granules: 3/3 + Skip + Name: annoy_index + Description: annoy GRANULARITY 1 + Parts: 1/1 + Granules: 1/3 +ORDER BY type, L2Distance, check that index is used +Expression (Projection) + Limit (preliminary LIMIT (without OFFSET)) + Sorting (Sorting for ORDER BY) + Expression (Before ORDER BY) + ReadFromMergeTree (default.tab) + Indexes: + PrimaryKey + Condition: true + Parts: 1/1 + Granules: 3/3 + Skip + Name: annoy_index + Description: annoy GRANULARITY 1 + Parts: 1/1 + Granules: 3/3 +parameter annoy_index_search_k_nodes +parameter max_limit_for_ann_queries +Expression (Projection) + Limit (preliminary LIMIT (without OFFSET)) + Sorting (Sorting for ORDER BY) + Expression (Before ORDER BY) + ReadFromMergeTree (default.tab) + Indexes: + PrimaryKey + Condition: true + Parts: 1/1 + Granules: 3/3 +--- Test alternative metric (cosine distance) and non-default NumTrees --- +WHERE type, L2Distance +1 [0,0,10] +2 [0,0,10.5] +3 [0,0,9.5] +4 [0,0,9.7] +5 [0,0,10.2] +ORDER BY type, L2Distance +1 [0,0,10] +5 [0,0,10.2] +4 [0,0,9.7] +--- Negative tests --- diff --git a/tests/queries/0_stateless/02354_annoy_index.sql b/tests/queries/0_stateless/02354_annoy_index.sql index 2c40653667d..170c048d420 100644 --- a/tests/queries/0_stateless/02354_annoy_index.sql +++ b/tests/queries/0_stateless/02354_annoy_index.sql @@ -2,6 +2,8 @@ SET allow_experimental_annoy_index = 1; +SELECT '--- Test with Array ---'; + DROP TABLE IF EXISTS tab; CREATE TABLE tab(id Int32, embedding Array(Float32), INDEX annoy_index embedding TYPE annoy()) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity=5; INSERT INTO tab VALUES (1, [0.0, 0.0, 10.0]), (2, [0.0, 0.0, 10.5]), (3, [0.0, 0.0, 9.5]), (4, [0.0, 0.0, 9.7]), (5, [0.0, 0.0, 10.2]), (6, [10.0, 0.0, 0.0]), (7, [9.5, 0.0, 0.0]), (8, [9.7, 0.0, 0.0]), (9, [10.2, 0.0, 0.0]), (10, [10.5, 0.0, 0.0]), (11, [0.0, 10.0, 0.0]), (12, [0.0, 9.5, 0.0]), (13, [0.0, 9.7, 0.0]), (14, [0.0, 10.2, 0.0]), (15, [0.0, 10.5, 0.0]); @@ -55,19 +57,87 @@ SETTINGS max_limit_for_ann_queries=2; -- doesn't use the ann index DROP TABLE tab; -DROP TABLE IF EXISTS tab; +SELECT '--- Test with Tuple ---'; -SELECT 'Negative tests'; +CREATE TABLE tab(id Int32, embedding Tuple(Float32, Float32, Float32), INDEX annoy_index embedding TYPE annoy()) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity=5; +INSERT INTO tab VALUES (1, (0.0, 0.0, 10.0)), (2, (0.0, 0.0, 10.5)), (3, (0.0, 0.0, 9.5)), (4, (0.0, 0.0, 9.7)), (5, (0.0, 0.0, 10.2)), (6, (10.0, 0.0, 0.0)), (7, (9.5, 0.0, 0.0)), (8, (9.7, 0.0, 0.0)), (9, (10.2, 0.0, 0.0)), (10, (10.5, 0.0, 0.0)), (11, (0.0, 10.0, 0.0)), (12, (0.0, 9.5, 0.0)), (13, (0.0, 9.7, 0.0)), (14, (0.0, 10.2, 0.0)), (15, (0.0, 10.5, 0.0)); + +SELECT 'WHERE type, L2Distance'; +SELECT * +FROM tab +WHERE L2Distance(embedding, (0.0, 0.0, 10.0)) < 1.0 +LIMIT 5; + +SELECT 'ORDER BY type, L2Distance'; +SELECT * +FROM tab +ORDER BY L2Distance(embedding, (0.0, 0.0, 10.0)) +LIMIT 3; + +SELECT 'WHERE type, L2Distance, check that index is used'; +EXPLAIN indexes=1 +SELECT * +FROM tab +WHERE L2Distance(embedding, (0.0, 0.0, 10.0)) < 1.0 +LIMIT 5; + +SELECT 'ORDER BY type, L2Distance, check that index is used'; +EXPLAIN indexes=1 +SELECT * +FROM tab +ORDER BY L2Distance(embedding, (0.0, 0.0, 10.0)) +LIMIT 3; + +SELECT 'parameter annoy_index_search_k_nodes'; +SELECT * +FROM tab +ORDER BY L2Distance(embedding, (5.3, 7.3, 2.1)) +LIMIT 5 +SETTINGS annoy_index_search_k_nodes=0; -- searches zero nodes --> no results + +SELECT 'parameter max_limit_for_ann_queries'; +EXPLAIN indexes=1 +SELECT * +FROM tab +ORDER BY L2Distance(embedding, (5.3, 7.3, 2.1)) +LIMIT 5 +SETTINGS max_limit_for_ann_queries=2; -- doesn't use the ann index + +DROP TABLE tab; + +SELECT '--- Test alternative metric (cosine distance) and non-default NumTrees ---'; + +CREATE TABLE tab(id Int32, embedding Array(Float32), INDEX annoy_index embedding TYPE annoy('cosineDistance', 200)) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity=5; +INSERT INTO tab VALUES (1, [0.0, 0.0, 10.0]), (2, [0.0, 0.0, 10.5]), (3, [0.0, 0.0, 9.5]), (4, [0.0, 0.0, 9.7]), (5, [0.0, 0.0, 10.2]), (6, [10.0, 0.0, 0.0]), (7, [9.5, 0.0, 0.0]), (8, [9.7, 0.0, 0.0]), (9, [10.2, 0.0, 0.0]), (10, [10.5, 0.0, 0.0]), (11, [0.0, 10.0, 0.0]), (12, [0.0, 9.5, 0.0]), (13, [0.0, 9.7, 0.0]), (14, [0.0, 10.2, 0.0]), (15, [0.0, 10.5, 0.0]); + +SELECT 'WHERE type, L2Distance'; +SELECT * +FROM tab +WHERE L2Distance(embedding, [0.0, 0.0, 10.0]) < 1.0 +LIMIT 5; + +SELECT 'ORDER BY type, L2Distance'; +SELECT * +FROM tab +ORDER BY L2Distance(embedding, [0.0, 0.0, 10.0]) +LIMIT 3; + +DROP TABLE tab; + +SELECT '--- Negative tests ---'; -- must have at most 2 arguments CREATE TABLE tab(id Int32, embedding Array(Float32), INDEX annoy_index embedding TYPE annoy('too', 'many', 'arguments')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } --- first argument must be UInt64 +-- first argument (distance_function) must be String CREATE TABLE tab(id Int32, embedding Array(Float32), INDEX annoy_index embedding TYPE annoy(3)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } --- 2nd argument must be String +-- 2nd argument (number of trees) must be UInt64 CREATE TABLE tab(id Int32, embedding Array(Float32), INDEX annoy_index embedding TYPE annoy('L2Distance', 'not an UInt64')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } +-- reject unsupported distance functions +CREATE TABLE tab(id Int32, embedding Array(Float32), INDEX annoy_index embedding TYPE annoy('wormholeDistance')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_DATA } + -- must be created on single column CREATE TABLE tab(id Int32, embedding Array(Float32), INDEX annoy_index (embedding, id) TYPE annoy()) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_NUMBER_OF_COLUMNS } @@ -77,6 +147,3 @@ CREATE TABLE tab(id Int32, embedding Float32, INDEX annoy_index embedding TYPE a CREATE TABLE tab(id Int32, embedding Array(Float64), INDEX annoy_index embedding TYPE annoy()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN } CREATE TABLE tab(id Int32, embedding LowCardinality(Float32), INDEX annoy_index embedding TYPE annoy()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN } CREATE TABLE tab(id Int32, embedding Nullable(Float32), INDEX annoy_index embedding TYPE annoy()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN } - --- reject unsupported distance functions -CREATE TABLE tab(id Int32, embedding Array(Float32), INDEX annoy_index embedding TYPE annoy('wormholeDistance')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_DATA } From 8f60423daa828d9f4ce8f13b90e1c42de22f9edd Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 5 Jun 2023 10:55:05 +0000 Subject: [PATCH 0463/1072] Cosmetics: more constness --- src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp | 12 ++++++------ src/Storages/MergeTree/MergeTreeIndexAnnoy.h | 5 +---- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp index cf7fbb3bab3..72dd92ead5e 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp @@ -236,22 +236,22 @@ std::vector MergeTreeIndexConditionAnnoy::getUsefulRanges(MergeTreeIndex template std::vector MergeTreeIndexConditionAnnoy::getUsefulRangesImpl(MergeTreeIndexGranulePtr idx_granule) const { - UInt64 limit = ann_condition.getLimit(); - UInt64 index_granularity = ann_condition.getIndexGranularity(); - std::optional comparison_distance = ann_condition.getQueryType() == ApproximateNearestNeighborInformation::Type::Where + const UInt64 limit = ann_condition.getLimit(); + const UInt64 index_granularity = ann_condition.getIndexGranularity(); + const std::optional comparison_distance = ann_condition.getQueryType() == ApproximateNearestNeighborInformation::Type::Where ? std::optional(ann_condition.getComparisonDistanceForWhereQuery()) : std::nullopt; if (comparison_distance && comparison_distance.value() < 0) throw Exception(ErrorCodes::LOGICAL_ERROR, "Attempt to optimize query with where without distance"); - std::vector reference_vector = ann_condition.getReferenceVector(); + const std::vector reference_vector = ann_condition.getReferenceVector(); - auto granule = std::dynamic_pointer_cast>(idx_granule); + const auto granule = std::dynamic_pointer_cast>(idx_granule); if (granule == nullptr) throw Exception(ErrorCodes::LOGICAL_ERROR, "Granule has the wrong type"); - AnnoyIndexWithSerializationPtr annoy = granule->index; + const AnnoyIndexWithSerializationPtr annoy = granule->index; if (ann_condition.getNumOfDimensions() != annoy->getNumOfDimensions()) throw Exception(ErrorCodes::INCORRECT_QUERY, "The dimension of the space in the request ({}) " diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.h b/src/Storages/MergeTree/MergeTreeIndexAnnoy.h index bca06edd0f8..5204ff07b27 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.h +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.h @@ -66,15 +66,13 @@ public: MergeTreeIndexConditionAnnoy( const IndexDescription & index_description, const SelectQueryInfo & query, - const String& distance_function, + const String & distance_function, ContextPtr context); ~MergeTreeIndexConditionAnnoy() override = default; bool alwaysUnknownOrTrue() const override; - bool mayBeTrueOnGranule(MergeTreeIndexGranulePtr idx_granule) const override; - std::vector getUsefulRanges(MergeTreeIndexGranulePtr idx_granule) const override; private: @@ -97,7 +95,6 @@ public: MergeTreeIndexGranulePtr createIndexGranule() const override; MergeTreeIndexAggregatorPtr createIndexAggregator() const override; - MergeTreeIndexConditionPtr createIndexCondition(const SelectQueryInfo & query, ContextPtr context) const override; bool mayBenefitFromIndexForIn(const ASTPtr & /*node*/) const override { return false; } From d9a6e36685f71ed592af24a7e951272a688b9eea Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Mon, 5 Jun 2023 11:02:23 +0000 Subject: [PATCH 0464/1072] Add comments --- src/Analyzer/TableNode.cpp | 3 +++ src/Planner/PlannerJoinTree.cpp | 3 +++ src/Planner/Utils.cpp | 3 +++ src/Storages/StorageDistributed.cpp | 5 +++-- 4 files changed, 12 insertions(+), 2 deletions(-) diff --git a/src/Analyzer/TableNode.cpp b/src/Analyzer/TableNode.cpp index 17d12bd6afa..f899c1ae6fe 100644 --- a/src/Analyzer/TableNode.cpp +++ b/src/Analyzer/TableNode.cpp @@ -91,6 +91,9 @@ ASTPtr TableNode::toASTImpl(const ConvertToASTOptions & /* options */) const if (!temporary_table_name.empty()) return std::make_shared(temporary_table_name); + // In case of cross-replication we don't know what database is used for the table. + // `storage_id.hasDatabase()` can return false only on the initiator node. + // Each shard will use the default database (in the case of cross-replication shards may have different defaults). if (!storage_id.hasDatabase()) return std::make_shared(storage_id.getTableName()); return std::make_shared(storage_id.getDatabaseName(), storage_id.getTableName()); diff --git a/src/Planner/PlannerJoinTree.cpp b/src/Planner/PlannerJoinTree.cpp index 4a1708f96d3..8bd674f5b67 100644 --- a/src/Planner/PlannerJoinTree.cpp +++ b/src/Planner/PlannerJoinTree.cpp @@ -106,6 +106,9 @@ void checkAccessRights(const TableNode & table_node, const Names & column_names, storage_id.getFullTableName()); } + // In case of cross-replication we don't know what database is used for the table. + // `storage_id.hasDatabase()` can return false only on the initiator node. + // Each shard will use the default database (in the case of cross-replication shards may have different defaults). if (storage_id.hasDatabase()) query_context->checkAccess(AccessType::SELECT, storage_id, column_names); } diff --git a/src/Planner/Utils.cpp b/src/Planner/Utils.cpp index cd4fb9182e9..94f3bbf6440 100644 --- a/src/Planner/Utils.cpp +++ b/src/Planner/Utils.cpp @@ -107,6 +107,9 @@ Block buildCommonHeaderForUnion(const Blocks & queries_headers, SelectUnionMode ASTPtr queryNodeToSelectQuery(const QueryTreeNodePtr & query_node) { auto & query_node_typed = query_node->as(); + + // In case of cross-replication we don't know what database is used for the table. + // Each shard will use the default database (in the case of cross-replication shards may have different defaults). auto result_ast = query_node_typed.toAST({ .fully_qualified_identifiers = false }); while (true) diff --git a/src/Storages/StorageDistributed.cpp b/src/Storages/StorageDistributed.cpp index b9625ce2ab7..969c117cb28 100644 --- a/src/Storages/StorageDistributed.cpp +++ b/src/Storages/StorageDistributed.cpp @@ -30,8 +30,6 @@ #include #include #include -#include "Analyzer/IQueryTreeNode.h" -#include "Analyzer/MatcherNode.h" #include #include @@ -946,6 +944,9 @@ QueryTreeNodePtr buildQueryTreeDistributed(SelectQueryInfo & query_info, else { auto resolved_remote_storage_id = remote_storage_id; + // In case of cross-replication we don't know what database is used for the table. + // `storage_id.hasDatabase()` can return false only on the initiator node. + // Each shard will use the default database (in the case of cross-replication shards may have different defaults). if (remote_storage_id.hasDatabase()) resolved_remote_storage_id = query_context->resolveStorageID(remote_storage_id); From f3a8517a447daeafd6c5bad0b819a2122cf6161c Mon Sep 17 00:00:00 2001 From: kssenii Date: Mon, 5 Jun 2023 13:07:07 +0200 Subject: [PATCH 0465/1072] Fix --- src/Interpreters/Cache/FileCache.cpp | 2 +- src/Interpreters/Cache/Metadata.cpp | 10 ++++++++++ src/Interpreters/Cache/Metadata.h | 2 +- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/src/Interpreters/Cache/FileCache.cpp b/src/Interpreters/Cache/FileCache.cpp index 65dca790183..5ccbe6ad72d 100644 --- a/src/Interpreters/Cache/FileCache.cpp +++ b/src/Interpreters/Cache/FileCache.cpp @@ -123,7 +123,7 @@ FileSegments FileCache::getImpl(const LockedKey & locked_key, const FileSegment: auto add_to_result = [&](const FileSegmentMetadata & file_segment_metadata) { FileSegmentPtr file_segment; - if (file_segment_metadata.valid()) + if (!file_segment_metadata.evicting()) { file_segment = file_segment_metadata.file_segment; if (file_segment->isDownloaded()) diff --git a/src/Interpreters/Cache/Metadata.cpp b/src/Interpreters/Cache/Metadata.cpp index 843ffd45b63..fea552c4071 100644 --- a/src/Interpreters/Cache/Metadata.cpp +++ b/src/Interpreters/Cache/Metadata.cpp @@ -346,6 +346,16 @@ void LockedKey::removeAllReleasable() ++it; continue; } + else if (it->second.evicting()) + { + /// File segment is currently a removal candidate, + /// we do not know if it will be removed or not yet, + /// but its size is currently accounted as potentially removed, + /// so if we remove file segment now, we break the freeable_count + /// calculation in tryReserve. + ++it; + continue; + } auto file_segment = it->second->file_segment; it = removeFileSegment(file_segment->offset(), file_segment->lock()); diff --git a/src/Interpreters/Cache/Metadata.h b/src/Interpreters/Cache/Metadata.h index 2e015b07ed0..4732123fabc 100644 --- a/src/Interpreters/Cache/Metadata.h +++ b/src/Interpreters/Cache/Metadata.h @@ -22,7 +22,7 @@ struct FileSegmentMetadata : private boost::noncopyable size_t size() const; - bool valid() const { return !removal_candidate.load(); } + bool evicting() const { return !removal_candidate.load(); } Priority::Iterator getQueueIterator() const { return file_segment->getQueueIterator(); } From 2866bac08932a841595fb788e84bc9be3b07a32f Mon Sep 17 00:00:00 2001 From: Alexander Sapin Date: Mon, 5 Jun 2023 14:03:19 +0200 Subject: [PATCH 0466/1072] Add named collections and remove host filter support --- src/Storages/StorageAzure.cpp | 78 ++++++++++++++++++- src/Storages/StorageAzure.h | 9 +-- .../configs/named_collections.xml | 15 ++++ .../test_storage_azure_blob_storage/test.py | 14 ++++ 4 files changed, 106 insertions(+), 10 deletions(-) create mode 100644 tests/integration/test_storage_azure_blob_storage/configs/named_collections.xml diff --git a/src/Storages/StorageAzure.cpp b/src/Storages/StorageAzure.cpp index 7b4bc9e6769..8ce8a923c33 100644 --- a/src/Storages/StorageAzure.cpp +++ b/src/Storages/StorageAzure.cpp @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -45,11 +46,61 @@ namespace ErrorCodes extern const int DATABASE_ACCESS_DENIED; } +namespace +{ + +static const std::unordered_set required_configuration_keys = { + "blob_path", + "container", +}; + +static const std::unordered_set optional_configuration_keys = { + "format", + "compression", + "compression_method", + "account_name", + "account_key", + "connection_string", + "storage_account_url", +}; + bool isConnectionString(const std::string & candidate) { return candidate.starts_with("DefaultEndpointsProtocol"); } + +void processNamedCollectionResult(StorageAzure::Configuration & configuration, const NamedCollection & collection) +{ + validateNamedCollection(collection, required_configuration_keys, optional_configuration_keys); + + if (collection.has("connection_string")) + { + configuration.connection_url = collection.get("connection_string"); + configuration.is_connection_string = true; + } + + if (collection.has("storage_account_url")) + { + configuration.connection_url = collection.get("storage_account_url"); + configuration.is_connection_string = false; + } + + configuration.container = collection.get("container"); + configuration.blob_path = collection.get("blob_path"); + + if (collection.has("account_name")) + configuration.account_name = collection.get("account_name"); + + if (collection.has("account_key")) + configuration.account_key = collection.get("account_key"); + + configuration.format = collection.getOrDefault("format", configuration.format); + configuration.compression_method = collection.getOrDefault("compression_method", collection.getOrDefault("compression", "auto")); +} + +} + StorageAzure::Configuration StorageAzure::getConfiguration(ASTs & engine_args, ContextPtr local_context, bool get_format_from_file) { StorageAzure::Configuration configuration; @@ -57,6 +108,19 @@ StorageAzure::Configuration StorageAzure::getConfiguration(ASTs & engine_args, C /// Supported signatures: /// /// Azure(connection_string|storage_account_url, container_name, blobpath, [account_name, account_key, format, compression]) + /// + + if (auto named_collection = tryGetNamedCollectionWithOverrides(engine_args, local_context)) + { + processNamedCollectionResult(configuration, *named_collection); + + configuration.blobs_paths = {configuration.blob_path}; + + if (configuration.format == "auto" && get_format_from_file) + configuration.format = FormatFactory::instance().getFormatFromFileName(configuration.blob_path, true); + + return configuration; + } if (engine_args.size() < 3 || engine_args.size() > 7) throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, @@ -274,10 +338,20 @@ AzureClientPtr StorageAzure::createClient(StorageAzure::Configuration configurat return result; } +Poco::URI StorageAzure::Configuration::getConnectionURL() const +{ + if (!is_connection_string) + return Poco::URI(connection_url); + + auto parsed_connection_string = Azure::Storage::_internal::ParseConnectionString(connection_url); + return Poco::URI(parsed_connection_string.BlobServiceUrl.GetAbsoluteUrl()); +} + + StorageAzure::StorageAzure( const Configuration & configuration_, std::unique_ptr && object_storage_, - ContextPtr, + ContextPtr context, const StorageID & table_id_, const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, @@ -293,7 +367,7 @@ StorageAzure::StorageAzure( , partition_by(partition_by_) { FormatFactory::instance().checkFormatName(configuration.format); - //context_->getGlobalContext()->getRemoteHostFilter().checkURL(Poco::URI(configuration.getConnectionURL())); + context->getGlobalContext()->getRemoteHostFilter().checkURL(configuration.getConnectionURL()); StorageInMemoryMetadata storage_metadata; if (columns_.empty()) diff --git a/src/Storages/StorageAzure.h b/src/Storages/StorageAzure.h index 6bf18d91265..03a9abf2ce6 100644 --- a/src/Storages/StorageAzure.h +++ b/src/Storages/StorageAzure.h @@ -51,13 +51,7 @@ public: return blobs_paths.back().find(PARTITION_ID_WILDCARD) != String::npos; } - std::string getConnectionURL() const - { - if (!is_connection_string) - return connection_url; - - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Connection string not implemented yet"); - } + Poco::URI getConnectionURL() const; std::string connection_url; bool is_connection_string; @@ -121,7 +115,6 @@ private: const bool distributed_processing; std::optional format_settings; ASTPtr partition_by; - }; } diff --git a/tests/integration/test_storage_azure_blob_storage/configs/named_collections.xml b/tests/integration/test_storage_azure_blob_storage/configs/named_collections.xml new file mode 100644 index 00000000000..dc70895bc05 --- /dev/null +++ b/tests/integration/test_storage_azure_blob_storage/configs/named_collections.xml @@ -0,0 +1,15 @@ + + + + DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://azurite1:10000/devstoreaccount1; + cont + test_simple_write_named.csv + CSV + + + http://azurite1:10000/devstoreaccount1 + devstoreaccount1 + Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw== + + + diff --git a/tests/integration/test_storage_azure_blob_storage/test.py b/tests/integration/test_storage_azure_blob_storage/test.py index 94b059fe4fe..11404602f58 100644 --- a/tests/integration/test_storage_azure_blob_storage/test.py +++ b/tests/integration/test_storage_azure_blob_storage/test.py @@ -23,6 +23,7 @@ def cluster(): cluster = ClickHouseCluster(__file__) cluster.add_instance( "node", + main_configs=["configs/named_collections.xml"], with_azurite=True, ) cluster.start() @@ -80,6 +81,19 @@ def test_simple_write_connection_string(cluster): print(get_azure_file_content('test_simple_write_c.csv')) assert get_azure_file_content('test_simple_write_c.csv') == '1,"a"\n' +def test_simple_write_named_collection_1(cluster): + node = cluster.instances["node"] + azure_query(node, "CREATE TABLE test_simple_write_named_collection_1 (key UInt64, data String) Engine = Azure(azure_conf1)") + azure_query(node, "INSERT INTO test_simple_write_named_collection_1 VALUES (1, 'a')") + print(get_azure_file_content('test_simple_write_named.csv')) + assert get_azure_file_content('test_simple_write_named.csv') == '1,"a"\n' + +def test_simple_write_named_collection_2(cluster): + node = cluster.instances["node"] + azure_query(node, "CREATE TABLE test_simple_write_named_collection_2 (key UInt64, data String) Engine = Azure(azure_conf2, container='cont', blob_path='test_simple_write_named_2.csv', format='CSV')") + azure_query(node, "INSERT INTO test_simple_write_named_collection_2 VALUES (1, 'a')") + print(get_azure_file_content('test_simple_write_named_2.csv')) + assert get_azure_file_content('test_simple_write_named_2.csv') == '1,"a"\n' def test_partition_by(cluster): node = cluster.instances["node"] From 1713dbe6318b310ba4414e8f8804fe5e0c8a155f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A1nos=20Benjamin=20Antal?= Date: Mon, 5 Jun 2023 12:07:05 +0000 Subject: [PATCH 0467/1072] Grant select on system tables to test user --- tests/queries/0_stateless/02771_system_user_processes.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/queries/0_stateless/02771_system_user_processes.sh b/tests/queries/0_stateless/02771_system_user_processes.sh index f0e5b2a6987..8e2fbfb5287 100755 --- a/tests/queries/0_stateless/02771_system_user_processes.sh +++ b/tests/queries/0_stateless/02771_system_user_processes.sh @@ -16,6 +16,7 @@ fi $CLICKHOUSE_CLIENT -q "DROP USER IF EXISTS $USER" $CLICKHOUSE_CLIENT -q "CREATE USER $USER" +$CLICKHOUSE_CLIENT -q "GRANT SELECT ON system.* TO $USER" $CLICKHOUSE_CLIENT -u "$USER" -q "SELECT * FROM system.numbers LIMIT 1" $CLICKHOUSE_CLIENT -u "$USER" -q "SELECT * FROM system.numbers LIMIT 1" $CLICKHOUSE_CLIENT -q "SELECT user, toBool(ProfileEvents['SelectQuery'] > 0), toBool(ProfileEvents['Query'] > 0) FROM system.user_processes WHERE user='default'" From c8d85a43c70f96de5c5ce922a43897425566aa00 Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Mon, 5 Jun 2023 12:22:32 +0000 Subject: [PATCH 0468/1072] Fix unit tests --- src/Analyzer/ColumnNode.cpp | 4 ++-- src/Analyzer/IQueryTreeNode.h | 5 ++++- src/Planner/Utils.cpp | 2 +- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/Analyzer/ColumnNode.cpp b/src/Analyzer/ColumnNode.cpp index dd41522ac7d..3d9f5d1640e 100644 --- a/src/Analyzer/ColumnNode.cpp +++ b/src/Analyzer/ColumnNode.cpp @@ -96,7 +96,7 @@ ASTPtr ColumnNode::toASTImpl(const ConvertToASTOptions & options) const std::vector column_identifier_parts; auto column_source = getColumnSourceOrNull(); - if (column_source) + if (column_source && options.fully_qualified_identifiers) { auto node_type = column_source->getNodeType(); if (node_type == QueryTreeNodeType::TABLE || @@ -117,7 +117,7 @@ ASTPtr ColumnNode::toASTImpl(const ConvertToASTOptions & options) const else { const auto & table_storage_id = table_node->getStorageID(); - if (table_storage_id.hasDatabase() && options.fully_qualified_identifiers) + if (table_storage_id.hasDatabase() && options.qualify_indentifiers_with_database) column_identifier_parts = { table_storage_id.getDatabaseName(), table_storage_id.getTableName() }; else column_identifier_parts = { table_storage_id.getTableName() }; diff --git a/src/Analyzer/IQueryTreeNode.h b/src/Analyzer/IQueryTreeNode.h index 351d03bc8cb..763963b734a 100644 --- a/src/Analyzer/IQueryTreeNode.h +++ b/src/Analyzer/IQueryTreeNode.h @@ -187,10 +187,13 @@ public: /// Identifiers are fully qualified (`database.table.column`), otherwise names are just column names (`column`) bool fully_qualified_identifiers = true; + + /// Identifiers are qualified but database name is not added (`table.column`) if set to false. + bool qualify_indentifiers_with_database = true; }; /// Convert query tree to AST - ASTPtr toAST(const ConvertToASTOptions & options = { .add_cast_for_constants = true, .fully_qualified_identifiers = true }) const; + ASTPtr toAST(const ConvertToASTOptions & options = { .add_cast_for_constants = true, .fully_qualified_identifiers = true, .qualify_indentifiers_with_database = true }) const; /// Convert query tree to AST and then format it for error message. String formatConvertedASTForErrorMessage() const; diff --git a/src/Planner/Utils.cpp b/src/Planner/Utils.cpp index 94f3bbf6440..733db0f00bc 100644 --- a/src/Planner/Utils.cpp +++ b/src/Planner/Utils.cpp @@ -110,7 +110,7 @@ ASTPtr queryNodeToSelectQuery(const QueryTreeNodePtr & query_node) // In case of cross-replication we don't know what database is used for the table. // Each shard will use the default database (in the case of cross-replication shards may have different defaults). - auto result_ast = query_node_typed.toAST({ .fully_qualified_identifiers = false }); + auto result_ast = query_node_typed.toAST({ .qualify_indentifiers_with_database = false }); while (true) { From 90e9df9109433971d70d544e848eeead361b96f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A1nos=20Benjamin=20Antal?= Date: Mon, 5 Jun 2023 12:27:46 +0000 Subject: [PATCH 0469/1072] Revert "Add `SHOW USER PROCESSES` query" This reverts commit d28b4181e94c5602b5512af8ed541dcc2a1a55f2. --- src/Interpreters/InterpreterFactory.cpp | 6 ---- .../InterpreterShowUserProcessesQuery.cpp | 18 ----------- .../InterpreterShowUserProcessesQuery.h | 30 ----------------- src/Parsers/ASTShowUserProcessesQuery.h | 17 ---------- src/Parsers/ParserQueryWithOutput.cpp | 5 +-- src/Parsers/ParserShowUserProcessesQuery.h | 32 ------------------- 6 files changed, 1 insertion(+), 107 deletions(-) delete mode 100644 src/Interpreters/InterpreterShowUserProcessesQuery.cpp delete mode 100644 src/Interpreters/InterpreterShowUserProcessesQuery.h delete mode 100644 src/Parsers/ASTShowUserProcessesQuery.h delete mode 100644 src/Parsers/ParserShowUserProcessesQuery.h diff --git a/src/Interpreters/InterpreterFactory.cpp b/src/Interpreters/InterpreterFactory.cpp index c31e3801478..9cd1f2a251c 100644 --- a/src/Interpreters/InterpreterFactory.cpp +++ b/src/Interpreters/InterpreterFactory.cpp @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include @@ -82,7 +81,6 @@ #include #include #include -#include #include #include #include @@ -268,10 +266,6 @@ std::unique_ptr InterpreterFactory::get(ASTPtr & query, ContextMut { return std::make_unique(query, context); } - else if (query->as()) - { - return std::make_unique(query, context); - } else if (query->as()) { return std::make_unique(query, context); diff --git a/src/Interpreters/InterpreterShowUserProcessesQuery.cpp b/src/Interpreters/InterpreterShowUserProcessesQuery.cpp deleted file mode 100644 index 51287a7ad5b..00000000000 --- a/src/Interpreters/InterpreterShowUserProcessesQuery.cpp +++ /dev/null @@ -1,18 +0,0 @@ -#include - -#include -#include -#include - -#include - - -namespace DB -{ - -BlockIO InterpreterShowUserProcessesQuery::execute() -{ - return executeQuery("SELECT * FROM system.user_processes ORDER BY user DESC", getContext(), true); -} - -} diff --git a/src/Interpreters/InterpreterShowUserProcessesQuery.h b/src/Interpreters/InterpreterShowUserProcessesQuery.h deleted file mode 100644 index a1c385dc82f..00000000000 --- a/src/Interpreters/InterpreterShowUserProcessesQuery.h +++ /dev/null @@ -1,30 +0,0 @@ -#pragma once - -#include -#include - - -namespace DB -{ - -/** Return list of currently executing queries. -TODO(antaljanosbenjamin) - */ -class InterpreterShowUserProcessesQuery : public IInterpreter, WithMutableContext -{ -public: - InterpreterShowUserProcessesQuery(const ASTPtr & query_ptr_, ContextMutablePtr context_) - : WithMutableContext(context_), query_ptr(query_ptr_) {} - - BlockIO execute() override; - - /// We ignore the quota and limits here because execute() will rewrite a show query as a SELECT query and then - /// the SELECT query will checks the quota and limits. - bool ignoreQuota() const override { return true; } - bool ignoreLimits() const override { return true; } - -private: - ASTPtr query_ptr; -}; - -} diff --git a/src/Parsers/ASTShowUserProcessesQuery.h b/src/Parsers/ASTShowUserProcessesQuery.h deleted file mode 100644 index cd522c152b6..00000000000 --- a/src/Parsers/ASTShowUserProcessesQuery.h +++ /dev/null @@ -1,17 +0,0 @@ -#pragma once - -#include - - -namespace DB -{ - -struct ASTShowUserProcessesIDAndQueryNames -{ - static constexpr auto ID = "ShowUserProcesses"; - static constexpr auto Query = "SHOW USER PROCESSES"; -}; - -using ASTShowUserProcessesQuery = ASTQueryWithOutputImpl; - -} diff --git a/src/Parsers/ParserQueryWithOutput.cpp b/src/Parsers/ParserQueryWithOutput.cpp index d5293e5f709..6796f4528c4 100644 --- a/src/Parsers/ParserQueryWithOutput.cpp +++ b/src/Parsers/ParserQueryWithOutput.cpp @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include @@ -62,7 +61,6 @@ bool ParserQueryWithOutput::parseImpl(Pos & pos, ASTPtr & node, Expected & expec ParserShowGrantsQuery show_grants_p; ParserShowPrivilegesQuery show_privileges_p; ParserExplainQuery explain_p(end, allow_settings_after_format_in_insert); - ParserShowUserProcessesQuery show_user_processes_p; ASTPtr query; @@ -90,8 +88,7 @@ bool ParserQueryWithOutput::parseImpl(Pos & pos, ASTPtr & node, Expected & expec || show_access_p.parse(pos, query, expected) || show_access_entities_p.parse(pos, query, expected) || show_grants_p.parse(pos, query, expected) - || show_privileges_p.parse(pos, query, expected) - || show_user_processes_p.parse(pos, query, expected); + || show_privileges_p.parse(pos, query, expected); if (!parsed) return false; diff --git a/src/Parsers/ParserShowUserProcessesQuery.h b/src/Parsers/ParserShowUserProcessesQuery.h deleted file mode 100644 index be484e74d5d..00000000000 --- a/src/Parsers/ParserShowUserProcessesQuery.h +++ /dev/null @@ -1,32 +0,0 @@ -#pragma once - -#include -#include -#include -#include - - -namespace DB -{ - -/** Query SHOW USER PROCESSES - */ -class ParserShowUserProcessesQuery : public IParserBase -{ -protected: - const char * getName() const override { return "SHOW USER PROCESSES query"; } - - bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override - { - auto query = std::make_shared(); - - if (!ParserKeyword("SHOW USER PROCESSES").ignore(pos, expected)) - return false; - - node = query; - - return true; - } -}; - -} From 28eb9562b8a8904941a4b59e8ecf1c5b97aa70cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A1nos=20Benjamin=20Antal?= Date: Mon, 5 Jun 2023 12:29:11 +0000 Subject: [PATCH 0470/1072] Remove the usage of `SHOW USER PROCESSES` from tests --- .../0_stateless/02771_system_user_processes.reference | 1 - tests/queries/0_stateless/02771_system_user_processes.sh | 7 ------- 2 files changed, 8 deletions(-) diff --git a/tests/queries/0_stateless/02771_system_user_processes.reference b/tests/queries/0_stateless/02771_system_user_processes.reference index 8c8ca8abb52..a55207ff3f4 100644 --- a/tests/queries/0_stateless/02771_system_user_processes.reference +++ b/tests/queries/0_stateless/02771_system_user_processes.reference @@ -1,4 +1,3 @@ -SHOW USER PROCESSES query succeeded! 0 0 default true true diff --git a/tests/queries/0_stateless/02771_system_user_processes.sh b/tests/queries/0_stateless/02771_system_user_processes.sh index 8e2fbfb5287..c680283d36e 100755 --- a/tests/queries/0_stateless/02771_system_user_processes.sh +++ b/tests/queries/0_stateless/02771_system_user_processes.sh @@ -7,13 +7,6 @@ CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) USER_POSTFIX=`random_str 10` USER="test_user_02771_$USER_POSTFIX" -if $CLICKHOUSE_CLIENT -q "SHOW USER PROCESSES" &>"${CLICKHOUSE_TMP}/test_output" -then - echo "SHOW USER PROCESSES query succeeded!" -else - cat "${CLICKHOUSE_TMP}/test_output" -fi - $CLICKHOUSE_CLIENT -q "DROP USER IF EXISTS $USER" $CLICKHOUSE_CLIENT -q "CREATE USER $USER" $CLICKHOUSE_CLIENT -q "GRANT SELECT ON system.* TO $USER" From fe1354f22184181c3ed996e0928509917bdc5f7d Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Mon, 5 Jun 2023 12:32:08 +0000 Subject: [PATCH 0471/1072] Analyzer: Do not apply Query Tree optimizations on shards --- src/Interpreters/InterpreterSelectQueryAnalyzer.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Interpreters/InterpreterSelectQueryAnalyzer.cpp b/src/Interpreters/InterpreterSelectQueryAnalyzer.cpp index 98f70c25dcd..4f2f05dc7eb 100644 --- a/src/Interpreters/InterpreterSelectQueryAnalyzer.cpp +++ b/src/Interpreters/InterpreterSelectQueryAnalyzer.cpp @@ -135,7 +135,8 @@ QueryTreeNodePtr buildQueryTreeAndRunPasses(const ASTPtr & query, QueryTreePassManager query_tree_pass_manager(context); addQueryTreePasses(query_tree_pass_manager); - if (select_query_options.ignore_ast_optimizations) + if (select_query_options.ignore_ast_optimizations + || context->getClientInfo().query_kind == ClientInfo::QueryKind::SECONDARY_QUERY) query_tree_pass_manager.run(query_tree, 1 /*up_to_pass_index*/); else query_tree_pass_manager.run(query_tree); From 5cb2d8b4e2849b015682ea05f2105258162f3335 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A1nos=20Benjamin=20Antal?= Date: Mon, 5 Jun 2023 12:32:25 +0000 Subject: [PATCH 0472/1072] Update failing tests --- tests/queries/0_stateless/01399_http_request_headers.reference | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/queries/0_stateless/01399_http_request_headers.reference b/tests/queries/0_stateless/01399_http_request_headers.reference index 90a10a9818d..92ea6606a12 100644 --- a/tests/queries/0_stateless/01399_http_request_headers.reference +++ b/tests/queries/0_stateless/01399_http_request_headers.reference @@ -6,6 +6,7 @@ Code: 516 1 Code: 516 processes +processes Code: 81 [1] Code: 73 From c3d6e4c9155b22fe24018ad0099eef2ccd787f5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A1nos=20Benjamin=20Antal?= Date: Mon, 5 Jun 2023 12:36:19 +0000 Subject: [PATCH 0473/1072] Fix docs --- docs/en/operations/system-tables/user_processes.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/operations/system-tables/user_processes.md b/docs/en/operations/system-tables/user_processes.md index a9b97390ed6..94c153fb683 100644 --- a/docs/en/operations/system-tables/user_processes.md +++ b/docs/en/operations/system-tables/user_processes.md @@ -3,7 +3,7 @@ slug: /en/operations/system-tables/user_processes --- # user_processes -This system table is used for implementing the `SHOW USER PROCESSES` query. +This system table can be used to get overview of memory usage and ProfileEvents of users. Columns: From 4f0adf5f61fca0162184e5d4858c75fdb0a10e2e Mon Sep 17 00:00:00 2001 From: avogar Date: Mon, 5 Jun 2023 12:40:54 +0000 Subject: [PATCH 0474/1072] Better support for avro decimals --- .../Formats/Impl/AvroRowInputFormat.cpp | 5 +++-- .../0_stateless/02782_avro_decimals.reference | 13 +++++++++++++ tests/queries/0_stateless/02782_avro_decimals.sh | 10 ++++++++++ tests/queries/0_stateless/data_avro/decimals.avro | Bin 0 -> 295 bytes 4 files changed, 26 insertions(+), 2 deletions(-) create mode 100644 tests/queries/0_stateless/02782_avro_decimals.reference create mode 100755 tests/queries/0_stateless/02782_avro_decimals.sh create mode 100644 tests/queries/0_stateless/data_avro/decimals.avro diff --git a/src/Processors/Formats/Impl/AvroRowInputFormat.cpp b/src/Processors/Formats/Impl/AvroRowInputFormat.cpp index c2602a4d1d5..267f9e522e2 100644 --- a/src/Processors/Formats/Impl/AvroRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/AvroRowInputFormat.cpp @@ -176,14 +176,15 @@ static AvroDeserializer::DeserializeFn createDecimalDeserializeFn(const avro::No { static constexpr size_t field_type_size = sizeof(typename DecimalType::FieldType); decoder.decodeString(tmp); - if (tmp.size() != field_type_size) + if (tmp.size() > field_type_size) throw ParsingException( ErrorCodes::CANNOT_PARSE_UUID, - "Cannot parse type {}, expected binary data with size {}, got {}", + "Cannot parse type {}, expected binary data with size less then {}, got {}", target_type->getName(), field_type_size, tmp.size()); + tmp = std::string(field_type_size - tmp.size(), '\0') + tmp; typename DecimalType::FieldType field; ReadBufferFromString buf(tmp); readBinaryBigEndian(field.value, buf); diff --git a/tests/queries/0_stateless/02782_avro_decimals.reference b/tests/queries/0_stateless/02782_avro_decimals.reference new file mode 100644 index 00000000000..ed46f1c3758 --- /dev/null +++ b/tests/queries/0_stateless/02782_avro_decimals.reference @@ -0,0 +1,13 @@ +d Decimal(14, 4) +0 +1 +1.1 +12.12 +123.123 +1234.1234 +12345.1234 +123456.1234 +1234567.1234 +12345678.1234 +123456789.1234 +1234567890.1234 diff --git a/tests/queries/0_stateless/02782_avro_decimals.sh b/tests/queries/0_stateless/02782_avro_decimals.sh new file mode 100755 index 00000000000..5b754965806 --- /dev/null +++ b/tests/queries/0_stateless/02782_avro_decimals.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash +# Tags: no-fasttest + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +$CLICKHOUSE_LOCAL -q "desc file('$CUR_DIR/data_avro/decimals.avro')" +$CLICKHOUSE_LOCAL -q "select * from file('$CUR_DIR/data_avro/decimals.avro')" + diff --git a/tests/queries/0_stateless/data_avro/decimals.avro b/tests/queries/0_stateless/data_avro/decimals.avro new file mode 100644 index 0000000000000000000000000000000000000000..5c29ac235d59003696735c8c8092eed5bcce41b2 GIT binary patch literal 295 zcmeZI%3@>@Nh~YM*GtY%NloU+E6vFf1M`cMGg5OCFECXrB@y Date: Mon, 5 Jun 2023 12:43:38 +0000 Subject: [PATCH 0475/1072] Fix exception message --- src/Processors/Formats/Impl/AvroRowInputFormat.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Processors/Formats/Impl/AvroRowInputFormat.cpp b/src/Processors/Formats/Impl/AvroRowInputFormat.cpp index 267f9e522e2..ae65960a372 100644 --- a/src/Processors/Formats/Impl/AvroRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/AvroRowInputFormat.cpp @@ -179,7 +179,7 @@ static AvroDeserializer::DeserializeFn createDecimalDeserializeFn(const avro::No if (tmp.size() > field_type_size) throw ParsingException( ErrorCodes::CANNOT_PARSE_UUID, - "Cannot parse type {}, expected binary data with size less then {}, got {}", + "Cannot parse type {}, expected binary data with size equal to or less than {}, got {}", target_type->getName(), field_type_size, tmp.size()); From aa20935cb9913d628df08e80b793ad853363203a Mon Sep 17 00:00:00 2001 From: avogar Date: Mon, 5 Jun 2023 12:45:14 +0000 Subject: [PATCH 0476/1072] Better --- src/Processors/Formats/Impl/AvroRowInputFormat.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/Processors/Formats/Impl/AvroRowInputFormat.cpp b/src/Processors/Formats/Impl/AvroRowInputFormat.cpp index ae65960a372..201845177a5 100644 --- a/src/Processors/Formats/Impl/AvroRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/AvroRowInputFormat.cpp @@ -183,8 +183,10 @@ static AvroDeserializer::DeserializeFn createDecimalDeserializeFn(const avro::No target_type->getName(), field_type_size, tmp.size()); + else if (tmp.size() != field_type_size) + /// Add padding with 0-bytes. + tmp = std::string(field_type_size - tmp.size(), '\0') + tmp; - tmp = std::string(field_type_size - tmp.size(), '\0') + tmp; typename DecimalType::FieldType field; ReadBufferFromString buf(tmp); readBinaryBigEndian(field.value, buf); From bc8ee56a19489bd2d42fd06fdca82ff1948236ca Mon Sep 17 00:00:00 2001 From: Alexander Sapin Date: Mon, 5 Jun 2023 14:46:52 +0200 Subject: [PATCH 0477/1072] Support settings, test truncate --- src/Core/Settings.h | 5 +++ src/Storages/StorageAzure.cpp | 37 ++++++++++++++++--- src/Storages/StorageAzure.h | 8 ++++ .../test_storage_azure_blob_storage/test.py | 9 +++++ 4 files changed, 53 insertions(+), 6 deletions(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 6a0833aef60..a484e8e816d 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -81,7 +81,9 @@ class IColumn; M(UInt64, s3_upload_part_size_multiply_parts_count_threshold, 500, "Each time this number of parts was uploaded to S3 s3_min_upload_part_size multiplied by s3_upload_part_size_multiply_factor.", 0) \ M(UInt64, s3_max_inflight_parts_for_one_file, 20, "The maximum number of a concurrent loaded parts in multipart upload request. 0 means unlimited. You ", 0) \ M(UInt64, s3_max_single_part_upload_size, 32*1024*1024, "The maximum size of object to upload using singlepart upload to S3.", 0) \ + M(UInt64, azure_max_single_part_upload_size, 100*1024*1024, "The maximum size of object to upload using singlepart upload to Azure blob storage.", 0) \ M(UInt64, s3_max_single_read_retries, 4, "The maximum number of retries during single S3 read.", 0) \ + M(UInt64, azure_max_single_read_retries, 4, "The maximum number of retries during single S3 read.", 0) \ M(UInt64, s3_max_unexpected_write_error_retries, 4, "The maximum number of retries in case of unexpected errors during S3 write.", 0) \ M(UInt64, s3_max_redirects, 10, "Max number of S3 redirects hops allowed.", 0) \ M(UInt64, s3_max_connections, 1024, "The maximum number of connections per server.", 0) \ @@ -90,8 +92,11 @@ class IColumn; M(UInt64, s3_max_put_rps, 0, "Limit on S3 PUT request per second rate before throttling. Zero means unlimited.", 0) \ M(UInt64, s3_max_put_burst, 0, "Max number of requests that can be issued simultaneously before hitting request per second limit. By default (0) equals to `s3_max_put_rps`", 0) \ M(UInt64, s3_list_object_keys_size, 1000, "Maximum number of files that could be returned in batch by ListObject request", 0) \ + M(UInt64, azure_list_object_keys_size, 1000, "Maximum number of files that could be returned in batch by ListObject request", 0) \ M(Bool, s3_truncate_on_insert, false, "Enables or disables truncate before insert in s3 engine tables.", 0) \ + M(Bool, azure_truncate_on_insert, false, "Enables or disables truncate before insert in azure engine tables.", 0) \ M(Bool, s3_create_new_file_on_insert, false, "Enables or disables creating a new file on each insert in s3 engine tables", 0) \ + M(Bool, azure_create_new_file_on_insert, false, "Enables or disables creating a new file on each insert in azure engine tables", 0) \ M(Bool, s3_check_objects_after_upload, false, "Check each uploaded object to s3 with head request to be sure that upload was successful", 0) \ M(Bool, s3_allow_parallel_part_upload, true, "Use multiple threads for s3 multipart upload. It may lead to slightly higher memory usage", 0) \ M(Bool, s3_throw_on_zero_files_match, false, "Throw an error, when ListObjects request cannot match any files", 0) \ diff --git a/src/Storages/StorageAzure.cpp b/src/Storages/StorageAzure.cpp index 8ce8a923c33..3d519ade3dc 100644 --- a/src/Storages/StorageAzure.cpp +++ b/src/Storages/StorageAzure.cpp @@ -255,9 +255,15 @@ void registerStorageAzure(StorageFactory & factory) if (args.storage_def->partition_by) partition_by = args.storage_def->partition_by->clone(); + const auto & context_settings = args.getContext()->getSettingsRef(); + auto settings = std::make_unique(); + settings->max_single_part_upload_size = context_settings.azure_max_single_part_upload_size; + settings->max_single_read_retries = context_settings.azure_max_single_read_retries; + settings->list_object_keys_size = static_cast(context_settings.azure_list_object_keys_size); + return std::make_shared( std::move(configuration), - std::make_unique("AzureStorage", std::move(client), std::make_unique()), + std::make_unique("AzureStorage", std::move(client), std::move(settings)), args.getContext(), args.table_id, args.columns, @@ -395,7 +401,6 @@ StorageAzure::StorageAzure( void StorageAzure::truncate(const ASTPtr &, const StorageMetadataPtr &, ContextPtr, TableExclusiveLockHolder &) { - if (configuration.withGlobs()) { throw Exception( @@ -577,12 +582,12 @@ SinkToStoragePtr StorageAzure::write(const ASTPtr & query, const StorageMetadata throw Exception(ErrorCodes::DATABASE_ACCESS_DENIED, "Azure key '{}' contains globs, so the table is in readonly mode", configuration.blob_path); - bool truncate_in_insert = local_context->getSettingsRef().s3_truncate_on_insert; + bool truncate_in_insert = local_context->getSettingsRef().azure_truncate_on_insert; if (!truncate_in_insert && object_storage->exists(StoredObject(configuration.blob_path))) { - if (local_context->getSettingsRef().s3_create_new_file_on_insert) + if (local_context->getSettingsRef().azure_create_new_file_on_insert) { size_t index = configuration.blobs_paths.size(); const auto & first_key = configuration.blobs_paths[0]; @@ -603,8 +608,8 @@ SinkToStoragePtr StorageAzure::write(const ASTPtr & query, const StorageMetadata throw Exception( ErrorCodes::BAD_ARGUMENTS, "Object in bucket {} with key {} already exists. " - "If you want to overwrite it, enable setting s3_truncate_on_insert, if you " - "want to create a new file on each insert, enable setting s3_create_new_file_on_insert", + "If you want to overwrite it, enable setting azure_truncate_on_insert, if you " + "want to create a new file on each insert, enable setting azure_create_new_file_on_insert", configuration.container, configuration.blobs_paths.back()); } } @@ -630,6 +635,26 @@ bool StorageAzure::supportsPartitionBy() const return true; } +bool StorageAzure::supportsSubcolumns() const +{ + return FormatFactory::instance().checkIfFormatSupportsSubcolumns(configuration.format); +} + +bool StorageAzure::supportsSubsetOfColumns() const +{ + return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(configuration.format); +} + +bool StorageAzure::prefersLargeBlocks() const +{ + return FormatFactory::instance().checkIfOutputFormatPrefersLargeBlocks(configuration.format); +} + +bool StorageAzure::parallelizeOutputAfterReading(ContextPtr context) const +{ + return FormatFactory::instance().checkParallelizeOutputAfterReading(configuration.format, context); +} + } #endif diff --git a/src/Storages/StorageAzure.h b/src/Storages/StorageAzure.h index 03a9abf2ce6..255d7e713e7 100644 --- a/src/Storages/StorageAzure.h +++ b/src/Storages/StorageAzure.h @@ -103,6 +103,14 @@ public: bool supportsPartitionBy() const override; + bool supportsSubcolumns() const override; + + bool supportsSubsetOfColumns() const override; + + bool prefersLargeBlocks() const override; + + bool parallelizeOutputAfterReading(ContextPtr context) const override; + static SchemaCache & getSchemaCache(const ContextPtr & ctx); private: diff --git a/tests/integration/test_storage_azure_blob_storage/test.py b/tests/integration/test_storage_azure_blob_storage/test.py index 11404602f58..e78fa185b17 100644 --- a/tests/integration/test_storage_azure_blob_storage/test.py +++ b/tests/integration/test_storage_azure_blob_storage/test.py @@ -134,3 +134,12 @@ def test_partition_by_const_column(cluster): azure_query(node, f"CREATE TABLE test_partitioned_const_write ({table_format}) Engine = Azure('http://azurite1:10000/devstoreaccount1', 'cont', '{filename}', 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'CSV') PARTITION BY {partition_by}") azure_query(node, f"INSERT INTO test_partitioned_const_write VALUES {values}") assert values_csv == get_azure_file_content("test_88.csv") + +def test_truncate(cluster): + node = cluster.instances["node"] + azure_query(node, "CREATE TABLE test_truncate (key UInt64, data String) Engine = Azure(azure_conf2, container='cont', blob_path='test_truncate.csv', format='CSV')") + azure_query(node, "INSERT INTO test_truncate VALUES (1, 'a')") + assert get_azure_file_content('test_truncate.csv') == '1,"a"\n' + azure_query(node, "TRUNCATE TABLE test_truncate") + with pytest.raises(Exception): + print(get_azure_file_content('test_truncate.csv')) From 638775e5802df2393c020d9409855ab58d7ff54b Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Mon, 5 Jun 2023 12:48:54 +0000 Subject: [PATCH 0478/1072] Analyzer: fix 01487_distributed_in_not_default_db --- tests/broken_tests.txt | 1 - .../0_stateless/01487_distributed_in_not_default_db.sql | 4 ++++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/broken_tests.txt b/tests/broken_tests.txt index 02935712325..da2493faa1e 100644 --- a/tests/broken_tests.txt +++ b/tests/broken_tests.txt @@ -37,7 +37,6 @@ 01319_optimize_skip_unused_shards_nesting 01353_low_cardinality_join_types 01455_shard_leaf_max_rows_bytes_to_read -01487_distributed_in_not_default_db 01495_subqueries_in_with_statement 01504_rocksdb 01527_dist_sharding_key_dictGet_reload diff --git a/tests/queries/0_stateless/01487_distributed_in_not_default_db.sql b/tests/queries/0_stateless/01487_distributed_in_not_default_db.sql index ccd2c571290..cd027530ac8 100644 --- a/tests/queries/0_stateless/01487_distributed_in_not_default_db.sql +++ b/tests/queries/0_stateless/01487_distributed_in_not_default_db.sql @@ -25,6 +25,10 @@ CREATE TABLE d AS t ENGINE = Distributed(test_cluster_two_shards_different_datab USE test_01487; DROP DATABASE test_01487; +-- After the default database is dropped QueryAnalysisPass cannot process the following SELECT query. +-- That query is invalid on the initiator node. +set allow_experimental_analyzer = 0; + SELECT * FROM main_01487.d WHERE value IN (SELECT l.value FROM l) ORDER BY value; USE main_01487; From c4e1dc55458cfc53ad3eee1406e6bac61bfe32d7 Mon Sep 17 00:00:00 2001 From: Daniel Kutenin Date: Mon, 5 Jun 2023 14:10:53 +0100 Subject: [PATCH 0479/1072] Enable FAST_DEC_LOOP for Arm LZ4 to get 5% of decomp speed It's disabled for clang because of mobile https://github.com/lz4/lz4/blob/e82198428c8061372d5adef1f9bfff4203f6081e/lib/lz4.c#L471 --- contrib/lz4-cmake/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/contrib/lz4-cmake/CMakeLists.txt b/contrib/lz4-cmake/CMakeLists.txt index 94def029410..c0fd574134f 100644 --- a/contrib/lz4-cmake/CMakeLists.txt +++ b/contrib/lz4-cmake/CMakeLists.txt @@ -12,6 +12,7 @@ add_library (_lz4 ${SRCS}) add_library (ch_contrib::lz4 ALIAS _lz4) target_compile_definitions (_lz4 PUBLIC LZ4_DISABLE_DEPRECATE_WARNINGS=1) +target_compile_definitions (_lz4 PUBLIC LZ4_FAST_DEC_LOOP=1) if (SANITIZE STREQUAL "undefined") target_compile_options (_lz4 PRIVATE -fno-sanitize=undefined) endif () From 4c88b7bbb725245060067b101593688f470ee399 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 5 Jun 2023 13:13:49 +0000 Subject: [PATCH 0480/1072] Further improve ANN index docs --- .../mergetree-family/annindexes.md | 152 +++++++++--------- 1 file changed, 72 insertions(+), 80 deletions(-) diff --git a/docs/en/engines/table-engines/mergetree-family/annindexes.md b/docs/en/engines/table-engines/mergetree-family/annindexes.md index 0cc1cff2dad..58655c11321 100644 --- a/docs/en/engines/table-engines/mergetree-family/annindexes.md +++ b/docs/en/engines/table-engines/mergetree-family/annindexes.md @@ -22,15 +22,53 @@ ORDER BY L2Distance(column, Point) LIMIT N ``` -The queries are expensive because the L2 distance (Euclidean distance) between all points in `column` and `Point` must be computed. To speed this process up, ANN indexes store a compact representation of the search space (using clustering, search trees, etc.) which allows to compute an approximate answer quickly. +The queries are expensive because the L2 (Euclidean) distance between `Point` and all points in `column` and must be computed. To speed this process up, Approximate Nearest Neighbor Search Indexes (ANN indexes) store a compact representation of the search space (using clustering, search trees, etc.) which allows to compute an approximate answer quickly. -## Indexes Structure +# Creating ANN Indexes -Approximate Nearest Neighbor Search Indexes (or `ANNIndexes`) are similar to skip indexes. They are constructed over granules and determine which granules can be skipped. Compared to skip indices, ANN indices are not only able to skip granules, they can also to select particular granules from a set of granules. +As long as ANN indexes are experimental, you first need to `SET allow_experimental_annoy_index = 1`. -`ANNIndexes` support two types of queries: +Syntax to create an ANN index over an `Array` column: + +```sql +CREATE TABLE table +( + `id` Int64, + `embedding` Array(Float32), + INDEX embedding TYPE () GRANULARITY +) +ENGINE = MergeTree +ORDER BY id; +``` + +Syntax to create an ANN index over a `Tuple` column: + +```sql +CREATE TABLE table +( + `id` Int64, + `embedding` Tuple(Float32[, Float32[, ...]]), + INDEX embedding TYPE () GRANULARITY +) +ENGINE = MergeTree +ORDER BY id; +``` + +ANN indexes are built during column insertion and merge and `INSERT` and `OPTIMIZE` statements will slower than for ordinary tables. ANNIndexes are ideally used only with immutable or rarely changed data, respectively comparatively many more read requests than write requests. + +Similar to regular skip indexes, ANN indexes are constructed over granules and each indexed block consists of `GRANULARITY = `-many +granules. For example, if the primary index granularity of the table is 8192 (setting `index_granularity = 8192`) and `GRANULARITY = 2`, +then each indexed block will consist of 16384 rows. However, unlike skip indexes, ANN indexes are not only able to skip the entire indexed +block, they are able to skip individual granules in indexed blocks. As a result, the `GRANULARITY` parameter has a different meaning in ANN +indexes than in normal skip indexes. Basically, the bigger `GRANULARITY` is chosen, the more data is provided to a single ANN index, and the +higher the chance that with the right hyper parameters, the index will remember the data structure better. + +# Using ANN Indexes + +ANN indexes support two types of queries: - WHERE queries: + ``` sql SELECT * FROM table @@ -39,86 +77,40 @@ Approximate Nearest Neighbor Search Indexes (or `ANNIndexes`) are similar to ski ``` - ORDER BY queries: + ``` sql SELECT * - FROM table [WHERE ...] + FROM table + [WHERE ...] ORDER BY DistanceFunction(column, Point) LIMIT N ``` -`DistanceFunction` is a [distance functions](/docs/en/sql-reference/functions/distance-functions.md), `Point` is a given vector (e.g. `(0.17, 0.33, ...)`) and `MaxDistance` is a float value which restricts the size of the neighbourhood. +`DistanceFunction` is a [distance function](/docs/en/sql-reference/functions/distance-functions.md), `Point` is a reference vector (e.g. `(0.17, 0.33, ...)`) and `MaxDistance` is a floating point value which restricts the size of the neighbourhood. -To avoid writing large vectors, you can also use [query parameters](/docs/en//interfaces/cli.md#queries-with-parameters-cli-queries-with-parameters), e.g. +:::tip +To avoid writing out large vectors, you can use [query parameters](/docs/en//interfaces/cli.md#queries-with-parameters-cli-queries-with-parameters), e.g. ```bash clickhouse-client --param_vec='hello' --query="SELECT * FROM table WHERE L2Distance(embedding, {vec: Array(Float32)}) < 1.0" ``` +::: -ANN index cannot speed up query that contain both `WHERE` and `ORDER BY`. Queries must have a limit, as the approximate algorithms used to determine the nearest neighbors require a specific number of them. +ANN indexes cannot speed up queries that contain both a `WHERE DistanceFunction(column, Point) < MaxDistance` and an `ORDER BY DistanceFunction(column, Point)` clause. Also, the approximate algorithms used to determine the nearest neighbors require a limit, hence queries that use an ANN index must have a `LIMIT` clause. -Indexes are only used for queries with a `LIMIT` value smaller than setting `max_limit_for_ann_queries` (default: 1 million rows). This helps to prevent memory overflows in queries with a large limit. +An ANN index is only used if the query has a `LIMIT` value smaller than setting `max_limit_for_ann_queries` (default: 1 million rows). This is a safety measure which helps to avoid large memory consumption by external libraries for approximate neighbor search. -Both types of queries are processed similarly. The indexes are passed the number of neighbors `N`. In `ORDER BY` query they remember the numbers of all parts of the granule that have at least one of neighbor. In `WHERE` query they remember only those parts that satisfy the requirements. - - -## Creating Tables with an ANN Index - -As long as ANN indexes are experimental, you first need to `SET allow_experimental_annoy_index = 1`. - -Syntax: - -```sql -CREATE TABLE table -( - `id` Int64, - `embedding` Tuple(Float32, Float32, Float32), - INDEX embedding TYPE () GRANULARITY N -) -ENGINE = MergeTree -ORDER BY id; -``` - -```sql -CREATE TABLE table -( - `id` Int64, - `embedding` Array(Float32), - INDEX embedding TYPE () GRANULARITY N -) -ENGINE = MergeTree -ORDER BY id; -``` - -With greater `GRANULARITY` indexes remember the data structure better. The `GRANULARITY` indicates how many granules will be used to construct the index. The more data is provided for the index, the more of it can be handled by one index and the more chances that with the right hyper parameters the index will remember the data structure better. But some indexes can't be built if they don't have enough data, so this granule will always participate in the query. For more information, see the description of indexes. - -Note that ANN indexes are built during column insertion and merge, i.e. `INSERT` and `OPTIMIZE` statements are slower than for ordinary tables. ANNIndexes are ideally used only with immutable or rarely changing data in conjunction with many read requests. - -# Index list +# Available ANN Indexes - [Annoy](/docs/en/engines/table-engines/mergetree-family/annindexes.md#annoy-annoy) -# Annoy {#annoy} +## Annoy {#annoy} -(currently disabled on ARM due to problems with the algorithm) +(currently disabled on ARM due to memory safety problems with the algorithm) -This ANN index type implements [Annoy indexes](https://github.com/spotify/annoy). +This type of ANN index implements [the Annoy algorithm](https://github.com/spotify/annoy) which uses a recursive division of the space in random linear surfaces (lines in 2D, planes in 3D etc.). -Short description of the algorithm: -The algorithm recursively divides in half all space by random linear surfaces (lines in 2D, planes in 3D etc.). Thus it makes tree of polyhedrons and points that they contains. Repeating the operation several times for greater accuracy it creates a forest. -To find K Nearest Neighbours it goes down through the trees and fills the buffer of closest points using the priority queue of polyhedrons. Next, it sorts buffer and return the nearest K points. - -Examples: - -```sql -CREATE TABLE table -( - id Int64, - embedding Tuple(Float32, Float32, Float32), - INDEX embedding TYPE annoy([DistanceName[, NumTrees]]) GRANULARITY N -) -ENGINE = MergeTree -ORDER BY id; -``` +Syntax to create a Annoy index over a `Array` column: ```sql CREATE TABLE table @@ -131,26 +123,26 @@ ENGINE = MergeTree ORDER BY id; ``` +Syntax to create a Annoy index over a `Tuple` column: + +```sql +CREATE TABLE table +( + id Int64, + embedding Tuple(Float32[, Float32[, ...]]), + INDEX embedding TYPE annoy([DistanceName[, NumTrees]]) GRANULARITY N +) +ENGINE = MergeTree +ORDER BY id; +``` + +Parameter `DistanceName` is name of a distance function (default `L2Distance`). Annoy currently supports `L2Distance` and `cosineDistance` as distance functions. Parameter `NumTrees` (default: 100) is the number of trees which the algorithm will create. Higher values of `NumTree` mean slower `CREATE` and `SELECT` statements (approximately linearly), but increase the accuracy of search results. + :::note Indexes over columns of type `Array` will generally work faster than indexes on `Tuple` columns. All arrays **must** have same length. Use [CONSTRAINT](/docs/en/sql-reference/statements/create/table.md#constraints) to avoid errors. For example, `CONSTRAINT constraint_name_1 CHECK length(embedding) = 256`. ::: -Parameter `DistanceName` is name of a distance function with default `L2Distance`. Parameter `NumTrees` (default: 100) is the number of trees which the algorithm will create. Higher values of `NumTree` mean slower `CREATE` and `SELECT` statements (approximately linearly), but increase the accuracy of search results. - -```sql -CREATE TABLE table -( - id Int64, - embedding Array(Float32), - INDEX ann_index_name embedding TYPE annoy('cosineDistance') GRANULARITY N -) -ENGINE = MergeTree -ORDER BY id; -``` - -Annoy currently supports `L2Distance` and `cosineDistance` as distance functions. - -Setting `annoy_index_search_k_nodes` (default: `NumTrees * LIMIT`) determines how many nodes are inspected during SELECTs. It can be used to +Setting `annoy_index_search_k_nodes` (default: `NumTrees * LIMIT`) determines how many tree nodes are inspected during SELECTs. It can be used to balance runtime and accuracy at runtime. Example: From 5fb4f1fc614a749bd0706e5dcd952224e821bf77 Mon Sep 17 00:00:00 2001 From: ltrk2 <107155950+ltrk2@users.noreply.github.com> Date: Mon, 5 Jun 2023 06:55:34 -0700 Subject: [PATCH 0481/1072] Implement review comments --- .../ReservoirSamplerDeterministic.h | 2 +- src/Common/TransformEndianness.hpp | 35 ++++++++++++------- 2 files changed, 23 insertions(+), 14 deletions(-) diff --git a/src/AggregateFunctions/ReservoirSamplerDeterministic.h b/src/AggregateFunctions/ReservoirSamplerDeterministic.h index 9dea821e839..b1a39a5dcc5 100644 --- a/src/AggregateFunctions/ReservoirSamplerDeterministic.h +++ b/src/AggregateFunctions/ReservoirSamplerDeterministic.h @@ -178,7 +178,7 @@ public: void write(DB::WriteBuffer & buf) const { - const auto size = samples.size(); + const size_t size = samples.size(); writeBinaryLittleEndian(size, buf); writeBinaryLittleEndian(total_values, buf); diff --git a/src/Common/TransformEndianness.hpp b/src/Common/TransformEndianness.hpp index 17cf441d17f..228490d24a1 100644 --- a/src/Common/TransformEndianness.hpp +++ b/src/Common/TransformEndianness.hpp @@ -8,18 +8,35 @@ namespace DB { template - requires is_big_int_v +requires std::is_integral_v +inline void transformEndianness(T & value) +{ + if constexpr (endian != std::endian::native) + value = std::byteswap(value); +} + +template +requires is_big_int_v inline void transformEndianness(T & x) { if constexpr (std::endian::native != endian) { - std::ranges::transform(x.items, std::begin(x.items), [](auto& item) { return std::byteswap(item); }); - std::ranges::reverse(x.items); + auto & items = x.items; + std::transform(std::begin(items), std::end(items), std::begin(items), [](auto & item) { return std::byteswap(item); }); + std::reverse(std::begin(items), std::end(items)); } } template - requires is_decimal || std::is_floating_point_v +requires is_decimal +inline void transformEndianness(T & x) +{ + if constexpr (std::endian::native != endian) + transformEndianness(x.value); +} + +template +requires std::is_floating_point_v inline void transformEndianness(T & value) { if constexpr (std::endian::native != endian) @@ -30,15 +47,7 @@ inline void transformEndianness(T & value) } template - requires std::is_integral_v && (sizeof(T) <= 8) -inline void transformEndianness(T & value) -{ - if constexpr (endian != std::endian::native) - value = std::byteswap(value); -} - -template - requires std::is_scoped_enum_v +requires std::is_scoped_enum_v inline void transformEndianness(T & x) { using UnderlyingType = std::underlying_type_t; From 50430ed304d8a4d4ce2bafc66bc1b7b74afec678 Mon Sep 17 00:00:00 2001 From: ltrk2 <107155950+ltrk2@users.noreply.github.com> Date: Mon, 5 Jun 2023 06:55:52 -0700 Subject: [PATCH 0482/1072] Configure rule for concepts requires clause --- .clang-format | 1 + 1 file changed, 1 insertion(+) diff --git a/.clang-format b/.clang-format index 2da3911dced..893d9c613f1 100644 --- a/.clang-format +++ b/.clang-format @@ -74,6 +74,7 @@ ConstructorInitializerIndentWidth: 4 ContinuationIndentWidth: 4 DerivePointerAlignment: false DisableFormat: false +IndentRequiresClause: false IndentWidth: 4 IndentWrappedFunctionNames: false MacroBlockBegin: '' From 2b3db1d33c8be5dcfef3ba189a292d469f0f9676 Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Mon, 5 Jun 2023 17:05:06 +0200 Subject: [PATCH 0483/1072] Update Metadata.cpp --- src/Interpreters/Cache/Metadata.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Interpreters/Cache/Metadata.cpp b/src/Interpreters/Cache/Metadata.cpp index fea552c4071..5b6561a665e 100644 --- a/src/Interpreters/Cache/Metadata.cpp +++ b/src/Interpreters/Cache/Metadata.cpp @@ -346,7 +346,7 @@ void LockedKey::removeAllReleasable() ++it; continue; } - else if (it->second.evicting()) + else if (it->second->evicting()) { /// File segment is currently a removal candidate, /// we do not know if it will be removed or not yet, From 3938309374cef05afb618c36d932bd380abb1651 Mon Sep 17 00:00:00 2001 From: ltrk2 <107155950+ltrk2@users.noreply.github.com> Date: Mon, 5 Jun 2023 08:18:03 -0700 Subject: [PATCH 0484/1072] Implement review comments --- .../Serializations/SerializationUUID.cpp | 2 +- src/IO/ReadHelpers.cpp | 27 ++++++++++--------- src/IO/ReadHelpers.h | 1 - src/IO/WriteHelpers.cpp | 25 ++++++++--------- src/IO/WriteHelpers.h | 7 +++-- .../Formats/Impl/AvroRowInputFormat.cpp | 2 +- .../Formats/Impl/AvroRowOutputFormat.cpp | 4 +-- 7 files changed, 35 insertions(+), 33 deletions(-) diff --git a/src/DataTypes/Serializations/SerializationUUID.cpp b/src/DataTypes/Serializations/SerializationUUID.cpp index 13313111b2b..76be273d7dc 100644 --- a/src/DataTypes/Serializations/SerializationUUID.cpp +++ b/src/DataTypes/Serializations/SerializationUUID.cpp @@ -51,7 +51,7 @@ void SerializationUUID::deserializeTextQuoted(IColumn & column, ReadBuffer & ist { assertChar('\'', istr); char * next_pos = find_first_symbols<'\\', '\''>(istr.position(), istr.buffer().end()); - const auto len = next_pos - istr.position(); + const size_t len = next_pos - istr.position(); if ((len == 32 || len == 36) && istr.position()[len] == '\'') { uuid = parseUUID(std::span(reinterpret_cast(istr.position()), len)); diff --git a/src/IO/ReadHelpers.cpp b/src/IO/ReadHelpers.cpp index a85a057f2b3..99b3e4b514b 100644 --- a/src/IO/ReadHelpers.cpp +++ b/src/IO/ReadHelpers.cpp @@ -31,6 +31,7 @@ namespace ErrorCodes extern const int CANNOT_PARSE_QUOTED_STRING; extern const int CANNOT_PARSE_DATETIME; extern const int CANNOT_PARSE_DATE; + extern const int CANNOT_PARSE_UUID; extern const int INCORRECT_DATA; extern const int ATTEMPT_TO_READ_AFTER_EOF; extern const int LOGICAL_ERROR; @@ -51,33 +52,35 @@ UUID parseUUID(std::span src) UUID uuid; const auto * src_ptr = src.data(); auto * dst = reinterpret_cast(&uuid); - if (const auto size = src.size(); size == 36) + const auto size = src.size(); + if (size == 36) { -#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - parseHex<4>(src_ptr, dst); - parseHex<2>(src_ptr + 9, dst + 4); - parseHex<2>(src_ptr + 14, dst + 6); - parseHex<2>(src_ptr + 19, dst + 8); - parseHex<6>(src_ptr + 24, dst + 10); -#else +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ const std::reverse_iterator dst_it(dst + sizeof(UUID)); - /// FIXME This code looks like trash. parseHex<4>(src_ptr, dst + 8); parseHex<2>(src_ptr + 9, dst + 12); parseHex<2>(src_ptr + 14, dst + 14); parseHex<2>(src_ptr + 19, dst); parseHex<6>(src_ptr + 24, dst + 2); +#else + parseHex<4>(src_ptr, dst); + parseHex<2>(src_ptr + 9, dst + 4); + parseHex<2>(src_ptr + 14, dst + 6); + parseHex<2>(src_ptr + 19, dst + 8); + parseHex<6>(src_ptr + 24, dst + 10); #endif } else if (size == 32) { -#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - parseHex<16>(src_ptr, dst); -#else +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ parseHex<8>(src_ptr, dst + 8); parseHex<8>(src_ptr + 16, dst); +#else + parseHex<16>(src_ptr, dst); #endif } + else + throw Exception(ErrorCodes::CANNOT_PARSE_UUID, "Unexpected length when trying to parse UUID ({})", size); return uuid; } diff --git a/src/IO/ReadHelpers.h b/src/IO/ReadHelpers.h index 7e293944d19..804dab16db9 100644 --- a/src/IO/ReadHelpers.h +++ b/src/IO/ReadHelpers.h @@ -765,7 +765,6 @@ inline bool tryReadDateText(ExtendedDayNum & date, ReadBuffer & buf) return readDateTextImpl(date, buf); } -/// If string is not like UUID - implementation specific behaviour. UUID parseUUID(std::span src); template diff --git a/src/IO/WriteHelpers.cpp b/src/IO/WriteHelpers.cpp index 6023d4c9d5b..4f1a95181d4 100644 --- a/src/IO/WriteHelpers.cpp +++ b/src/IO/WriteHelpers.cpp @@ -20,25 +20,12 @@ void formatHex(IteratorSrc src, IteratorDst dst, size_t num_bytes) } } -/** Function used when byte ordering is important when parsing uuid - * ex: When we create an UUID type - */ std::array formatUUID(const UUID & uuid) { std::array dst; const auto * src_ptr = reinterpret_cast(&uuid); auto * dst_ptr = dst.data(); -#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - formatHex(src_ptr, dst_ptr, 4); - dst[8] = '-'; - formatHex(src_ptr + 4, dst_ptr + 9, 2); - dst[13] = '-'; - formatHex(src_ptr + 6, dst_ptr + 14, 2); - dst[18] = '-'; - formatHex(src_ptr + 8, dst_ptr + 19, 2); - dst[23] = '-'; - formatHex(src_ptr + 10, dst_ptr + 24, 6); -#else +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ const std::reverse_iterator src_it(src_ptr + 16); formatHex(src_it + 8, dst_ptr, 4); dst[8] = '-'; @@ -49,6 +36,16 @@ std::array formatUUID(const UUID & uuid) formatHex(src_it, dst_ptr + 19, 2); dst[23] = '-'; formatHex(src_it + 2, dst_ptr + 24, 6); +#else + formatHex(src_ptr, dst_ptr, 4); + dst[8] = '-'; + formatHex(src_ptr + 4, dst_ptr + 9, 2); + dst[13] = '-'; + formatHex(src_ptr + 6, dst_ptr + 14, 2); + dst[18] = '-'; + formatHex(src_ptr + 8, dst_ptr + 19, 2); + dst[23] = '-'; + formatHex(src_ptr + 10, dst_ptr + 24, 6); #endif return dst; diff --git a/src/IO/WriteHelpers.h b/src/IO/WriteHelpers.h index 923684c4249..056c2ca1b50 100644 --- a/src/IO/WriteHelpers.h +++ b/src/IO/WriteHelpers.h @@ -625,12 +625,15 @@ inline void writeXMLStringForTextElement(std::string_view s, WriteBuffer & buf) writeXMLStringForTextElement(s.data(), s.data() + s.size(), buf); } +/// @brief Serialize `uuid` into an array of characters in big-endian byte order. +/// @param uuid UUID to serialize. +/// @return Array of characters in big-endian byte order. std::array formatUUID(const UUID & uuid); inline void writeUUIDText(const UUID & uuid, WriteBuffer & buf) { - const auto text = formatUUID(uuid); - buf.write(text.data(), text.size()); + const auto serialized_uuid = formatUUID(uuid); + buf.write(serialized_uuid.data(), serialized_uuid.size()); } void writeIPv4Text(const IPv4 & ip, WriteBuffer & buf); diff --git a/src/Processors/Formats/Impl/AvroRowInputFormat.cpp b/src/Processors/Formats/Impl/AvroRowInputFormat.cpp index 974b198a483..a4d4e374f4f 100644 --- a/src/Processors/Formats/Impl/AvroRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/AvroRowInputFormat.cpp @@ -256,7 +256,7 @@ AvroDeserializer::DeserializeFn AvroDeserializer::createDeserializeFn(const avro if (tmp.length() != 36) throw ParsingException(ErrorCodes::CANNOT_PARSE_UUID, "Cannot parse uuid {}", tmp); - const auto uuid = parseUUID({reinterpret_cast(tmp.data()), tmp.length()}); + const UUID uuid = parseUUID({reinterpret_cast(tmp.data()), tmp.length()}); assert_cast(column).insertValue(uuid); return true; }; diff --git a/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp b/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp index 2b163164d56..f0985e7cffc 100644 --- a/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp +++ b/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp @@ -329,8 +329,8 @@ AvroSerializer::SchemaWithSerializeFn AvroSerializer::createSchemaWithSerializeF return {schema, [](const IColumn & column, size_t row_num, avro::Encoder & encoder) { const auto & uuid = assert_cast(column).getElement(row_num); - const auto text = formatUUID(uuid); - encoder.encodeBytes(reinterpret_cast(text.data()), text.size()); + const auto serialized_uuid = formatUUID(uuid); + encoder.encodeBytes(reinterpret_cast(serialized_uuid.data()), serialized_uuid.size()); }}; } case TypeIndex::Array: From 33e51d4f3b25aa1af5dedf751b5fec5229dc6eac Mon Sep 17 00:00:00 2001 From: avogar Date: Mon, 5 Jun 2023 15:22:04 +0000 Subject: [PATCH 0485/1072] Add setting to limit the number of bytes to read in schema inference --- docs/en/interfaces/schema-inference.md | 16 +++++++------ .../operations/settings/settings-formats.md | 6 +++++ src/Core/Settings.h | 1 + src/Formats/EscapingRuleUtils.cpp | 3 ++- src/Formats/FormatFactory.cpp | 1 + src/Formats/FormatSettings.h | 3 ++- src/Formats/ReadSchemaUtils.cpp | 14 +++++++---- src/Processors/Formats/ISchemaReader.cpp | 24 +++++++++++-------- src/Processors/Formats/ISchemaReader.h | 9 +++++-- .../Impl/JSONColumnsBlockInputFormat.cpp | 2 +- .../Impl/JSONColumnsBlockInputFormatBase.cpp | 12 ++++++---- .../Impl/JSONColumnsBlockInputFormatBase.h | 16 +++++++++++-- .../JSONCompactColumnsBlockInputFormat.cpp | 2 +- ...ytes_to_read_in_schema_inference.reference | 1 + ..._max_bytes_to_read_in_schema_inference.sql | 4 ++++ 15 files changed, 80 insertions(+), 34 deletions(-) create mode 100644 tests/queries/0_stateless/02783_max_bytes_to_read_in_schema_inference.reference create mode 100644 tests/queries/0_stateless/02783_max_bytes_to_read_in_schema_inference.sql diff --git a/docs/en/interfaces/schema-inference.md b/docs/en/interfaces/schema-inference.md index c448d0aee47..a757a032b7d 100644 --- a/docs/en/interfaces/schema-inference.md +++ b/docs/en/interfaces/schema-inference.md @@ -329,8 +329,8 @@ SELECT count() FROM system.schema_inference_cache WHERE storage='S3' ## Text formats {#text-formats} For text formats, ClickHouse reads the data row by row, extracts column values according to the format, -and then uses some recursive parsers and heuristics to determine the type for each value. The maximum number of rows read from the data in schema inference -is controlled by the setting `input_format_max_rows_to_read_for_schema_inference` with default value 25000. +and then uses some recursive parsers and heuristics to determine the type for each value. The maximum number of rows and bytes read from the data in schema inference +is controlled by the settings `input_format_max_rows_to_read_for_schema_inference` (25000 by default) and `input_format_max_bytes_to_read_for_schema_inference` (32Mb by default). By default, all inferred types are [Nullable](../sql-reference/data-types/nullable.md), but you can change this by setting `schema_inference_make_columns_nullable` (see examples in the [settings](#settings-for-text-formats) section). ### JSON formats {#json-formats} @@ -1144,13 +1144,15 @@ Line: value_1=2, value_2="Some string 2", value_3="[4, 5, NULL]"$$) ### Settings for text formats {#settings-for-text-formats} -#### input_format_max_rows_to_read_for_schema_inference +#### input_format_max_rows_to_read_for_schema_inference/input_format_max_bytes_to_read_for_schema_inference -This setting controls the maximum number of rows to be read while schema inference. -The more rows are read, the more time is spent on schema inference, but the greater the chance to +These settings control the amount of data to be read while schema inference. +The more rows/bytes are read, the more time is spent on schema inference, but the greater the chance to correctly determine the types (especially when the data contains a lot of nulls). -Default value: `25000`. +Default values: +- `25000` for `input_format_max_rows_to_read_for_schema_inference`. +- `33554432` (32 Mb) for `input_format_max_bytes_to_read_for_schema_inference`. #### column_names_for_schema_inference @@ -1623,7 +1625,7 @@ In schema inference for CapnProto format ClickHouse uses the following type matc ## Strong-typed binary formats {#strong-typed-binary-formats} In such formats, each serialized value contains information about its type (and possibly about its name), but there is no information about the whole table. -In schema inference for such formats, ClickHouse reads data row by row (up to `input_format_max_rows_to_read_for_schema_inference` rows) and extracts +In schema inference for such formats, ClickHouse reads data row by row (up to `input_format_max_rows_to_read_for_schema_inference` rows or `input_format_max_bytes_to_read_for_schema_inference` bytes) and extracts the type (and possibly name) for each value from the data and then converts these types to ClickHouse types. ### MsgPack {#msgpack} diff --git a/docs/en/operations/settings/settings-formats.md b/docs/en/operations/settings/settings-formats.md index 65038d3a256..e4a8c916bcf 100644 --- a/docs/en/operations/settings/settings-formats.md +++ b/docs/en/operations/settings/settings-formats.md @@ -137,6 +137,12 @@ The maximum rows of data to read for automatic schema inference. Default value: `25'000`. +## input_format_max_bytes_to_read_for_schema_inference {#input_format_max_bytes_to_read_for_schema_inference} + +The maximum amount of data in bytes to read for automatic schema inference. + +Default value: `33554432` (32 Mb). + ## column_names_for_schema_inference {#column_names_for_schema_inference} The list of column names to use in schema inference for formats without column names. The format: 'column1,column2,column3,...' diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 67c92a0be8b..f1e6c518f30 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -844,6 +844,7 @@ class IColumn; M(UInt64, input_format_msgpack_number_of_columns, 0, "The number of columns in inserted MsgPack data. Used for automatic schema inference from data.", 0) \ M(MsgPackUUIDRepresentation, output_format_msgpack_uuid_representation, FormatSettings::MsgPackUUIDRepresentation::EXT, "The way how to output UUID in MsgPack format.", 0) \ M(UInt64, input_format_max_rows_to_read_for_schema_inference, 25000, "The maximum rows of data to read for automatic schema inference", 0) \ + M(UInt64, input_format_max_bytes_to_read_for_schema_inference, 32 * 1024 * 1024, "The maximum bytes of data to read for automatic schema inference", 0) \ M(Bool, input_format_csv_use_best_effort_in_schema_inference, true, "Use some tweaks and heuristics to infer schema in CSV format", 0) \ M(Bool, input_format_tsv_use_best_effort_in_schema_inference, true, "Use some tweaks and heuristics to infer schema in TSV format", 0) \ M(Bool, input_format_csv_detect_header, true, "Automatically detect header with names and types in CSV format", 0) \ diff --git a/src/Formats/EscapingRuleUtils.cpp b/src/Formats/EscapingRuleUtils.cpp index 804f32e4b46..9f744218da2 100644 --- a/src/Formats/EscapingRuleUtils.cpp +++ b/src/Formats/EscapingRuleUtils.cpp @@ -408,9 +408,10 @@ DataTypes getDefaultDataTypeForEscapingRules(const std::vectormax_rows_to_read_for_schema_inference : context->getSettingsRef().input_format_max_rows_to_read_for_schema_inference; + size_t max_bytes_to_read = format_settings ? format_settings->max_bytes_to_read_for_schema_inference + : context->getSettingsRef().input_format_max_bytes_to_read_for_schema_inference; size_t iterations = 0; ColumnsDescription cached_columns; while (true) @@ -120,7 +122,7 @@ ColumnsDescription readSchemaFromFormat( try { schema_reader = FormatFactory::instance().getSchemaReader(format_name, *buf, context, format_settings); - schema_reader->setMaxRowsToRead(max_rows_to_read); + schema_reader->setMaxRowsAndBytesToRead(max_rows_to_read, max_bytes_to_read); names_and_types = schema_reader->readSchema(); break; } @@ -132,10 +134,14 @@ ColumnsDescription readSchemaFromFormat( size_t rows_read = schema_reader->getNumRowsRead(); assert(rows_read <= max_rows_to_read); max_rows_to_read -= schema_reader->getNumRowsRead(); - if (rows_read != 0 && max_rows_to_read == 0) + size_t bytes_read = buf->count(); + /// We could exceed max_bytes_to_read a bit to complete row parsing. + max_bytes_to_read -= std::min(bytes_read, max_bytes_to_read); + if (rows_read != 0 && (max_rows_to_read == 0 || max_bytes_to_read == 0)) { - exception_message += "\nTo increase the maximum number of rows to read for structure determination, use setting " - "input_format_max_rows_to_read_for_schema_inference"; + exception_message += "\nTo increase the maximum number of rows/bytes to read for structure determination, use setting " + "input_format_max_rows_to_read_for_schema_inference/input_format_max_bytes_to_read_for_schema_inference"; + if (iterations > 1) { exception_messages += "\n" + exception_message; diff --git a/src/Processors/Formats/ISchemaReader.cpp b/src/Processors/Formats/ISchemaReader.cpp index c96cb373a2d..9f26a3543d0 100644 --- a/src/Processors/Formats/ISchemaReader.cpp +++ b/src/Processors/Formats/ISchemaReader.cpp @@ -57,11 +57,15 @@ void checkFinalInferredType( } IIRowSchemaReader::IIRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_, DataTypePtr default_type_) - : ISchemaReader(in_), default_type(default_type_), hints_str(format_settings_.schema_inference_hints), format_settings(format_settings_) + : ISchemaReader(in_) + , max_rows_to_read(format_settings_.max_rows_to_read_for_schema_inference) + , max_bytes_to_read(format_settings_.max_bytes_to_read_for_schema_inference) + , default_type(default_type_) + , hints_str(format_settings_.schema_inference_hints) + , format_settings(format_settings_) { } - void IIRowSchemaReader::setContext(ContextPtr & context) { ColumnsDescription columns; @@ -99,11 +103,11 @@ IRowSchemaReader::IRowSchemaReader(ReadBuffer & in_, const FormatSettings & form NamesAndTypesList IRowSchemaReader::readSchema() { - if (max_rows_to_read == 0) + if (max_rows_to_read == 0 || max_bytes_to_read == 0) throw Exception( ErrorCodes::BAD_ARGUMENTS, - "Cannot read rows to determine the schema, the maximum number of rows to read is set to 0. " - "Most likely setting input_format_max_rows_to_read_for_schema_inference is set to 0"); + "Cannot read rows to determine the schema, the maximum number of rows (or bytes) to read is set to 0. " + "Most likely setting input_format_max_rows_to_read_for_schema_inference or input_format_max_bytes_to_read_for_schema_inference is set to 0"); DataTypes data_types = readRowAndGetDataTypes(); @@ -143,7 +147,7 @@ NamesAndTypesList IRowSchemaReader::readSchema() data_types[i] = hint_it->second; } - for (rows_read = 1; rows_read < max_rows_to_read; ++rows_read) + for (rows_read = 1; rows_read < max_rows_to_read && in.count() < max_bytes_to_read; ++rows_read) { DataTypes new_data_types = readRowAndGetDataTypes(); if (new_data_types.empty()) @@ -220,11 +224,11 @@ IRowWithNamesSchemaReader::IRowWithNamesSchemaReader(ReadBuffer & in_, const For NamesAndTypesList IRowWithNamesSchemaReader::readSchema() { - if (max_rows_to_read == 0) + if (max_rows_to_read == 0 || max_bytes_to_read == 0) throw Exception( ErrorCodes::BAD_ARGUMENTS, - "Cannot read rows to determine the schema, the maximum number of rows to read is set to 0. " - "Most likely setting input_format_max_rows_to_read_for_schema_inference is set to 0"); + "Cannot read rows to determine the schema, the maximum number of rows (or bytes) to read is set to 0. " + "Most likely setting input_format_max_rows_to_read_for_schema_inference or input_format_max_bytes_to_read_for_schema_inference is set to 0"); bool eof = false; auto names_and_types = readRowAndGetNamesAndDataTypes(eof); @@ -245,7 +249,7 @@ NamesAndTypesList IRowWithNamesSchemaReader::readSchema() names_order.push_back(name); } - for (rows_read = 1; rows_read < max_rows_to_read; ++rows_read) + for (rows_read = 1; rows_read < max_rows_to_read && in.count() < max_bytes_to_read; ++rows_read) { auto new_names_and_types = readRowAndGetNamesAndDataTypes(eof); if (eof) diff --git a/src/Processors/Formats/ISchemaReader.h b/src/Processors/Formats/ISchemaReader.h index 78b34a07840..40702198a57 100644 --- a/src/Processors/Formats/ISchemaReader.h +++ b/src/Processors/Formats/ISchemaReader.h @@ -32,7 +32,7 @@ public: virtual bool needContext() const { return false; } virtual void setContext(ContextPtr &) {} - virtual void setMaxRowsToRead(size_t) {} + virtual void setMaxRowsAndBytesToRead(size_t, size_t) {} virtual size_t getNumRowsRead() const { return 0; } virtual ~ISchemaReader() = default; @@ -54,12 +54,17 @@ public: virtual void transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type); protected: - void setMaxRowsToRead(size_t max_rows) override { max_rows_to_read = max_rows; } + void setMaxRowsAndBytesToRead(size_t max_rows, size_t max_bytes) override + { + max_rows_to_read = max_rows; + max_bytes_to_read = max_bytes; + } size_t getNumRowsRead() const override { return rows_read; } virtual void transformFinalTypeIfNeeded(DataTypePtr &) {} size_t max_rows_to_read; + size_t max_bytes_to_read; size_t rows_read = 0; DataTypePtr default_type; String hints_str; diff --git a/src/Processors/Formats/Impl/JSONColumnsBlockInputFormat.cpp b/src/Processors/Formats/Impl/JSONColumnsBlockInputFormat.cpp index 8d4c4b0c6cf..3d003658e64 100644 --- a/src/Processors/Formats/Impl/JSONColumnsBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/JSONColumnsBlockInputFormat.cpp @@ -55,7 +55,7 @@ void registerJSONColumnsSchemaReader(FormatFactory & factory) ); factory.registerAdditionalInfoForSchemaCacheGetter("JSONColumns", [](const FormatSettings & settings) { - return getAdditionalFormatInfoByEscapingRule(settings, FormatSettings::EscapingRule::JSON); + return getAdditionalFormatInfoForAllRowBasedFormats(settings) + getAdditionalFormatInfoByEscapingRule(settings, FormatSettings::EscapingRule::JSON); }); } diff --git a/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.cpp b/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.cpp index 2e264c59f56..84a07ebc8fb 100644 --- a/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.cpp +++ b/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.cpp @@ -176,6 +176,8 @@ JSONColumnsSchemaReaderBase::JSONColumnsSchemaReaderBase( , hints_str(format_settings_.schema_inference_hints) , reader(std::move(reader_)) , column_names_from_settings(splitColumnNames(format_settings_.column_names_for_schema_inference)) + , max_rows_to_read(format_settings_.max_rows_to_read_for_schema_inference) + , max_bytes_to_read(format_settings_.max_bytes_to_read_for_schema_inference) { } @@ -196,12 +198,12 @@ void JSONColumnsSchemaReaderBase::transformTypesIfNeeded(DataTypePtr & type, Dat NamesAndTypesList JSONColumnsSchemaReaderBase::readSchema() { - size_t total_rows_read = 0; std::unordered_map names_to_types; std::vector names_order; /// Read data block by block and determine the type for each column - /// until max_rows_to_read_for_schema_inference is reached. - while (total_rows_read < format_settings.max_rows_to_read_for_schema_inference) + /// until max_rows_to_read/max_bytes_to_read is reached. + /// Note that we can exceed max_bytes_to_read to compete block parsing. + while (total_rows_read < max_rows_to_read && in.count() < max_bytes_to_read) { if (in.eof()) break; @@ -268,7 +270,7 @@ NamesAndTypesList JSONColumnsSchemaReaderBase::readSchema() return result; } -DataTypePtr JSONColumnsSchemaReaderBase::readColumnAndGetDataType(const String & column_name, size_t & rows_read, size_t max_rows_to_read) +DataTypePtr JSONColumnsSchemaReaderBase::readColumnAndGetDataType(const String & column_name, size_t & rows_read, size_t max_rows) { /// Check for empty column. if (reader->checkColumnEnd()) @@ -279,7 +281,7 @@ DataTypePtr JSONColumnsSchemaReaderBase::readColumnAndGetDataType(const String & do { /// If we reached max_rows_to_read, skip the rest part of this column. - if (rows_read == max_rows_to_read) + if (rows_read == max_rows) { reader->skipColumn(); break; diff --git a/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.h b/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.h index 2babc0734f9..886c8841540 100644 --- a/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.h +++ b/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.h @@ -82,11 +82,19 @@ public: bool needContext() const override { return !hints_str.empty(); } void setContext(ContextPtr & ctx) override; + void setMaxRowsAndBytesToRead(size_t max_rows, size_t max_bytes) override + { + max_rows_to_read = max_rows; + max_bytes_to_read = max_bytes; + } + + size_t getNumRowsRead() const override { return total_rows_read; } + private: NamesAndTypesList readSchema() override; - /// Read whole column in the block (up to max_rows_to_read rows) and extract the data type. - DataTypePtr readColumnAndGetDataType(const String & column_name, size_t & rows_read, size_t max_rows_to_read); + /// Read whole column in the block (up to max_rows rows) and extract the data type. + DataTypePtr readColumnAndGetDataType(const String & column_name, size_t & rows_read, size_t max_rows); const FormatSettings format_settings; String hints_str; @@ -95,6 +103,10 @@ private: std::unique_ptr reader; Names column_names_from_settings; JSONInferenceInfo inference_info; + + size_t total_rows_read = 0; + size_t max_rows_to_read; + size_t max_bytes_to_read; }; } diff --git a/src/Processors/Formats/Impl/JSONCompactColumnsBlockInputFormat.cpp b/src/Processors/Formats/Impl/JSONCompactColumnsBlockInputFormat.cpp index ade18d21892..09df7beaa73 100644 --- a/src/Processors/Formats/Impl/JSONCompactColumnsBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/JSONCompactColumnsBlockInputFormat.cpp @@ -53,7 +53,7 @@ void registerJSONCompactColumnsSchemaReader(FormatFactory & factory) ); factory.registerAdditionalInfoForSchemaCacheGetter("JSONCompactColumns", [](const FormatSettings & settings) { - auto result = getAdditionalFormatInfoByEscapingRule(settings, FormatSettings::EscapingRule::JSON); + auto result = getAdditionalFormatInfoForAllRowBasedFormats(settings) + getAdditionalFormatInfoByEscapingRule(settings, FormatSettings::EscapingRule::JSON); return result + fmt::format(", column_names_for_schema_inference={}", settings.column_names_for_schema_inference); }); } diff --git a/tests/queries/0_stateless/02783_max_bytes_to_read_in_schema_inference.reference b/tests/queries/0_stateless/02783_max_bytes_to_read_in_schema_inference.reference new file mode 100644 index 00000000000..d45098ddc0f --- /dev/null +++ b/tests/queries/0_stateless/02783_max_bytes_to_read_in_schema_inference.reference @@ -0,0 +1 @@ +a Nullable(Int64) diff --git a/tests/queries/0_stateless/02783_max_bytes_to_read_in_schema_inference.sql b/tests/queries/0_stateless/02783_max_bytes_to_read_in_schema_inference.sql new file mode 100644 index 00000000000..9dbf176472d --- /dev/null +++ b/tests/queries/0_stateless/02783_max_bytes_to_read_in_schema_inference.sql @@ -0,0 +1,4 @@ +set input_format_max_rows_to_read_for_schema_inference=2; +desc format('JSONEachRow', '{"a" : null}, {"a" : 42}') settings input_format_max_bytes_to_read_for_schema_inference=10; -- {serverError ONLY_NULLS_WHILE_READING_SCHEMA} +desc format('JSONEachRow', '{"a" : null}, {"a" : 42}') settings input_format_max_bytes_to_read_for_schema_inference=20; + From 79cbebaf0dcc98b947a78dcfa490493cf18f076b Mon Sep 17 00:00:00 2001 From: ltrk2 <107155950+ltrk2@users.noreply.github.com> Date: Mon, 5 Jun 2023 15:34:04 +0000 Subject: [PATCH 0486/1072] Remove unnecessary conditional expression --- src/Common/TransformEndianness.hpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/Common/TransformEndianness.hpp b/src/Common/TransformEndianness.hpp index 228490d24a1..4d690d75d9e 100644 --- a/src/Common/TransformEndianness.hpp +++ b/src/Common/TransformEndianness.hpp @@ -31,8 +31,7 @@ template requires is_decimal inline void transformEndianness(T & x) { - if constexpr (std::endian::native != endian) - transformEndianness(x.value); + transformEndianness(x.value); } template From 0832cb2d7a3bdd2c5e4721c8d8d71a14ce4485ee Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Mon, 5 Jun 2023 17:51:12 +0200 Subject: [PATCH 0487/1072] Update Metadata.h --- src/Interpreters/Cache/Metadata.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Interpreters/Cache/Metadata.h b/src/Interpreters/Cache/Metadata.h index 4732123fabc..64f91595822 100644 --- a/src/Interpreters/Cache/Metadata.h +++ b/src/Interpreters/Cache/Metadata.h @@ -22,7 +22,7 @@ struct FileSegmentMetadata : private boost::noncopyable size_t size() const; - bool evicting() const { return !removal_candidate.load(); } + bool evicting() const { return removal_candidate.load(); } Priority::Iterator getQueueIterator() const { return file_segment->getQueueIterator(); } From 654aee209f616aeef350e39cc3c3909862fa14e2 Mon Sep 17 00:00:00 2001 From: DanRoscigno Date: Mon, 5 Jun 2023 11:55:04 -0400 Subject: [PATCH 0488/1072] add video --- .../mergetree-family/invertedindexes.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/docs/en/engines/table-engines/mergetree-family/invertedindexes.md b/docs/en/engines/table-engines/mergetree-family/invertedindexes.md index 31f5a87a2b6..db3d6d0a479 100644 --- a/docs/en/engines/table-engines/mergetree-family/invertedindexes.md +++ b/docs/en/engines/table-engines/mergetree-family/invertedindexes.md @@ -15,6 +15,18 @@ tokenized cells of the string column. For example, the string cell "I will be a " wi", "wil", "ill", "ll ", "l b", " be" etc. The more fine-granular the input strings are tokenized, the bigger but also the more useful the resulting inverted index will be. +
+ +
+ :::note Inverted indexes are experimental and should not be used in production environments yet. They may change in the future in backward-incompatible ways, for example with respect to their DDL/DQL syntax or performance/compression characteristics. From 67af505ed63fc49d253f67f75b814dcf551e3a2c Mon Sep 17 00:00:00 2001 From: avogar Date: Mon, 5 Jun 2023 17:04:55 +0000 Subject: [PATCH 0489/1072] Respect setting input_format_as_default in schema inference --- src/Processors/Formats/ISchemaReader.cpp | 6 ++++++ .../02784_schema_inference_null_as_default.reference | 9 +++++++++ .../02784_schema_inference_null_as_default.sql | 7 +++++++ 3 files changed, 22 insertions(+) create mode 100644 tests/queries/0_stateless/02784_schema_inference_null_as_default.reference create mode 100644 tests/queries/0_stateless/02784_schema_inference_null_as_default.sql diff --git a/src/Processors/Formats/ISchemaReader.cpp b/src/Processors/Formats/ISchemaReader.cpp index c96cb373a2d..0cb6499f423 100644 --- a/src/Processors/Formats/ISchemaReader.cpp +++ b/src/Processors/Formats/ISchemaReader.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -54,6 +55,11 @@ void checkFinalInferredType( if (settings.schema_inference_make_columns_nullable) type = makeNullableRecursively(type); + /// In case when data for some column could contain nulls and regular values, + /// resulting inferred type is Nullable. + /// If input_format_null_as_default is enabled, we should remove Nullable type. + else if (settings.null_as_default) + type = removeNullable(type); } IIRowSchemaReader::IIRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_, DataTypePtr default_type_) diff --git a/tests/queries/0_stateless/02784_schema_inference_null_as_default.reference b/tests/queries/0_stateless/02784_schema_inference_null_as_default.reference new file mode 100644 index 00000000000..c83819ab2e2 --- /dev/null +++ b/tests/queries/0_stateless/02784_schema_inference_null_as_default.reference @@ -0,0 +1,9 @@ +x Nullable(Int64) +\N +42 +x Nullable(Int64) +\N +42 +x Int64 +0 +42 diff --git a/tests/queries/0_stateless/02784_schema_inference_null_as_default.sql b/tests/queries/0_stateless/02784_schema_inference_null_as_default.sql new file mode 100644 index 00000000000..9c9f99d8283 --- /dev/null +++ b/tests/queries/0_stateless/02784_schema_inference_null_as_default.sql @@ -0,0 +1,7 @@ +desc format(JSONEachRow, '{"x" : null}, {"x" : 42}') settings schema_inference_make_columns_nullable=1; +select * from format(JSONEachRow, '{"x" : null}, {"x" : 42}') settings schema_inference_make_columns_nullable=1; +desc format(JSONEachRow, '{"x" : null}, {"x" : 42}') settings schema_inference_make_columns_nullable=0, input_format_null_as_default=0; +select * from format(JSONEachRow, '{"x" : null}, {"x" : 42}') settings schema_inference_make_columns_nullable=0, input_format_null_as_default=0; +desc format(JSONEachRow, '{"x" : null}, {"x" : 42}') settings schema_inference_make_columns_nullable=0, input_format_null_as_default=1; +select * from format(JSONEachRow, '{"x" : null}, {"x" : 42}') settings schema_inference_make_columns_nullable=0, input_format_null_as_default=1; + From 028e48dfa716c7e7aa9f5e3df56adb563d653b02 Mon Sep 17 00:00:00 2001 From: avogar Date: Mon, 5 Jun 2023 17:33:10 +0000 Subject: [PATCH 0490/1072] Update docs --- docs/en/interfaces/schema-inference.md | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/docs/en/interfaces/schema-inference.md b/docs/en/interfaces/schema-inference.md index c448d0aee47..bef858eaba0 100644 --- a/docs/en/interfaces/schema-inference.md +++ b/docs/en/interfaces/schema-inference.md @@ -1192,7 +1192,7 @@ DESC format(JSONEachRow, '{"id" : 1, "age" : 25, "name" : "Josh", "status" : nul #### schema_inference_make_columns_nullable Controls making inferred types `Nullable` in schema inference for formats without information about nullability. -If the setting is enabled, all inferred type will be `Nullable`, if disabled, the inferred type will be `Nullable` only if the column contains `NULL` in a sample that is parsed during schema inference. +If the setting is enabled, all inferred type will be `Nullable`, if disabled, the inferred type will be `Nullable` only if `input_format_null_as_default` is disabled and the column contains `NULL` in a sample that is parsed during schema inference. Enabled by default. @@ -1215,7 +1215,8 @@ DESC format(JSONEachRow, $$ └─────────┴─────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘ ``` ```sql -SET schema_inference_make_columns_nullable = 0 +SET schema_inference_make_columns_nullable = 0; +SET input_format_null_as_default = 0; DESC format(JSONEachRow, $$ {"id" : 1, "age" : 25, "name" : "Josh", "status" : null, "hobbies" : ["football", "cooking"]} {"id" : 2, "age" : 19, "name" : "Alan", "status" : "married", "hobbies" : ["tennis", "art"]} @@ -1232,6 +1233,25 @@ DESC format(JSONEachRow, $$ └─────────┴──────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘ ``` +```sql +SET schema_inference_make_columns_nullable = 0; +SET input_format_null_as_default = 1; +DESC format(JSONEachRow, $$ + {"id" : 1, "age" : 25, "name" : "Josh", "status" : null, "hobbies" : ["football", "cooking"]} + {"id" : 2, "age" : 19, "name" : "Alan", "status" : "married", "hobbies" : ["tennis", "art"]} + $$) +``` +```response + +┌─name────┬─type──────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐ +│ id │ Int64 │ │ │ │ │ │ +│ age │ Int64 │ │ │ │ │ │ +│ name │ String │ │ │ │ │ │ +│ status │ String │ │ │ │ │ │ +│ hobbies │ Array(String) │ │ │ │ │ │ +└─────────┴───────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘ +``` + #### input_format_try_infer_integers If enabled, ClickHouse will try to infer integers instead of floats in schema inference for text formats. From 35439a8b06501460fe9162e09eae0fa9b334d1a1 Mon Sep 17 00:00:00 2001 From: ltrk2 <107155950+ltrk2@users.noreply.github.com> Date: Mon, 5 Jun 2023 10:47:52 -0700 Subject: [PATCH 0491/1072] Use reverse iterator for little-endian version --- src/IO/ReadHelpers.cpp | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/src/IO/ReadHelpers.cpp b/src/IO/ReadHelpers.cpp index 99b3e4b514b..1bd67e240c9 100644 --- a/src/IO/ReadHelpers.cpp +++ b/src/IO/ReadHelpers.cpp @@ -53,15 +53,18 @@ UUID parseUUID(std::span src) const auto * src_ptr = src.data(); auto * dst = reinterpret_cast(&uuid); const auto size = src.size(); + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + const std::reverse_iterator dst_it(dst + sizeof(UUID)); +#endif if (size == 36) { #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - const std::reverse_iterator dst_it(dst + sizeof(UUID)); - parseHex<4>(src_ptr, dst + 8); - parseHex<2>(src_ptr + 9, dst + 12); - parseHex<2>(src_ptr + 14, dst + 14); - parseHex<2>(src_ptr + 19, dst); - parseHex<6>(src_ptr + 24, dst + 2); + parseHex<4>(src_ptr, dst_it + 8); + parseHex<2>(src_ptr + 9, dst_it + 12); + parseHex<2>(src_ptr + 14, dst_it + 14); + parseHex<2>(src_ptr + 19, dst_it); + parseHex<6>(src_ptr + 24, dst_it + 2); #else parseHex<4>(src_ptr, dst); parseHex<2>(src_ptr + 9, dst + 4); @@ -73,8 +76,8 @@ UUID parseUUID(std::span src) else if (size == 32) { #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - parseHex<8>(src_ptr, dst + 8); - parseHex<8>(src_ptr + 16, dst); + parseHex<8>(src_ptr, dst_it + 8); + parseHex<8>(src_ptr + 16, dst_it); #else parseHex<16>(src_ptr, dst); #endif From ad85faabd1941b891b406225ecf7c6b568c8328f Mon Sep 17 00:00:00 2001 From: Nikolay Degterinsky Date: Mon, 5 Jun 2023 16:09:53 +0000 Subject: [PATCH 0492/1072] Fix test --- tests/integration/test_storage_mongodb/test.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_storage_mongodb/test.py b/tests/integration/test_storage_mongodb/test.py index e6e77c64515..6ce71fb91fa 100644 --- a/tests/integration/test_storage_mongodb/test.py +++ b/tests/integration/test_storage_mongodb/test.py @@ -71,6 +71,7 @@ def test_simple_select(started_cluster): simple_mongo_table.drop() +@pytest.mark.parametrize("started_cluster", [False], indirect=["started_cluster"]) def test_simple_select_from_view(started_cluster): mongo_connection = get_mongo_connection(started_cluster) db = mongo_connection["test"] @@ -86,7 +87,7 @@ def test_simple_select_from_view(started_cluster): node = started_cluster.instances["node"] node.query( - "CREATE TABLE simple_mongo_table(key UInt64, data String) ENGINE = MongoDB('mongo2:27017', 'test', 'simple_table_view', 'root', 'clickhouse')" + "CREATE TABLE simple_mongo_table(key UInt64, data String) ENGINE = MongoDB('mongo1:27017', 'test', 'simple_table_view', 'root', 'clickhouse')" ) assert node.query("SELECT COUNT() FROM simple_mongo_table") == "100\n" From e8c6c7967b38ebdd467fb9c04966419731b5a689 Mon Sep 17 00:00:00 2001 From: avogar Date: Mon, 5 Jun 2023 18:21:40 +0000 Subject: [PATCH 0493/1072] Move attaching gdb to separate lib --- docker/test/stateless/run.sh | 41 +++-------------------------------- tests/ci/attach_gdb.lib | 42 ++++++++++++++++++++++++++++++++++++ tests/ci/stress_tests.lib | 41 +++-------------------------------- 3 files changed, 48 insertions(+), 76 deletions(-) create mode 100644 tests/ci/attach_gdb.lib diff --git a/docker/test/stateless/run.sh b/docker/test/stateless/run.sh index dfee7d84cde..df650b37cc6 100755 --- a/docker/test/stateless/run.sh +++ b/docker/test/stateless/run.sh @@ -15,6 +15,8 @@ dpkg -i package_folder/clickhouse-client_*.deb ln -s /usr/share/clickhouse-test/clickhouse-test /usr/bin/clickhouse-test +source /usr/share/clickhouse-test/ci/attach_gdb.lib + # install test configs /usr/share/clickhouse-test/config/install.sh @@ -85,44 +87,7 @@ fi sleep 5 -# Set follow-fork-mode to parent, because we attach to clickhouse-server, not to watchdog -# and clickhouse-server can do fork-exec, for example, to run some bridge. -# Do not set nostop noprint for all signals, because some it may cause gdb to hang, -# explicitly ignore non-fatal signals that are used by server. -# Number of SIGRTMIN can be determined only in runtime. -RTMIN=$(kill -l SIGRTMIN) -echo " -set follow-fork-mode parent -handle SIGHUP nostop noprint pass -handle SIGINT nostop noprint pass -handle SIGQUIT nostop noprint pass -handle SIGPIPE nostop noprint pass -handle SIGTERM nostop noprint pass -handle SIGUSR1 nostop noprint pass -handle SIGUSR2 nostop noprint pass -handle SIG$RTMIN nostop noprint pass -info signals -continue -backtrace full -thread apply all backtrace full -info registers -disassemble /s -up -disassemble /s -up -disassemble /s -p \"done\" -detach -quit -" > script.gdb - -# FIXME Hung check may work incorrectly because of attached gdb -# 1. False positives are possible -# 2. We cannot attach another gdb to get stacktraces if some queries hung -gdb -batch -command script.gdb -p "$(cat /var/run/clickhouse-server/clickhouse-server.pid)" | ts '%Y-%m-%d %H:%M:%S' >> /test_output/gdb.log & -sleep 5 -# gdb will send SIGSTOP, spend some time loading debug info and then send SIGCONT, wait for it (up to send_timeout, 300s) -time clickhouse-client --query "SELECT 'Connected to clickhouse-server after attaching gdb'" ||: +attach_gdb_to_clickhouse function run_tests() { diff --git a/tests/ci/attach_gdb.lib b/tests/ci/attach_gdb.lib new file mode 100644 index 00000000000..2df6243f796 --- /dev/null +++ b/tests/ci/attach_gdb.lib @@ -0,0 +1,42 @@ +#!/bin/bash + +function attach_gdb_to_clickhouse() +{ + # Set follow-fork-mode to parent, because we attach to clickhouse-server, not to watchdog + # and clickhouse-server can do fork-exec, for example, to run some bridge. + # Do not set nostop noprint for all signals, because some it may cause gdb to hang, + # explicitly ignore non-fatal signals that are used by server. + # Number of SIGRTMIN can be determined only in runtime. + RTMIN=$(kill -l SIGRTMIN) + echo " +set follow-fork-mode parent +handle SIGHUP nostop noprint pass +handle SIGINT nostop noprint pass +handle SIGQUIT nostop noprint pass +handle SIGPIPE nostop noprint pass +handle SIGTERM nostop noprint pass +handle SIGUSR1 nostop noprint pass +handle SIGUSR2 nostop noprint pass +handle SIG$RTMIN nostop noprint pass +info signals +continue +backtrace full +thread apply all backtrace full +info registers +disassemble /s +up +disassemble /s +up +disassemble /s +p \"done\" +detach +quit +" > script.gdb + + # FIXME Hung check may work incorrectly because of attached gdb + # We cannot attach another gdb to get stacktraces if some queries hung + gdb -batch -command script.gdb -p "$(cat /var/run/clickhouse-server/clickhouse-server.pid)" | ts '%Y-%m-%d %H:%M:%S' >> /test_output/gdb.log & + sleep 5 + # gdb will send SIGSTOP, spend some time loading debug info and then send SIGCONT, wait for it (up to send_timeout, 300s) + time clickhouse-client --query "SELECT 'Connected to clickhouse-server after attaching gdb'" ||: +} diff --git a/tests/ci/stress_tests.lib b/tests/ci/stress_tests.lib index 04df50b3248..2b8ac77b952 100644 --- a/tests/ci/stress_tests.lib +++ b/tests/ci/stress_tests.lib @@ -9,6 +9,8 @@ FAIL="\tFAIL\t\\N\t" FAILURE_CONTEXT_LINES=100 FAILURE_CONTEXT_MAX_LINE_WIDTH=300 +source attach_gdb.lib + function escaped() { # That's the simplest way I found to escape a string in bash. Yep, bash is the most convenient programming language. @@ -184,44 +186,7 @@ function start() counter=$((counter + 1)) done - # Set follow-fork-mode to parent, because we attach to clickhouse-server, not to watchdog - # and clickhouse-server can do fork-exec, for example, to run some bridge. - # Do not set nostop noprint for all signals, because some it may cause gdb to hang, - # explicitly ignore non-fatal signals that are used by server. - # Number of SIGRTMIN can be determined only in runtime. - RTMIN=$(kill -l SIGRTMIN) - echo " -set follow-fork-mode parent -handle SIGHUP nostop noprint pass -handle SIGINT nostop noprint pass -handle SIGQUIT nostop noprint pass -handle SIGPIPE nostop noprint pass -handle SIGTERM nostop noprint pass -handle SIGUSR1 nostop noprint pass -handle SIGUSR2 nostop noprint pass -handle SIG$RTMIN nostop noprint pass -info signals -continue -backtrace full -thread apply all backtrace full -info registers -disassemble /s -up -disassemble /s -up -disassemble /s -p \"done\" -detach -quit -" > script.gdb - - # FIXME Hung check may work incorrectly because of attached gdb - # 1. False positives are possible - # 2. We cannot attach another gdb to get stacktraces if some queries hung - gdb -batch -command script.gdb -p "$(cat /var/run/clickhouse-server/clickhouse-server.pid)" | ts '%Y-%m-%d %H:%M:%S' >> /test_output/gdb.log & - sleep 5 - # gdb will send SIGSTOP, spend some time loading debug info and then send SIGCONT, wait for it (up to send_timeout, 300s) - time clickhouse-client --query "SELECT 'Connected to clickhouse-server after attaching gdb'" ||: + attach_gdb_to_clickhouse } function check_server_start() From dedb9067ce695bc8324997484aa627722a64ebbd Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Mon, 5 Jun 2023 20:36:17 +0200 Subject: [PATCH 0494/1072] WIP : Azure Table Function, added read and StorageAzureSource --- src/Common/ProfileEvents.cpp | 3 + src/Interpreters/ActionsDAG.cpp | 2 + src/Storages/StorageAzure.cpp | 116 ++++++++++++++++- src/Storages/StorageAzure.h | 39 +++++- src/TableFunctions/CMakeLists.txt | 2 +- src/TableFunctions/ITableFunctionCluster.h | 1 + src/TableFunctions/TableFunctionAzure.cpp | 118 ++++++++++++++++++ src/TableFunctions/TableFunctionAzure.h | 72 +++++++++++ src/TableFunctions/registerTableFunctions.cpp | 6 + src/TableFunctions/registerTableFunctions.h | 4 + 10 files changed, 354 insertions(+), 9 deletions(-) create mode 100644 src/TableFunctions/TableFunctionAzure.cpp create mode 100644 src/TableFunctions/TableFunctionAzure.h diff --git a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp index fdee9902634..3cc41c1972d 100644 --- a/src/Common/ProfileEvents.cpp +++ b/src/Common/ProfileEvents.cpp @@ -348,6 +348,9 @@ The server successfully detected this situation and will download merged part fr M(S3PutObject, "Number of S3 API PutObject calls.") \ M(S3GetObject, "Number of S3 API GetObject calls.") \ \ + M(AzureDeleteObjects, "Number of S3 API DeleteObject(s) calls.") \ + M(AzureListObjects, "Number of S3 API ListObjects calls.") \ + \ M(DiskS3DeleteObjects, "Number of DiskS3 API DeleteObject(s) calls.") \ M(DiskS3CopyObject, "Number of DiskS3 API CopyObject calls.") \ M(DiskS3ListObjects, "Number of DiskS3 API ListObjects calls.") \ diff --git a/src/Interpreters/ActionsDAG.cpp b/src/Interpreters/ActionsDAG.cpp index cbf6cc1cbe3..94bdca60e69 100644 --- a/src/Interpreters/ActionsDAG.cpp +++ b/src/Interpreters/ActionsDAG.cpp @@ -598,6 +598,8 @@ Block ActionsDAG::updateHeader(Block header) const } ColumnsWithTypeAndName result_columns; + + result_columns.reserve(outputs.size()); struct Frame diff --git a/src/Storages/StorageAzure.cpp b/src/Storages/StorageAzure.cpp index 30fd3fcbe95..683da3a9825 100644 --- a/src/Storages/StorageAzure.cpp +++ b/src/Storages/StorageAzure.cpp @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -32,6 +33,9 @@ #include #include +#include +#include + using namespace Azure::Storage::Blobs; @@ -52,6 +56,8 @@ bool isConnectionString(const std::string & candidate) StorageAzure::Configuration StorageAzure::getConfiguration(ASTs & engine_args, ContextPtr local_context, bool get_format_from_file) { + LOG_INFO(&Poco::Logger::get("StorageAzure"), "get_format_from_file = {}", get_format_from_file); + StorageAzure::Configuration configuration; /// Supported signatures: @@ -74,6 +80,11 @@ StorageAzure::Configuration StorageAzure::getConfiguration(ASTs & engine_args, C configuration.container = checkAndGetLiteralArgument(engine_args[1], "container"); configuration.blob_path = checkAndGetLiteralArgument(engine_args[2], "blobpath"); + LOG_INFO(&Poco::Logger::get("StorageAzure"), "connection_url = {}", configuration.connection_url); + LOG_INFO(&Poco::Logger::get("StorageAzure"), "container = {}", configuration.container); + LOG_INFO(&Poco::Logger::get("StorageAzure"), "blobpath = {}", configuration.blob_path); + + auto is_format_arg = [] (const std::string & s) -> bool { return s == "auto" || FormatFactory::instance().getAllFormats().contains(s); @@ -81,6 +92,7 @@ StorageAzure::Configuration StorageAzure::getConfiguration(ASTs & engine_args, C if (engine_args.size() == 4) { + //'c1 UInt64, c2 UInt64 auto fourth_arg = checkAndGetLiteralArgument(engine_args[3], "format/account_name"); if (is_format_arg(fourth_arg)) { @@ -143,8 +155,13 @@ StorageAzure::Configuration StorageAzure::getConfiguration(ASTs & engine_args, C configuration.blobs_paths = {configuration.blob_path}; - if (configuration.format == "auto" && get_format_from_file) - configuration.format = FormatFactory::instance().getFormatFromFileName(configuration.blob_path, true); + + LOG_INFO(&Poco::Logger::get("StorageAzure"), "get_format_from_file = {}", get_format_from_file); + +// if (configuration.format == "auto" && get_format_from_file) +// configuration.format = FormatFactory::instance().getFormatFromFileName(configuration.blob_path, true); + + configuration.format = "TSV"; return configuration; } @@ -215,6 +232,7 @@ AzureClientPtr StorageAzure::createClient(StorageAzure::Configuration configurat if (configuration.is_connection_string) { + LOG_INFO(&Poco::Logger::get("StorageAzure"), "createClient is_connection_string "); result = std::make_unique(BlobContainerClient::CreateFromConnectionString(configuration.connection_url, configuration.container)); } else @@ -228,8 +246,14 @@ AzureClientPtr StorageAzure::createClient(StorageAzure::Configuration configurat auto managed_identity_credential = std::make_shared(); result = std::make_unique(configuration.connection_url, managed_identity_credential); + + LOG_INFO(&Poco::Logger::get("StorageAzure"), "createClient account_name & account_key "); } + + + + return result; } @@ -251,15 +275,13 @@ StorageAzure::StorageAzure( , format_settings(format_settings_) , partition_by(partition_by_) { - FormatFactory::instance().checkFormatName(configuration.format); +// FormatFactory::instance().checkFormatName(configuration.format); context_->getGlobalContext()->getRemoteHostFilter().checkURL(Poco::URI(configuration.getConnectionURL())); StorageInMemoryMetadata storage_metadata; if (columns_.empty()) { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Schema inference is not supported yet"); - //auto columns = getTableStructureFromDataImpl(configuration, format_settings, context_); - //storage_metadata.setColumns(columns); } else storage_metadata.setColumns(columns_); @@ -268,11 +290,28 @@ StorageAzure::StorageAzure( storage_metadata.setComment(comment); setInMemoryMetadata(storage_metadata); + StoredObjects objects; + for (const auto & key : configuration.blobs_paths) + objects.emplace_back(key); + + for (auto obj : objects) + { + LOG_INFO(&Poco::Logger::get("StorageAzure"), "constructor obj.remote_paths = {}", obj.remote_path); + if (object_storage->exists(obj)) + { + LOG_INFO(&Poco::Logger::get("StorageAzure"), "constructor exists obj.remote_paths = {}", obj.remote_path); +// auto read_buffer = object_storage->readObject(obj); +// LOG_INFO(&Poco::Logger::get("StorageAzure"), "constructor read size obj.remote_paths = {} , size = {}", obj.remote_path, read_buffer->getFileSize()); + } + } + + auto default_virtuals = NamesAndTypesList{ {"_path", std::make_shared(std::make_shared())}, {"_file", std::make_shared(std::make_shared())}}; auto columns = storage_metadata.getSampleBlock().getNamesAndTypesList(); + virtual_columns = getVirtualsForStorage(columns, default_virtuals); for (const auto & column : virtual_columns) virtual_block.insert({column.type->createColumn(), column.type, column.name}); @@ -435,6 +474,35 @@ private: } + +Pipe StorageAzure::read( + const Names & /*column_names*/ , + const StorageSnapshotPtr & storage_snapshot, + SelectQueryInfo & /*query_info*/, + ContextPtr context, + QueryProcessingStage::Enum /*processed_stage*/, + size_t max_block_size, + size_t /*num_streams*/) +{ + Pipes pipes; + + StoredObjects objects; + for (const auto & key : configuration.blobs_paths) + objects.emplace_back(key); + + auto reader = object_storage->readObjects(objects); + auto block_for_format = storage_snapshot->metadata->getSampleBlock(); + + for (auto col : block_for_format.getColumns()) + LOG_INFO(&Poco::Logger::get("StorageAzure"), "read col = {}",col->getName()); + + + pipes.emplace_back(std::make_shared(std::move(reader), context, block_for_format, max_block_size)); + + + return Pipe::unitePipes(std::move(pipes)); +} + SinkToStoragePtr StorageAzure::write(const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, ContextPtr local_context) { auto sample_block = metadata_snapshot->getSampleBlock(); @@ -513,6 +581,44 @@ bool StorageAzure::supportsPartitionBy() const return true; } + +StorageAzureSource::StorageAzureSource (std::unique_ptr && read_buffer_, ContextPtr context_, + const Block & sample_block_,UInt64 max_block_size_) + :ISource(Block()) + , WithContext(context_) + , read_buffer(std::move(read_buffer_)) + , sample_block(sample_block_) + , max_block_size(max_block_size_) +{ + auto format = "TSV"; + + auto input_format = FormatFactory::instance().getInput( + format, *read_buffer, sample_block, getContext(), max_block_size); + + QueryPipelineBuilder builder; + builder.init(Pipe(input_format)); + + pipeline = std::make_unique(QueryPipelineBuilder::getPipeline(std::move(builder))); + reader = std::make_unique(*pipeline); +} + + +Chunk StorageAzureSource::generate() +{ + Chunk chunk; + if (reader->pull(chunk)) + { + LOG_INFO(&Poco::Logger::get("StorageAzureSource"), "pulled chunk rows = {}", chunk.getNumRows()); + + } + return chunk; +} + +String StorageAzureSource::getName() const +{ + return "StorageAzureSource"; +} + } #endif diff --git a/src/Storages/StorageAzure.h b/src/Storages/StorageAzure.h index b99df2e89a5..b93501ce2f2 100644 --- a/src/Storages/StorageAzure.h +++ b/src/Storages/StorageAzure.h @@ -48,10 +48,9 @@ public: std::string getConnectionURL() const { - if (!is_connection_string) +// if (!is_connection_string) return connection_url; - - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Connection string not implemented yet"); + //throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Connection string not implemented yet"); } std::string connection_url; @@ -78,6 +77,11 @@ public: static StorageAzure::Configuration getConfiguration(ASTs & engine_args, ContextPtr local_context, bool get_format_from_file = true); static AzureClientPtr createClient(StorageAzure::Configuration configuration); + static AzureObjectStorage::SettingsPtr createSettings(StorageAzure::Configuration configuration); + static ColumnsDescription getTableStructureFromData( + const StorageAzure::Configuration & configuration, + const std::optional & format_settings, + ContextPtr ctx); String getName() const override { @@ -114,6 +118,35 @@ private: std::optional format_settings; ASTPtr partition_by; + static ColumnsDescription getTableStructureFromDataImpl( + const Configuration & configuration, + const std::optional & format_settings, + ContextPtr ctx); + +}; + +class StorageAzureSource : public ISource, WithContext +{ +public: + StorageAzureSource (std::unique_ptr && read_buffer_, ContextPtr context_, const Block & sample_block_, UInt64 max_block_size_); + ~StorageAzureSource() override {} + + Chunk generate() override; + String getName() const override; + + +private: +// std::unique_ptr read_buffer; + + String path; + std::unique_ptr read_buffer; +// std::unique_ptr read_buf; + std::unique_ptr pipeline; + std::unique_ptr reader; + Block sample_block; + UInt64 max_block_size; + +// void createReader(); }; } diff --git a/src/TableFunctions/CMakeLists.txt b/src/TableFunctions/CMakeLists.txt index b1fa61a72ee..3544c5bf8b4 100644 --- a/src/TableFunctions/CMakeLists.txt +++ b/src/TableFunctions/CMakeLists.txt @@ -17,5 +17,5 @@ add_library(clickhouse_table_functions ${clickhouse_table_functions_sources}) target_link_libraries(clickhouse_table_functions PRIVATE clickhouse_parsers clickhouse_storages_system dbms) if (TARGET ch_contrib::hivemetastore) - target_link_libraries(clickhouse_table_functions PRIVATE ch_contrib::hivemetastore ch_contrib::hdfs ch_contrib::parquet) + target_link_libraries(clickhouse_table_functions PRIVATE ch_contrib::hivemetastore ch_contrib::hdfs ch_contrib::parquet ch_contrib::azure_sdk) endif () diff --git a/src/TableFunctions/ITableFunctionCluster.h b/src/TableFunctions/ITableFunctionCluster.h index ad88d7b54f0..f68558596ca 100644 --- a/src/TableFunctions/ITableFunctionCluster.h +++ b/src/TableFunctions/ITableFunctionCluster.h @@ -4,6 +4,7 @@ #include #include +#include #include #include #include diff --git a/src/TableFunctions/TableFunctionAzure.cpp b/src/TableFunctions/TableFunctionAzure.cpp new file mode 100644 index 00000000000..eb67ed9a983 --- /dev/null +++ b/src/TableFunctions/TableFunctionAzure.cpp @@ -0,0 +1,118 @@ +#include "config.h" + +#if USE_AZURE_BLOB_STORAGE + +//#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "registerTableFunctions.h" +#include +#include +#include + +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int LOGICAL_ERROR; +} + + +void TableFunctionAzure::parseArgumentsImpl(ASTs & args, const ContextPtr & context) +{ + if (args.size() != 5) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "The signature of table function {} shall be the following:\n{}", getName(), getSignature()); + + for (auto & arg : args) + arg = evaluateConstantExpressionOrIdentifierAsLiteral(arg, context); + + configuration.connection_url = checkAndGetLiteralArgument(args[0], "connection_url"); + configuration.container = checkAndGetLiteralArgument(args[1], "container"); + configuration.blob_path = checkAndGetLiteralArgument(args[2], "blob_path"); + configuration.format = checkAndGetLiteralArgument(args[3], "format"); + configuration.structure = checkAndGetLiteralArgument(args[4], "structure"); +} + +void TableFunctionAzure::parseArguments(const ASTPtr & ast_function, ContextPtr context) +{ + LOG_INFO(&Poco::Logger::get("TableFunctionAzure"), "parseArguments = {}", ast_function->dumpTree()); + + ASTs & args_func = ast_function->children; + + if (args_func.size() != 1) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Table function '{}' must have arguments.", getName()); + + auto & args = args_func.at(0)->children; + + parseArgumentsImpl(args, context); +} + +ColumnsDescription TableFunctionAzure::getActualTableStructure(ContextPtr context) const +{ + return parseColumnsListFromString(configuration.structure, context); +} + +bool TableFunctionAzure::supportsReadingSubsetOfColumns() +{ + return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(configuration.format); +} + +StoragePtr TableFunctionAzure::executeImpl(const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/) const +{ + LOG_INFO(&Poco::Logger::get("TableFunctionAzure"), "executeImpl = {}", table_name); + + ColumnsDescription columns; + columns = parseColumnsListFromString(configuration.structure, context); + + configuration.is_connection_string = true; + configuration.blobs_paths = {configuration.blob_path}; + + auto client = StorageAzure::createClient(configuration); + + StoragePtr storage = std::make_shared( + configuration, + std::make_unique(table_name, std::move(client), std::make_unique()), + context, + StorageID(getDatabaseName(), table_name), + columns, + ConstraintsDescription{}, + String{}, + /// No format_settings for table function Azure + std::nullopt, nullptr); + + storage->startup(); + + return storage; +} + +void registerTableFunctionAzure(TableFunctionFactory & factory) +{ + factory.registerFunction( + {.documentation + = {.description=R"(The table function can be used to read the data stored on Azure Blob Storage.)", + .examples{{"azure_blob", "SELECT * FROM azure_blob(connection, container, blob_path, format, structure)", ""}}}, + .allow_readonly = false}); +} + +} + +#endif diff --git a/src/TableFunctions/TableFunctionAzure.h b/src/TableFunctions/TableFunctionAzure.h new file mode 100644 index 00000000000..a6fb5415113 --- /dev/null +++ b/src/TableFunctions/TableFunctionAzure.h @@ -0,0 +1,72 @@ +#pragma once + +#include "config.h" + +#if USE_AZURE_BLOB_STORAGE + +#include +#include + + +namespace DB +{ + +class Context; + +/* AzureBlob(source, [access_key_id, secret_access_key,] [format, structure, compression]) - creates a temporary storage for a file in AzureBlob. + */ +class TableFunctionAzure : public ITableFunction +{ +public: + static constexpr auto name = "azure_blob"; + static constexpr auto signature = "- connection_url, container, blob, format, structure\n"; + + static size_t getMaxNumberOfArguments() { return 5; } + + String getName() const override + { + return name; + } + + virtual String getSignature() const + { + return signature; + } + + bool hasStaticStructure() const override { return configuration.structure != "auto"; } + + bool needStructureHint() const override { return configuration.structure == "auto"; } + + void setStructureHint(const ColumnsDescription & structure_hint_) override { structure_hint = structure_hint_; } + + bool supportsReadingSubsetOfColumns() override; + + std::unordered_set getVirtualsToCheckBeforeUsingStructureHint() const override + { + return {"_path", "_file"}; + } + + virtual void parseArgumentsImpl(ASTs & args, const ContextPtr & context); + + static void addColumnsStructureToArguments(ASTs & args, const String & structure, const ContextPtr & context); + +protected: + + StoragePtr executeImpl( + const ASTPtr & ast_function, + ContextPtr context, + const std::string & table_name, + ColumnsDescription cached_columns) const override; + + const char * getStorageTypeName() const override { return "Azure"; } + + ColumnsDescription getActualTableStructure(ContextPtr context) const override; + void parseArguments(const ASTPtr & ast_function, ContextPtr context) override; + + mutable StorageAzure::Configuration configuration; + ColumnsDescription structure_hint; +}; + +} + +#endif diff --git a/src/TableFunctions/registerTableFunctions.cpp b/src/TableFunctions/registerTableFunctions.cpp index 4f3411df4c5..e0114368e44 100644 --- a/src/TableFunctions/registerTableFunctions.cpp +++ b/src/TableFunctions/registerTableFunctions.cpp @@ -71,6 +71,12 @@ void registerTableFunctions() registerTableFunctionFormat(factory); registerTableFunctionExplain(factory); + +#if USE_AZURE_BLOB_STORAGE + registerTableFunctionAzure(factory); +#endif + + } } diff --git a/src/TableFunctions/registerTableFunctions.h b/src/TableFunctions/registerTableFunctions.h index c51522a5e99..fa4fec2b03a 100644 --- a/src/TableFunctions/registerTableFunctions.h +++ b/src/TableFunctions/registerTableFunctions.h @@ -69,6 +69,10 @@ void registerTableFunctionFormat(TableFunctionFactory & factory); void registerTableFunctionExplain(TableFunctionFactory & factory); +#if USE_AZURE_BLOB_STORAGE +void registerTableFunctionAzure(TableFunctionFactory & factory); +#endif + void registerTableFunctions(); } From 59095c445d7a48ae12ac48c8748d4b48699a1274 Mon Sep 17 00:00:00 2001 From: Han Fei Date: Mon, 5 Jun 2023 20:44:20 +0200 Subject: [PATCH 0495/1072] Revert "Revert "make filter push down through cross join"" --- .../Optimizations/filterPushDown.cpp | 6 +++--- .../01763_filter_push_down_bugs.reference | 19 +++++++++++++++++++ .../01763_filter_push_down_bugs.sql | 19 +++++++++++++++++++ 3 files changed, 41 insertions(+), 3 deletions(-) diff --git a/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp b/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp index 37bc894339f..db29038999b 100644 --- a/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp +++ b/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp @@ -272,7 +272,7 @@ size_t tryPushDownFilter(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes { /// If totals step has HAVING expression, skip it for now. /// TODO: - /// We can merge HAVING expression with current filer. + /// We can merge HAVING expression with current filter. /// Also, we can push down part of HAVING which depend only on aggregation keys. if (totals_having->getActions()) return 0; @@ -323,9 +323,9 @@ size_t tryPushDownFilter(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes { const auto & table_join = join ? join->getJoin()->getTableJoin() : filled_join->getJoin()->getTableJoin(); - /// Only inner and left(/right) join are supported. Other types may generate default values for left table keys. + /// Only inner, cross and left(/right) join are supported. Other types may generate default values for left table keys. /// So, if we push down a condition like `key != 0`, not all rows may be filtered. - if (table_join.kind() != JoinKind::Inner && table_join.kind() != kind) + if (table_join.kind() != JoinKind::Inner && table_join.kind() != JoinKind::Cross && table_join.kind() != kind) return 0; bool is_left = kind == JoinKind::Left; diff --git a/tests/queries/0_stateless/01763_filter_push_down_bugs.reference b/tests/queries/0_stateless/01763_filter_push_down_bugs.reference index 5aa2e645509..7df35e2948d 100644 --- a/tests/queries/0_stateless/01763_filter_push_down_bugs.reference +++ b/tests/queries/0_stateless/01763_filter_push_down_bugs.reference @@ -6,3 +6,22 @@ String1_0 String2_0 String3_0 String4_0 1 String1_0 String2_0 String3_0 String4_0 1 1 [0,1,2] 1 +Expression ((Projection + Before ORDER BY)) + Filter (WHERE) + Join (JOIN FillRightFirst) + Filter (( + Before JOIN)) + ReadFromMergeTree (default.t1) + Indexes: + PrimaryKey + Keys: + id + Condition: (id in [101, 101]) + Parts: 1/1 + Granules: 1/1 + Expression ((Joined actions + (Rename joined columns + (Projection + Before ORDER BY)))) + ReadFromMergeTree (default.t2) + Indexes: + PrimaryKey + Condition: true + Parts: 1/1 + Granules: 1/1 diff --git a/tests/queries/0_stateless/01763_filter_push_down_bugs.sql b/tests/queries/0_stateless/01763_filter_push_down_bugs.sql index 1058bf75144..2ee249b5ce7 100644 --- a/tests/queries/0_stateless/01763_filter_push_down_bugs.sql +++ b/tests/queries/0_stateless/01763_filter_push_down_bugs.sql @@ -38,6 +38,25 @@ DROP TABLE IF EXISTS Test; select x, y from (select [0, 1, 2] as y, 1 as a, 2 as b) array join y as x where a = 1 and b = 2 and (x = 1 or x != 1) and x = 1; +DROP TABLE IF EXISTS t; create table t(a UInt8) engine=MergeTree order by a; insert into t select * from numbers(2); select a from t t1 join t t2 on t1.a = t2.a where t1.a; +DROP TABLE IF EXISTS t; + +DROP TABLE IF EXISTS t1; +DROP TABLE IF EXISTS t2; +CREATE TABLE t1 (id Int64, create_time DateTime) ENGINE = MergeTree ORDER BY id; +CREATE TABLE t2 (delete_time DateTime) ENGINE = MergeTree ORDER BY delete_time; + +insert into t1 values (101, '2023-05-28 00:00:00'), (102, '2023-05-28 00:00:00'); +insert into t2 values ('2023-05-31 00:00:00'); + +EXPLAIN indexes=1 SELECT id, delete_time FROM t1 + CROSS JOIN ( + SELECT delete_time + FROM t2 +) AS d WHERE create_time < delete_time AND id = 101; + +DROP TABLE IF EXISTS t1; +DROP TABLE IF EXISTS t2; From 4da37a731962ab452ef6a7fb75025f0a6f4e2a51 Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Mon, 5 Jun 2023 16:19:50 -0300 Subject: [PATCH 0496/1072] Update argmax.md --- .../aggregate-functions/reference/argmax.md | 60 ++++++++++++++++++- 1 file changed, 59 insertions(+), 1 deletion(-) diff --git a/docs/en/sql-reference/aggregate-functions/reference/argmax.md b/docs/en/sql-reference/aggregate-functions/reference/argmax.md index 65c43ab04c0..9aaa35dc6d8 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/argmax.md +++ b/docs/en/sql-reference/aggregate-functions/reference/argmax.md @@ -5,7 +5,8 @@ sidebar_position: 106 # argMax -Calculates the `arg` value for a maximum `val` value. If there are several different values of `arg` for maximum values of `val`, returns the first of these values encountered. +Calculates the `arg` value for a maximum `val` value. If there are several different values of `arg` for maximum values of `val`, returns the first of these values encountered. +Both parts the `arg` and the `max` behave as aggregate functions, they skip `Null` during processing and return not-Null values if not-Null values are available. **Syntax** @@ -49,3 +50,60 @@ Result: │ director │ └──────────────────────┘ ``` + +**Extended example** + +```sql +CREATE TABLE test +( + a Nullable(String), + b Nullable(Int64) +) +ENGINE = Memory AS +SELECT * +FROM values(('a', 1), ('b', 2), ('c', 2), (NULL, 3), (NULL, NULL), ('d', NULL)); + +select * from test; +┌─a────┬────b─┐ +│ a │ 1 │ +│ b │ 2 │ +│ c │ 2 │ +│ ᴺᵁᴸᴸ │ 3 │ +│ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ +│ d │ ᴺᵁᴸᴸ │ +└──────┴──────┘ + +select argMax(a, b), max(b) from test; +┌─argMax(a, b)─┬─max(b)─┐ +│ b │ 3 │ -- argMax = b because it the first not-Null value, max(b) is from another row! +└──────────────┴────────┘ + +select argMax(tuple(a), b) from test; +┌─argMax(tuple(a), b)─┐ +│ (NULL) │ -- Tuple allows to get Null value. +└─────────────────────┘ + +select (argMax((a, b), b) as t).1 argMaxA, t.2 argMaxB from test; +┌─argMaxA─┬─argMaxB─┐ +│ ᴺᵁᴸᴸ │ 3 │ -- you can use Tuple and get both (all - tuple(*) ) columns for the according max(b) +└─────────┴─────────┘ + +select argMax(a, b), max(b) from test where a is Null and b is Null; +┌─argMax(a, b)─┬─max(b)─┐ +│ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ -- Nulls are not skipped because only Null values are available +└──────────────┴────────┘ + +select argMax(a, (b,a)) from test; +┌─argMax(a, tuple(b, a))─┐ +│ c │ -- There are two rows with b=2, Tuple in the `Max` allows to get not the first `arg` +└────────────────────────┘ + +select argMax(a, tuple(b)) from test; +┌─argMax(a, tuple(b))─┐ +│ b │ -- Tuple can be used `Max` to not skip Nulls in `Max` +└─────────────────────┘ +``` + +**See also** + +- [Tuple](../../sql-reference/data-types/tuple.md) From 6741a6d7c81ab9a17042170c182327295a6de356 Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Mon, 5 Jun 2023 16:21:09 -0300 Subject: [PATCH 0497/1072] Update argmax.md --- docs/en/sql-reference/aggregate-functions/reference/argmax.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/aggregate-functions/reference/argmax.md b/docs/en/sql-reference/aggregate-functions/reference/argmax.md index 9aaa35dc6d8..7800e90eec7 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/argmax.md +++ b/docs/en/sql-reference/aggregate-functions/reference/argmax.md @@ -106,4 +106,4 @@ select argMax(a, tuple(b)) from test; **See also** -- [Tuple](../../sql-reference/data-types/tuple.md) +- [Tuple](../../data-types/tuple.md) From 113ce8c7574f3e8348cf3ba7981e9a837460ff8b Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Mon, 5 Jun 2023 16:43:07 -0300 Subject: [PATCH 0498/1072] Update argmin.md --- .../aggregate-functions/reference/argmin.md | 63 +++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/docs/en/sql-reference/aggregate-functions/reference/argmin.md b/docs/en/sql-reference/aggregate-functions/reference/argmin.md index a7c21e3f15b..7972bdf84b8 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/argmin.md +++ b/docs/en/sql-reference/aggregate-functions/reference/argmin.md @@ -6,6 +6,7 @@ sidebar_position: 105 # argMin Calculates the `arg` value for a minimum `val` value. If there are several different values of `arg` for minimum values of `val`, returns the first of these values encountered. +Both parts the `arg` and the `min` behave as [aggregate functions](../aggregate-functions/index.md), they both [skip `Null`](../aggregate-functions/index.md#null-processing) during processing and return not-Null values if not-Null values are available. **Syntax** @@ -49,3 +50,65 @@ Result: │ worker │ └──────────────────────┘ ``` + +**Extended example** + +```sql +CREATE TABLE test +( + a Nullable(String), + b Nullable(Int64) +) +ENGINE = Memory AS +SELECT * +FROM values((NULL, 0), ('a', 1), ('b', 2), ('c', 2), (NULL, NULL), ('d', NULL)); + +select * from test; +┌─a────┬────b─┐ +│ ᴺᵁᴸᴸ │ 0 │ +│ a │ 1 │ +│ b │ 2 │ +│ c │ 2 │ +│ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ +│ d │ ᴺᵁᴸᴸ │ +└──────┴──────┘ + +select argMin(a, b), min(b) from test; +┌─argMin(a, b)─┬─min(b)─┐ +│ a │ 0 │ -- argMin = a because it the first not-Null value, min(b) is from another row! +└──────────────┴────────┘ + +select argMin(tuple(a), b) from test; +┌─argMin(tuple(a), b)─┐ +│ (NULL) │ -- Tuple allows to get Null value. +└─────────────────────┘ + +select (argMin((a, b), b) as t).1 argMinA, t.2 argMinB from test; +┌─argMinA─┬─argMinB─┐ +│ ᴺᵁᴸᴸ │ 0 │ -- you can use Tuple and get both (all - tuple(*) ) columns for the according max(b) +└─────────┴─────────┘ + +select argMin(a, b), min(b) from test where a is Null and b is Null; +┌─argMin(a, b)─┬─min(b)─┐ +│ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ -- Nulls are not skipped because only Null values are available +└──────────────┴────────┘ + +select argMin(a, (b, a)), min(tuple(b, a)) from test; +┌─argMin(a, tuple(b, a))─┬─min(tuple(b, a))─┐ +│ d │ (NULL,NULL) │ 'd' is the first Not null value for the min +└────────────────────────┴──────────────────┘ + +select argMin((a, b), (b, a)), min(tuple(b, a)) from test; +┌─argMin(tuple(a, b), tuple(b, a))─┬─min(tuple(b, a))─┐ +│ (NULL,NULL) │ (NULL,NULL) │ +└──────────────────────────────────┴──────────────────┘ + +select argMin(a, tuple(b)) from test; +┌─argMax(a, tuple(b))─┐ +│ b │ -- Tuple can be used in `Min` to not skip Nulls in `Min` +└─────────────────────┘ +``` + +**See also** + +- [Tuple](../../data-types/tuple.md) From 40986539775f5d74659521000fe4aa05ca47b06b Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Mon, 5 Jun 2023 16:43:47 -0300 Subject: [PATCH 0499/1072] Update argmin.md --- docs/en/sql-reference/aggregate-functions/reference/argmin.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/aggregate-functions/reference/argmin.md b/docs/en/sql-reference/aggregate-functions/reference/argmin.md index 7972bdf84b8..a481157784b 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/argmin.md +++ b/docs/en/sql-reference/aggregate-functions/reference/argmin.md @@ -6,7 +6,7 @@ sidebar_position: 105 # argMin Calculates the `arg` value for a minimum `val` value. If there are several different values of `arg` for minimum values of `val`, returns the first of these values encountered. -Both parts the `arg` and the `min` behave as [aggregate functions](../aggregate-functions/index.md), they both [skip `Null`](../aggregate-functions/index.md#null-processing) during processing and return not-Null values if not-Null values are available. +Both parts the `arg` and the `min` behave as [aggregate functions](../index.md), they both [skip `Null`](../index.md#null-processing) during processing and return not-Null values if not-Null values are available. **Syntax** From a175e4628e640376bc49ceabcebb278159e6d55f Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Mon, 5 Jun 2023 16:44:42 -0300 Subject: [PATCH 0500/1072] Update argmax.md --- docs/en/sql-reference/aggregate-functions/reference/argmax.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/en/sql-reference/aggregate-functions/reference/argmax.md b/docs/en/sql-reference/aggregate-functions/reference/argmax.md index 7800e90eec7..a736804c8dc 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/argmax.md +++ b/docs/en/sql-reference/aggregate-functions/reference/argmax.md @@ -6,7 +6,7 @@ sidebar_position: 106 # argMax Calculates the `arg` value for a maximum `val` value. If there are several different values of `arg` for maximum values of `val`, returns the first of these values encountered. -Both parts the `arg` and the `max` behave as aggregate functions, they skip `Null` during processing and return not-Null values if not-Null values are available. +Both parts the `arg` and the `max` behave as [aggregate functions](../index.md), they both [skip `Null`](../index.md#null-processing) during processing and return not-Null values if not-Null values are available. **Syntax** @@ -100,7 +100,7 @@ select argMax(a, (b,a)) from test; select argMax(a, tuple(b)) from test; ┌─argMax(a, tuple(b))─┐ -│ b │ -- Tuple can be used `Max` to not skip Nulls in `Max` +│ b │ -- Tuple can be used in `Max` to not skip Nulls in `Max` └─────────────────────┘ ``` From 3e444790af079c3d486ac943b29d4900f4d0576f Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Mon, 5 Jun 2023 16:52:51 -0300 Subject: [PATCH 0501/1072] Update index.md --- docs/en/sql-reference/aggregate-functions/index.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/docs/en/sql-reference/aggregate-functions/index.md b/docs/en/sql-reference/aggregate-functions/index.md index 8951ac4ee6a..019e1cab873 100644 --- a/docs/en/sql-reference/aggregate-functions/index.md +++ b/docs/en/sql-reference/aggregate-functions/index.md @@ -72,3 +72,15 @@ FROM t_null_big │ 2.3333333333333335 │ 1.4 │ └────────────────────┴─────────────────────┘ ``` + +Also you can use [Tuple](../data-types/tuple.md) to change NULL skipping behavior. + +```sql +select groupArray(b), groupArray(tuple(b)) from t_null_big; +┌─groupArray(b)─┬─groupArray(tuple(b))────────┐ +│ [2,2,3] │ [(2),(NULL),(2),(3),(NULL)] │ +└───────────────┴─────────────────────────────┘ +``` + + + From b4c0d68d0f7a17702d074a9a2f216bbda524c94d Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Mon, 5 Jun 2023 16:53:37 -0300 Subject: [PATCH 0502/1072] Update index.md --- docs/en/sql-reference/aggregate-functions/index.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/docs/en/sql-reference/aggregate-functions/index.md b/docs/en/sql-reference/aggregate-functions/index.md index 019e1cab873..a3808335168 100644 --- a/docs/en/sql-reference/aggregate-functions/index.md +++ b/docs/en/sql-reference/aggregate-functions/index.md @@ -76,10 +76,11 @@ FROM t_null_big Also you can use [Tuple](../data-types/tuple.md) to change NULL skipping behavior. ```sql -select groupArray(b), groupArray(tuple(b)) from t_null_big; -┌─groupArray(b)─┬─groupArray(tuple(b))────────┐ -│ [2,2,3] │ [(2),(NULL),(2),(3),(NULL)] │ -└───────────────┴─────────────────────────────┘ +select groupArray(b), groupArray(tuple(b)).1 from t_null_big; + +┌─groupArray(b)─┬─tupleElement(groupArray(tuple(b)), 1)─┐ +│ [2,2,3] │ [2,NULL,2,3,NULL] │ +└───────────────┴───────────────────────────────────────┘ ``` From e37cd36db7316c5d2a90df8ca8d8bee8fa016e4a Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Mon, 5 Jun 2023 16:57:28 -0300 Subject: [PATCH 0503/1072] Update argmin.md --- docs/en/sql-reference/aggregate-functions/reference/argmin.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/en/sql-reference/aggregate-functions/reference/argmin.md b/docs/en/sql-reference/aggregate-functions/reference/argmin.md index a481157784b..067c81f56cf 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/argmin.md +++ b/docs/en/sql-reference/aggregate-functions/reference/argmin.md @@ -85,7 +85,7 @@ select argMin(tuple(a), b) from test; select (argMin((a, b), b) as t).1 argMinA, t.2 argMinB from test; ┌─argMinA─┬─argMinB─┐ -│ ᴺᵁᴸᴸ │ 0 │ -- you can use Tuple and get both (all - tuple(*) ) columns for the according max(b) +│ ᴺᵁᴸᴸ │ 0 │ -- you can use Tuple and get both (all - tuple(*)) columns for the according max(b) └─────────┴─────────┘ select argMin(a, b), min(b) from test where a is Null and b is Null; @@ -95,7 +95,7 @@ select argMin(a, b), min(b) from test where a is Null and b is Null; select argMin(a, (b, a)), min(tuple(b, a)) from test; ┌─argMin(a, tuple(b, a))─┬─min(tuple(b, a))─┐ -│ d │ (NULL,NULL) │ 'd' is the first Not null value for the min +│ d │ (NULL,NULL) │ -- 'd' is the first Not null value for the min └────────────────────────┴──────────────────┘ select argMin((a, b), (b, a)), min(tuple(b, a)) from test; From 15fcad190933aa7b885bdeacb9b2f277104433ac Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Mon, 5 Jun 2023 16:57:42 -0300 Subject: [PATCH 0504/1072] Update argmax.md --- docs/en/sql-reference/aggregate-functions/reference/argmax.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/aggregate-functions/reference/argmax.md b/docs/en/sql-reference/aggregate-functions/reference/argmax.md index a736804c8dc..8a84f361589 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/argmax.md +++ b/docs/en/sql-reference/aggregate-functions/reference/argmax.md @@ -85,7 +85,7 @@ select argMax(tuple(a), b) from test; select (argMax((a, b), b) as t).1 argMaxA, t.2 argMaxB from test; ┌─argMaxA─┬─argMaxB─┐ -│ ᴺᵁᴸᴸ │ 3 │ -- you can use Tuple and get both (all - tuple(*) ) columns for the according max(b) +│ ᴺᵁᴸᴸ │ 3 │ -- you can use Tuple and get both (all - tuple(*)) columns for the according max(b) └─────────┴─────────┘ select argMax(a, b), max(b) from test where a is Null and b is Null; From 3023eb73d02fe7825626a1b5767827432f1513c1 Mon Sep 17 00:00:00 2001 From: Misz606 <113922942+Misz606@users.noreply.github.com> Date: Mon, 5 Jun 2023 21:02:27 +0100 Subject: [PATCH 0505/1072] Update aggregatingmergetree.md Grammatical update in docs --- .../table-engines/mergetree-family/aggregatingmergetree.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/engines/table-engines/mergetree-family/aggregatingmergetree.md b/docs/en/engines/table-engines/mergetree-family/aggregatingmergetree.md index 2b8b43802ea..62191d9b5e4 100644 --- a/docs/en/engines/table-engines/mergetree-family/aggregatingmergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/aggregatingmergetree.md @@ -109,7 +109,7 @@ INSERT INTO test.visits (StartDate, CounterID, Sign, UserID) VALUES (1667446031, 1, 6, 3) ``` -The data are inserted in both the table and the materialized view `test.mv_visits`. +The data is inserted in both the table and the materialized view `test.mv_visits`. To get the aggregated data, we need to execute a query such as `SELECT ... GROUP BY ...` from the materialized view `test.mv_visits`: From fd39616e780ac216fd07aa2625119a65c85661ee Mon Sep 17 00:00:00 2001 From: Han Fei Date: Mon, 5 Jun 2023 22:09:32 +0200 Subject: [PATCH 0506/1072] suppress some tests for analyzer --- tests/queries/0_stateless/01479_cross_join_9855.sql | 4 ++-- tests/queries/0_stateless/01763_filter_push_down_bugs.sql | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/queries/0_stateless/01479_cross_join_9855.sql b/tests/queries/0_stateless/01479_cross_join_9855.sql index 6dc76f22057..9dcf209a1cd 100644 --- a/tests/queries/0_stateless/01479_cross_join_9855.sql +++ b/tests/queries/0_stateless/01479_cross_join_9855.sql @@ -2,8 +2,8 @@ SET cross_to_inner_join_rewrite = 1; SELECT count() FROM numbers(4) AS n1, numbers(3) AS n2 -WHERE n1.number > (select avg(n.number) from numbers(3) n); +WHERE n1.number > (select avg(n.number) from numbers(3) n) SETTINGS allow_experimental_analyzer=0; SELECT count() FROM numbers(4) AS n1, numbers(3) AS n2, numbers(6) AS n3 -WHERE n1.number > (select avg(n.number) from numbers(3) n); +WHERE n1.number > (select avg(n.number) from numbers(3) n) SETTINGS allow_experimental_analyzer=0; diff --git a/tests/queries/0_stateless/01763_filter_push_down_bugs.sql b/tests/queries/0_stateless/01763_filter_push_down_bugs.sql index 2ee249b5ce7..5f7f4379714 100644 --- a/tests/queries/0_stateless/01763_filter_push_down_bugs.sql +++ b/tests/queries/0_stateless/01763_filter_push_down_bugs.sql @@ -56,7 +56,7 @@ EXPLAIN indexes=1 SELECT id, delete_time FROM t1 CROSS JOIN ( SELECT delete_time FROM t2 -) AS d WHERE create_time < delete_time AND id = 101; +) AS d WHERE create_time < delete_time AND id = 101 SETTINGS allow_experimental_analyzer=0; DROP TABLE IF EXISTS t1; DROP TABLE IF EXISTS t2; From aa6f4e43c5ea0fa4d6c84f121eba766d89a6efdf Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Mon, 5 Jun 2023 23:15:13 +0200 Subject: [PATCH 0507/1072] Fixed COLUMN_NOT_FOUND in block issue --- src/Storages/StorageAzure.cpp | 35 ++++++++++++++--------- src/Storages/StorageAzure.h | 3 +- src/Storages/StorageS3.cpp | 3 ++ src/TableFunctions/TableFunctionAzure.cpp | 5 ++-- src/TableFunctions/TableFunctionS3.cpp | 5 ++++ 5 files changed, 34 insertions(+), 17 deletions(-) diff --git a/src/Storages/StorageAzure.cpp b/src/Storages/StorageAzure.cpp index 683da3a9825..76e9130bda3 100644 --- a/src/Storages/StorageAzure.cpp +++ b/src/Storages/StorageAzure.cpp @@ -476,7 +476,7 @@ private: Pipe StorageAzure::read( - const Names & /*column_names*/ , + const Names & column_names , const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & /*query_info*/, ContextPtr context, @@ -491,13 +491,11 @@ Pipe StorageAzure::read( objects.emplace_back(key); auto reader = object_storage->readObjects(objects); - auto block_for_format = storage_snapshot->metadata->getSampleBlock(); - - for (auto col : block_for_format.getColumns()) - LOG_INFO(&Poco::Logger::get("StorageAzure"), "read col = {}",col->getName()); + auto columns_description = storage_snapshot->getDescriptionForColumns(column_names); + auto block_for_format = storage_snapshot->getSampleBlockForColumns(columns_description.getNamesOfPhysical()); - pipes.emplace_back(std::make_shared(std::move(reader), context, block_for_format, max_block_size)); + pipes.emplace_back(std::make_shared(std::move(reader), context, block_for_format, max_block_size, columns_description)); return Pipe::unitePipes(std::move(pipes)); @@ -583,12 +581,13 @@ bool StorageAzure::supportsPartitionBy() const StorageAzureSource::StorageAzureSource (std::unique_ptr && read_buffer_, ContextPtr context_, - const Block & sample_block_,UInt64 max_block_size_) - :ISource(Block()) + const Block & sample_block_,UInt64 max_block_size_, const ColumnsDescription & columns_) + :ISource(sample_block_) , WithContext(context_) , read_buffer(std::move(read_buffer_)) , sample_block(sample_block_) , max_block_size(max_block_size_) + , columns_desc(columns_) { auto format = "TSV"; @@ -598,6 +597,13 @@ StorageAzureSource::StorageAzureSource (std::unique_ptr QueryPipelineBuilder builder; builder.init(Pipe(input_format)); + if (columns_desc.hasDefaults()) + { + builder.addSimpleTransform( + [&](const Block & header) + { return std::make_shared(header, columns_desc, *input_format, getContext()); }); + } + pipeline = std::make_unique(QueryPipelineBuilder::getPipeline(std::move(builder))); reader = std::make_unique(*pipeline); } @@ -605,13 +611,16 @@ StorageAzureSource::StorageAzureSource (std::unique_ptr Chunk StorageAzureSource::generate() { - Chunk chunk; - if (reader->pull(chunk)) + while(true) { - LOG_INFO(&Poco::Logger::get("StorageAzureSource"), "pulled chunk rows = {}", chunk.getNumRows()); - + Chunk chunk; + if (reader->pull(chunk)) + { + LOG_INFO(&Poco::Logger::get("StorageAzureSource"), "pulled chunk rows = {}", chunk.getNumRows()); + } + return chunk; } - return chunk; +// return {}; } String StorageAzureSource::getName() const diff --git a/src/Storages/StorageAzure.h b/src/Storages/StorageAzure.h index b93501ce2f2..61237fcc2f9 100644 --- a/src/Storages/StorageAzure.h +++ b/src/Storages/StorageAzure.h @@ -128,7 +128,7 @@ private: class StorageAzureSource : public ISource, WithContext { public: - StorageAzureSource (std::unique_ptr && read_buffer_, ContextPtr context_, const Block & sample_block_, UInt64 max_block_size_); + StorageAzureSource (std::unique_ptr && read_buffer_, ContextPtr context_, const Block & sample_block_, UInt64 max_block_size_, const ColumnsDescription & columns_); ~StorageAzureSource() override {} Chunk generate() override; @@ -145,6 +145,7 @@ private: std::unique_ptr reader; Block sample_block; UInt64 max_block_size; + ColumnsDescription columns_desc; // void createReader(); }; diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index 2d8aaec0f07..7d6254b2551 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -957,6 +957,9 @@ StorageS3::StorageS3( {"_file", std::make_shared(std::make_shared())}}; auto columns = storage_metadata.getSampleBlock().getNamesAndTypesList(); + + LOG_INFO(&Poco::Logger::get("StorageS3"), "constructor columns = {}", columns.toString()); + virtual_columns = getVirtualsForStorage(columns, default_virtuals); for (const auto & column : virtual_columns) virtual_block.insert({column.type->createColumn(), column.type, column.name}); diff --git a/src/TableFunctions/TableFunctionAzure.cpp b/src/TableFunctions/TableFunctionAzure.cpp index eb67ed9a983..f565a365a13 100644 --- a/src/TableFunctions/TableFunctionAzure.cpp +++ b/src/TableFunctions/TableFunctionAzure.cpp @@ -54,7 +54,8 @@ void TableFunctionAzure::parseArgumentsImpl(ASTs & args, const ContextPtr & cont void TableFunctionAzure::parseArguments(const ASTPtr & ast_function, ContextPtr context) { - LOG_INFO(&Poco::Logger::get("TableFunctionAzure"), "parseArguments = {}", ast_function->dumpTree()); + /// Clone ast function, because we can modify its arguments like removing headers. + auto ast_copy = ast_function->clone(); ASTs & args_func = ast_function->children; @@ -78,8 +79,6 @@ bool TableFunctionAzure::supportsReadingSubsetOfColumns() StoragePtr TableFunctionAzure::executeImpl(const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/) const { - LOG_INFO(&Poco::Logger::get("TableFunctionAzure"), "executeImpl = {}", table_name); - ColumnsDescription columns; columns = parseColumnsListFromString(configuration.structure, context); diff --git a/src/TableFunctions/TableFunctionS3.cpp b/src/TableFunctions/TableFunctionS3.cpp index c8cc0cddd30..7f283afd6b4 100644 --- a/src/TableFunctions/TableFunctionS3.cpp +++ b/src/TableFunctions/TableFunctionS3.cpp @@ -294,6 +294,8 @@ void TableFunctionS3::addColumnsStructureToArguments(ASTs & args, const String & ColumnsDescription TableFunctionS3::getActualTableStructure(ContextPtr context) const { + LOG_INFO(&Poco::Logger::get("TableFunctionS3"), "getActualTableStructure configuration.structure = {} ",configuration.structure); + if (configuration.structure == "auto") { context->checkAccess(getSourceAccessType()); @@ -319,6 +321,9 @@ StoragePtr TableFunctionS3::executeImpl(const ASTPtr & /*ast_function*/, Context else if (!structure_hint.empty()) columns = structure_hint; + LOG_INFO(&Poco::Logger::get("TableFunctionS3"), "executeImpl structre = {} structure_hint = {} ",configuration.structure, structure_hint.getAll().toString()); + + StoragePtr storage = std::make_shared( configuration, context, From bd6b0ff1c005aaa8f976103762e6848e3b3448d8 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Mon, 5 Jun 2023 23:32:26 +0200 Subject: [PATCH 0508/1072] Updated to read only 1st object --- src/Storages/StorageAzure.cpp | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/src/Storages/StorageAzure.cpp b/src/Storages/StorageAzure.cpp index 76e9130bda3..c8d13cbf242 100644 --- a/src/Storages/StorageAzure.cpp +++ b/src/Storages/StorageAzure.cpp @@ -490,13 +490,16 @@ Pipe StorageAzure::read( for (const auto & key : configuration.blobs_paths) objects.emplace_back(key); - auto reader = object_storage->readObjects(objects); - auto columns_description = storage_snapshot->getDescriptionForColumns(column_names); - auto block_for_format = storage_snapshot->getSampleBlockForColumns(columns_description.getNamesOfPhysical()); + if (objects.size() > 1) + { + auto reader = object_storage->readObject(objects[0]); + auto columns_description = storage_snapshot->getDescriptionForColumns(column_names); + auto block_for_format = storage_snapshot->getSampleBlockForColumns(columns_description.getNamesOfPhysical()); - pipes.emplace_back(std::make_shared(std::move(reader), context, block_for_format, max_block_size, columns_description)); - + pipes.emplace_back( + std::make_shared(std::move(reader), context, block_for_format, max_block_size, columns_description)); + } return Pipe::unitePipes(std::move(pipes)); } @@ -592,7 +595,9 @@ StorageAzureSource::StorageAzureSource (std::unique_ptr auto format = "TSV"; auto input_format = FormatFactory::instance().getInput( - format, *read_buffer, sample_block, getContext(), max_block_size); + format, *read_buffer, sample_block, getContext(), max_block_size, + FormatSettings(), std::nullopt, std::nullopt, + true); QueryPipelineBuilder builder; builder.init(Pipe(input_format)); @@ -611,6 +616,7 @@ StorageAzureSource::StorageAzureSource (std::unique_ptr Chunk StorageAzureSource::generate() { + LOG_INFO(&Poco::Logger::get("StorageAzureSource"), "generate"); while(true) { Chunk chunk; From 495482cdb2b6a6a2d272c50bb3995b0409f7fb91 Mon Sep 17 00:00:00 2001 From: tpanetti Date: Mon, 5 Jun 2023 15:22:29 -0700 Subject: [PATCH 0509/1072] Refactor ClickHouse->MySQL Type conversion and add configuration setting to trigger type conversion --- src/Core/Settings.h | 1 + src/DataTypes/DataTypeAggregateFunction.h | 2 +- src/DataTypes/DataTypeArray.h | 2 +- src/DataTypes/DataTypeDate.h | 2 +- src/DataTypes/DataTypeDate32.h | 2 +- src/DataTypes/DataTypeDateTime.h | 2 +- src/DataTypes/DataTypeDateTime64.h | 2 +- src/DataTypes/DataTypeEnum.cpp | 1 - src/DataTypes/DataTypeEnum.h | 3 +- src/DataTypes/DataTypeFixedString.h | 3 +- src/DataTypes/DataTypeFunction.h | 2 +- src/DataTypes/DataTypeIPv4andIPv6.h | 4 +- src/DataTypes/DataTypeInterval.h | 2 +- src/DataTypes/DataTypeLowCardinality.cpp | 3 +- src/DataTypes/DataTypeLowCardinality.h | 3 +- src/DataTypes/DataTypeMap.h | 2 +- src/DataTypes/DataTypeNothing.h | 2 +- src/DataTypes/DataTypeNullable.h | 2 +- src/DataTypes/DataTypeNumberBase.cpp | 67 +++++-- src/DataTypes/DataTypeNumberBase.h | 4 +- src/DataTypes/DataTypeObject.h | 2 +- src/DataTypes/DataTypeSet.h | 2 +- src/DataTypes/DataTypeString.h | 3 +- src/DataTypes/DataTypeTuple.h | 2 +- src/DataTypes/DataTypeUUID.h | 2 +- src/DataTypes/DataTypesDecimal.cpp | 5 + src/DataTypes/DataTypesDecimal.h | 3 +- src/DataTypes/IDataType.h | 12 +- src/Storages/System/StorageSystemColumns.cpp | 11 +- ...show_columns_mysql_compatibility.reference | 187 +++++++++++++++--- .../02775_show_columns_mysql_compatibility.sh | 31 ++- 31 files changed, 278 insertions(+), 93 deletions(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 67c92a0be8b..1ce30ff121f 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -190,6 +190,7 @@ class IColumn; M(Bool, allow_experimental_inverted_index, false, "If it is set to true, allow to use experimental inverted index.", 0) \ \ M(UInt64, mysql_max_rows_to_insert, 65536, "The maximum number of rows in MySQL batch insertion of the MySQL storage engine", 0) \ + M(Bool, output_format_mysql_types, false, "Use MySQL converted types when connected via MySQL compatibility", 0) \ \ M(UInt64, optimize_min_equality_disjunction_chain_length, 3, "The minimum length of the expression `expr = x1 OR ... expr = xN` for optimization ", 0) \ \ diff --git a/src/DataTypes/DataTypeAggregateFunction.h b/src/DataTypes/DataTypeAggregateFunction.h index 13ca3508580..83c9f10f407 100644 --- a/src/DataTypes/DataTypeAggregateFunction.h +++ b/src/DataTypes/DataTypeAggregateFunction.h @@ -45,7 +45,7 @@ public: String doGetName() const override; String getNameWithoutVersion() const; const char * getFamilyName() const override { return "AggregateFunction"; } - const char * getSQLCompatibleName() const override { return "TEXT"; } + String getSQLCompatibleName() const override { return "TEXT"; } TypeIndex getTypeId() const override { return TypeIndex::AggregateFunction; } Array getParameters() const { return parameters; } diff --git a/src/DataTypes/DataTypeArray.h b/src/DataTypes/DataTypeArray.h index 528062b60be..2714ca1d023 100644 --- a/src/DataTypes/DataTypeArray.h +++ b/src/DataTypes/DataTypeArray.h @@ -30,7 +30,7 @@ public: { return "Array"; } - const char * getSQLCompatibleName() const override + String getSQLCompatibleName() const override { return "TEXT"; } diff --git a/src/DataTypes/DataTypeDate.h b/src/DataTypes/DataTypeDate.h index 7b622ae04a3..0d557cad5f0 100644 --- a/src/DataTypes/DataTypeDate.h +++ b/src/DataTypes/DataTypeDate.h @@ -13,7 +13,7 @@ public: TypeIndex getTypeId() const override { return TypeIndex::Date; } const char * getFamilyName() const override { return family_name; } - const char * getSQLCompatibleName() const override { return "DATE"; } + String getSQLCompatibleName() const override { return "DATE"; } bool canBeUsedAsVersion() const override { return true; } bool canBeInsideNullable() const override { return true; } diff --git a/src/DataTypes/DataTypeDate32.h b/src/DataTypes/DataTypeDate32.h index 65b0ec7407e..0879a404179 100644 --- a/src/DataTypes/DataTypeDate32.h +++ b/src/DataTypes/DataTypeDate32.h @@ -13,7 +13,7 @@ public: TypeIndex getTypeId() const override { return TypeIndex::Date32; } const char * getFamilyName() const override { return family_name; } - const char * getSQLCompatibleName() const override { return "DATE"; } + String getSQLCompatibleName() const override { return "DATE"; } Field getDefault() const override { diff --git a/src/DataTypes/DataTypeDateTime.h b/src/DataTypes/DataTypeDateTime.h index 2facc758f90..edc8b016490 100644 --- a/src/DataTypes/DataTypeDateTime.h +++ b/src/DataTypes/DataTypeDateTime.h @@ -36,7 +36,7 @@ public: static constexpr auto family_name = "DateTime"; const char * getFamilyName() const override { return family_name; } - const char * getSQLCompatibleName() const override { return "DATETIME"; } + String getSQLCompatibleName() const override { return "DATETIME"; } String doGetName() const override; TypeIndex getTypeId() const override { return TypeIndex::DateTime; } diff --git a/src/DataTypes/DataTypeDateTime64.h b/src/DataTypes/DataTypeDateTime64.h index b836b84918f..e786cc09f28 100644 --- a/src/DataTypes/DataTypeDateTime64.h +++ b/src/DataTypes/DataTypeDateTime64.h @@ -28,7 +28,7 @@ public: DataTypeDateTime64(UInt32 scale_, const TimezoneMixin & time_zone_info); const char * getFamilyName() const override { return family_name; } - const char * getSQLCompatibleName() const override { return "DATETIME"; } + String getSQLCompatibleName() const override { return "DATETIME"; } std::string doGetName() const override; TypeIndex getTypeId() const override { return type_id; } diff --git a/src/DataTypes/DataTypeEnum.cpp b/src/DataTypes/DataTypeEnum.cpp index 24a3976179d..1750ae785bf 100644 --- a/src/DataTypes/DataTypeEnum.cpp +++ b/src/DataTypes/DataTypeEnum.cpp @@ -90,7 +90,6 @@ template DataTypeEnum::DataTypeEnum(const Values & values_) : EnumValues(values_) , type_name(generateName(this->getValues())) - , my_sql_type_name(generateMySQLName(this->getValues())) { } diff --git a/src/DataTypes/DataTypeEnum.h b/src/DataTypes/DataTypeEnum.h index 2cdaa2db06c..d148f753c82 100644 --- a/src/DataTypes/DataTypeEnum.h +++ b/src/DataTypes/DataTypeEnum.h @@ -45,7 +45,6 @@ public: private: std::string type_name; - std::string my_sql_type_name; static std::string generateName(const Values & values); static std::string generateMySQLName(const Values & values); @@ -54,7 +53,7 @@ public: std::string doGetName() const override { return type_name; } const char * getFamilyName() const override; - const char * getSQLCompatibleName() const override { return my_sql_type_name.c_str(); } + String getSQLCompatibleName() const override { return generateMySQLName(this->getValues()); } TypeIndex getTypeId() const override { return type_id; } diff --git a/src/DataTypes/DataTypeFixedString.h b/src/DataTypes/DataTypeFixedString.h index 2900efd5a34..22ec793208d 100644 --- a/src/DataTypes/DataTypeFixedString.h +++ b/src/DataTypes/DataTypeFixedString.h @@ -42,7 +42,8 @@ public: TypeIndex getTypeId() const override { return type_id; } const char * getFamilyName() const override { return "FixedString"; } - const char * getSQLCompatibleName() const override { return "TEXT"; } + /// Use TEXT for compatibility with MySQL to allow arbitrary bytes. + String getSQLCompatibleName() const override { return "TEXT"; } size_t getN() const { diff --git a/src/DataTypes/DataTypeFunction.h b/src/DataTypes/DataTypeFunction.h index df59f7738b2..b57c0587dde 100644 --- a/src/DataTypes/DataTypeFunction.h +++ b/src/DataTypes/DataTypeFunction.h @@ -24,7 +24,7 @@ public: std::string doGetName() const override; const char * getFamilyName() const override { return "Function"; } - const char * getSQLCompatibleName() const override { return "TEXT"; } + String getSQLCompatibleName() const override { return "TEXT"; } TypeIndex getTypeId() const override { return TypeIndex::Function; } const DataTypes & getArgumentTypes() const diff --git a/src/DataTypes/DataTypeIPv4andIPv6.h b/src/DataTypes/DataTypeIPv4andIPv6.h index be0ebb90f3c..487ce04f67c 100644 --- a/src/DataTypes/DataTypeIPv4andIPv6.h +++ b/src/DataTypes/DataTypeIPv4andIPv6.h @@ -19,7 +19,7 @@ public: static constexpr auto type_id = TypeToTypeIndex; const char * getFamilyName() const override { return TypeName.data(); } - const char * getSQLCompatibleName() const override { return "TEXT"; } + String getSQLCompatibleName() const override { return "TEXT"; } TypeIndex getTypeId() const override { return type_id; } @@ -61,7 +61,7 @@ public: static constexpr auto type_id = TypeToTypeIndex; const char * getFamilyName() const override { return TypeName.data(); } - const char * getSQLCompatibleName() const override { return "TEXT"; } + String getSQLCompatibleName() const override { return "TEXT"; } TypeIndex getTypeId() const override { return type_id; } diff --git a/src/DataTypes/DataTypeInterval.h b/src/DataTypes/DataTypeInterval.h index ee2157431dd..7de56c13b56 100644 --- a/src/DataTypes/DataTypeInterval.h +++ b/src/DataTypes/DataTypeInterval.h @@ -26,7 +26,7 @@ public: std::string doGetName() const override { return fmt::format("Interval{}", kind.toString()); } const char * getFamilyName() const override { return "Interval"; } - const char * getSQLCompatibleName() const override { return "TEXT"; } + String getSQLCompatibleName() const override { return "TEXT"; } TypeIndex getTypeId() const override { return TypeIndex::Interval; } bool equals(const IDataType & rhs) const override; diff --git a/src/DataTypes/DataTypeLowCardinality.cpp b/src/DataTypes/DataTypeLowCardinality.cpp index e59613e6974..8293455cabc 100644 --- a/src/DataTypes/DataTypeLowCardinality.cpp +++ b/src/DataTypes/DataTypeLowCardinality.cpp @@ -28,8 +28,7 @@ namespace ErrorCodes } DataTypeLowCardinality::DataTypeLowCardinality(DataTypePtr dictionary_type_) - : dictionary_type(std::move(dictionary_type_)), - mysql_name(dictionary_type->getSQLCompatibleName()) + : dictionary_type(std::move(dictionary_type_)) { auto inner_type = dictionary_type; if (dictionary_type->isNullable()) diff --git a/src/DataTypes/DataTypeLowCardinality.h b/src/DataTypes/DataTypeLowCardinality.h index 4dee8565568..f6d8d07a312 100644 --- a/src/DataTypes/DataTypeLowCardinality.h +++ b/src/DataTypes/DataTypeLowCardinality.h @@ -11,7 +11,6 @@ class DataTypeLowCardinality : public IDataType { private: DataTypePtr dictionary_type; - std::string mysql_name; public: @@ -24,7 +23,7 @@ public: return "LowCardinality(" + dictionary_type->getName() + ")"; } const char * getFamilyName() const override { return "LowCardinality"; } - const char * getSQLCompatibleName() const override { return mysql_name.c_str(); } + String getSQLCompatibleName() const override { return dictionary_type->getSQLCompatibleName(); } TypeIndex getTypeId() const override { return TypeIndex::LowCardinality; } diff --git a/src/DataTypes/DataTypeMap.h b/src/DataTypes/DataTypeMap.h index 299119f1759..294c5d7ac77 100644 --- a/src/DataTypes/DataTypeMap.h +++ b/src/DataTypes/DataTypeMap.h @@ -30,7 +30,7 @@ public: TypeIndex getTypeId() const override { return TypeIndex::Map; } std::string doGetName() const override; const char * getFamilyName() const override { return "Map"; } - const char * getSQLCompatibleName() const override { return "JSON"; } + String getSQLCompatibleName() const override { return "JSON"; } bool canBeInsideNullable() const override { return false; } diff --git a/src/DataTypes/DataTypeNothing.h b/src/DataTypes/DataTypeNothing.h index b35ced5dcb3..c3a7e2d09f0 100644 --- a/src/DataTypes/DataTypeNothing.h +++ b/src/DataTypes/DataTypeNothing.h @@ -16,7 +16,7 @@ public: static constexpr bool is_parametric = false; const char * getFamilyName() const override { return "Nothing"; } - const char * getSQLCompatibleName() const override { return "TEXT"; } + String getSQLCompatibleName() const override { return "TEXT"; } TypeIndex getTypeId() const override { return TypeIndex::Nothing; } diff --git a/src/DataTypes/DataTypeNullable.h b/src/DataTypes/DataTypeNullable.h index b5fe1bb2dd9..e3165414c07 100644 --- a/src/DataTypes/DataTypeNullable.h +++ b/src/DataTypes/DataTypeNullable.h @@ -16,7 +16,7 @@ public: explicit DataTypeNullable(const DataTypePtr & nested_data_type_); std::string doGetName() const override { return "Nullable(" + nested_data_type->getName() + ")"; } const char * getFamilyName() const override { return "Nullable"; } - const char * getSQLCompatibleName() const override { return nested_data_type->getSQLCompatibleName(); } + String getSQLCompatibleName() const override { return nested_data_type->getSQLCompatibleName(); } TypeIndex getTypeId() const override { return TypeIndex::Nullable; } MutableColumnPtr createColumn() const override; diff --git a/src/DataTypes/DataTypeNumberBase.cpp b/src/DataTypes/DataTypeNumberBase.cpp index db654448e83..e4c0fb96483 100644 --- a/src/DataTypes/DataTypeNumberBase.cpp +++ b/src/DataTypes/DataTypeNumberBase.cpp @@ -11,6 +11,55 @@ Field DataTypeNumberBase::getDefault() const { return NearestFieldType(); } +template +String DataTypeNumberBase::getSQLCompatibleName() const +{ + if constexpr (std::is_same_v) + { + return "TINYINT"; + } + else if constexpr (std::is_same_v) + { + return "SMALLINT"; + } + else if constexpr (std::is_same_v) + { + return "INTEGER"; + } + else if constexpr (std::is_same_v) + { + return "BIGINT"; + } + else if constexpr (std::is_same_v) + { + return "TINYINT UNSIGNED"; + } + else if constexpr (std::is_same_v) + { + return "SMALLINT UNSIGNED"; + } + else if constexpr (std::is_same_v) + { + return "INTEGER UNSIGNED"; + } + else if constexpr (std::is_same_v) + { + return "BIGINT UNSIGNED"; + } + else if constexpr (std::is_same_v) + { + return "FLOAT"; + } + else if constexpr (std::is_same_v) + { + return "DOUBLE"; + } + /// Unsupported types are converted to TEXT + else + { + return "TEXT"; + } +} template MutableColumnPtr DataTypeNumberBase::createColumn() const @@ -30,24 +79,6 @@ bool DataTypeNumberBase::isValueRepresentedByUnsignedInteger() const return is_integer && is_unsigned_v; } -template -const std::map DataTypeNumberBase::mysqlTypeMap = { - {"UInt8", "TINYINT UNSIGNED"}, - {"UInt16", "SMALLINT UNSIGNED"}, - {"UInt32", "MEDIUMINT UNSIGNEd"}, - {"UInt64", "BIGINT UNSIGNED"}, - {"UInt128", "TEXT"}, - {"UInt256", "TEXT"}, - {"Int8", "TINYINT"}, - {"Int16", "SMALLINT"}, - {"Int32", "INT"}, - {"Int64", "BIGINT"}, - {"Int128", "TEXT"}, - {"Int256", "TEXT"}, - {"Float32", "FLOAT"}, - {"Float64", "DOUBLE"}, -}; - /// Explicit template instantiations - to avoid code bloat in headers. template class DataTypeNumberBase; template class DataTypeNumberBase; diff --git a/src/DataTypes/DataTypeNumberBase.h b/src/DataTypes/DataTypeNumberBase.h index 1a855a974f0..d902c62505e 100644 --- a/src/DataTypes/DataTypeNumberBase.h +++ b/src/DataTypes/DataTypeNumberBase.h @@ -20,14 +20,12 @@ public: static constexpr bool is_parametric = false; static constexpr auto family_name = TypeName; static constexpr auto type_id = TypeToTypeIndex; - // Create a map from the name of the type to the name of the type in MySQL. - static const std::map mysqlTypeMap; using FieldType = T; using ColumnType = ColumnVector; const char * getFamilyName() const override { return TypeName.data(); } - const char * getSQLCompatibleName() const override { return mysqlTypeMap.at(TypeName.data()).c_str(); } + String getSQLCompatibleName() const override; TypeIndex getTypeId() const override { return TypeToTypeIndex; } Field getDefault() const override; diff --git a/src/DataTypes/DataTypeObject.h b/src/DataTypes/DataTypeObject.h index 618c7389758..2e1e5398f7e 100644 --- a/src/DataTypes/DataTypeObject.h +++ b/src/DataTypes/DataTypeObject.h @@ -23,7 +23,7 @@ public: DataTypeObject(const String & schema_format_, bool is_nullable_); const char * getFamilyName() const override { return "Object"; } - const char * getSQLCompatibleName() const override { return "JSON"; } + String getSQLCompatibleName() const override { return "JSON"; } String doGetName() const override; TypeIndex getTypeId() const override { return TypeIndex::Object; } diff --git a/src/DataTypes/DataTypeSet.h b/src/DataTypes/DataTypeSet.h index 916b4f071a5..d88d76b31be 100644 --- a/src/DataTypes/DataTypeSet.h +++ b/src/DataTypes/DataTypeSet.h @@ -15,7 +15,7 @@ class DataTypeSet final : public IDataTypeDummy public: static constexpr bool is_parametric = true; const char * getFamilyName() const override { return "Set"; } - const char * getSQLCompatibleName() const override { return "TEXT"; } + String getSQLCompatibleName() const override { return "TEXT"; } TypeIndex getTypeId() const override { return TypeIndex::Set; } bool equals(const IDataType & rhs) const override { return typeid(rhs) == typeid(*this); } diff --git a/src/DataTypes/DataTypeString.h b/src/DataTypes/DataTypeString.h index 338b3846266..c39fa90f6e7 100644 --- a/src/DataTypes/DataTypeString.h +++ b/src/DataTypes/DataTypeString.h @@ -21,8 +21,7 @@ public: return "String"; } - // FIXME: string can contain arbitrary bytes, not only UTF-8 sequences - const char * getSQLCompatibleName() const override { return "BLOB"; } + String getSQLCompatibleName() const override { return "BLOB"; } TypeIndex getTypeId() const override { return type_id; } diff --git a/src/DataTypes/DataTypeTuple.h b/src/DataTypes/DataTypeTuple.h index 93fa87b1332..ea05e6ae59b 100644 --- a/src/DataTypes/DataTypeTuple.h +++ b/src/DataTypes/DataTypeTuple.h @@ -33,7 +33,7 @@ public: TypeIndex getTypeId() const override { return TypeIndex::Tuple; } std::string doGetName() const override; const char * getFamilyName() const override { return "Tuple"; } - const char * getSQLCompatibleName() const override { return "JSON"; } + String getSQLCompatibleName() const override { return "JSON"; } bool canBeInsideNullable() const override { return false; } bool supportsSparseSerialization() const override { return true; } diff --git a/src/DataTypes/DataTypeUUID.h b/src/DataTypes/DataTypeUUID.h index bbf35074df3..8664c3bcfd1 100644 --- a/src/DataTypes/DataTypeUUID.h +++ b/src/DataTypes/DataTypeUUID.h @@ -18,7 +18,7 @@ public: static constexpr auto type_id = TypeIndex::UUID; const char * getFamilyName() const override { return "UUID"; } - const char * getSQLCompatibleName() const override { return "CHAR"; } + String getSQLCompatibleName() const override { return "CHAR"; } TypeIndex getTypeId() const override { return type_id; } diff --git a/src/DataTypes/DataTypesDecimal.cpp b/src/DataTypes/DataTypesDecimal.cpp index 1c2a63371ee..fa044d4ac9c 100644 --- a/src/DataTypes/DataTypesDecimal.cpp +++ b/src/DataTypes/DataTypesDecimal.cpp @@ -28,6 +28,11 @@ std::string DataTypeDecimal::doGetName() const return fmt::format("Decimal({}, {})", this->precision, this->scale); } +template +std::string DataTypeDecimal::getSQLCompatibleName() const +{ + return fmt::format("DECIMAL({}, {})", this->precision, this->scale); +} template bool DataTypeDecimal::equals(const IDataType & rhs) const diff --git a/src/DataTypes/DataTypesDecimal.h b/src/DataTypes/DataTypesDecimal.h index 6f3bf582aeb..5e4cfab7928 100644 --- a/src/DataTypes/DataTypesDecimal.h +++ b/src/DataTypes/DataTypesDecimal.h @@ -37,10 +37,9 @@ public: using Base::Base; static constexpr auto family_name = "Decimal"; - static constexpr auto mysql_name = "DECIMAL"; const char * getFamilyName() const override { return family_name; } - const char * getSQLCompatibleName() const override { return mysql_name; } + String getSQLCompatibleName() const override; std::string doGetName() const override; TypeIndex getTypeId() const override { return TypeToTypeIndex; } diff --git a/src/DataTypes/IDataType.h b/src/DataTypes/IDataType.h index 93fdbab05ef..51a9ecef0cc 100644 --- a/src/DataTypes/IDataType.h +++ b/src/DataTypes/IDataType.h @@ -71,19 +71,12 @@ public: return doGetName(); } - /// MySQL equivalent Name of data type (examples: UInt64, Array(String)). - String getMySQLTypeName() const - { - if (custom_name) - return custom_name->getName(); - else - return doGetMySQLName(); - } DataTypePtr getPtr() const { return shared_from_this(); } /// Name of data type family (example: FixedString, Array). virtual const char * getFamilyName() const = 0; - virtual const char * getSQLCompatibleName() const = 0; + /// Name of corresponding data type in MySQL (exampe: Bigint, Blob, etc) + virtual String getSQLCompatibleName() const = 0; /// Data type id. It's used for runtime type checks. virtual TypeIndex getTypeId() const = 0; @@ -135,7 +128,6 @@ public: protected: virtual String doGetName() const { return getFamilyName(); } - virtual String doGetMySQLName() const { return getSQLCompatibleName(); } virtual SerializationPtr doGetDefaultSerialization() const = 0; public: diff --git a/src/Storages/System/StorageSystemColumns.cpp b/src/Storages/System/StorageSystemColumns.cpp index f391a392dbb..684c35709a4 100644 --- a/src/Storages/System/StorageSystemColumns.cpp +++ b/src/Storages/System/StorageSystemColumns.cpp @@ -74,7 +74,8 @@ public: : ISource(header_) , columns_mask(std::move(columns_mask_)), max_block_size(max_block_size_) , databases(std::move(databases_)), tables(std::move(tables_)), storages(std::move(storages_)) - , clientInfo(context->getClientInfo()) + , client_info_interface(context->getClientInfo().interface) + , use_mysql_types(context->getSettingsRef().output_format_mysql_types) , total_tables(tables->size()), access(context->getAccess()) , query_id(context->getCurrentQueryId()), lock_acquire_timeout(context->getSettingsRef().lock_acquire_timeout) { @@ -132,9 +133,10 @@ protected: auto get_type_name = [this](const IDataType& type) -> std::string { - if (clientInfo.interface == DB::ClientInfo::Interface::MYSQL) + // Check if the output_format_mysql_types setting is enabled and client is connected via MySQL protocol + if (use_mysql_types && client_info_interface == DB::ClientInfo::Interface::MYSQL) { - return type.getMySQLTypeName(); + return type.getSQLCompatibleName(); } else { @@ -293,7 +295,8 @@ private: ColumnPtr databases; ColumnPtr tables; Storages storages; - ClientInfo clientInfo; + ClientInfo::Interface client_info_interface; + bool use_mysql_types; size_t db_table_num = 0; size_t total_tables; std::shared_ptr access; diff --git a/tests/queries/0_stateless/02775_show_columns_mysql_compatibility.reference b/tests/queries/0_stateless/02775_show_columns_mysql_compatibility.reference index 1742cd9c90c..68e7be9ae6f 100644 --- a/tests/queries/0_stateless/02775_show_columns_mysql_compatibility.reference +++ b/tests/queries/0_stateless/02775_show_columns_mysql_compatibility.reference @@ -4,6 +4,44 @@ Create pseudo-random database name Create tab duplicate table Run MySQL test field type null key default extra +aggregate_function AggregateFunction(sum, Int32) 0 NULL +array_value Array(Int32) 0 NULL +boolean_value UInt8 0 NULL +date32_value Date32 0 NULL +date_value Date 0 NULL +datetime64_value DateTime64(3) 0 NULL +datetime_value DateTime 0 NULL +decimal_value Decimal(10, 2) 0 NULL +enum_value Enum8('apple' = 1, 'banana' = 2, 'orange' = 3) 0 NULL +fixed_string_value FixedString(10) 0 NULL +float32 Float32 0 NULL +float64 Float64 0 NULL +int128 Int128 0 NULL +int16 Int16 0 NULL +int256 Int256 0 NULL +int32 Int32 0 NULL +int64 Int64 0 NULL +int8 Int8 0 NULL +ipv4_value IPv4 0 NULL +ipv6_value IPv6 0 NULL +json_value Object('json') 0 NULL +low_cardinality LowCardinality(String) 0 NULL +low_cardinality_date LowCardinality(DateTime) 0 NULL +map_value Map(String, Int32) 0 NULL +nested.nested_int Array(Int32) 0 NULL +nested.nested_string Array(String) 0 NULL +nint32 Nullable(Int32) 1 NULL +nullable_value Nullable(Int32) 1 NULL +string_value String 0 NULL +tuple_value Tuple(Int32, String) 0 NULL +uint128 UInt128 0 NULL +uint16 UInt16 0 NULL +uint256 UInt256 0 NULL +uint32 UInt32 0 NULL +uint64 UInt64 0 PRI SOR NULL +uint8 UInt8 0 NULL +uuid_value UUID 0 NULL +field type null key default extra aggregate_function TEXT 0 NULL array_value TEXT 0 NULL boolean_value TINYINT UNSIGNED 0 NULL @@ -11,12 +49,17 @@ date32_value DATE 0 NULL date_value DATE 0 NULL datetime64_value DATETIME 0 NULL datetime_value DATETIME 0 NULL -decimal_value DECIMAL 0 NULL +decimal_value DECIMAL(10, 2) 0 NULL enum_value ENUM('apple', 'banana', 'orange') 0 NULL fixed_string_value TEXT 0 NULL float32 FLOAT 0 NULL float64 DOUBLE 0 NULL -int32 INT 0 NULL +int128 TEXT 0 NULL +int16 SMALLINT 0 NULL +int256 TEXT 0 NULL +int32 INTEGER 0 NULL +int64 BIGINT 0 NULL +int8 TINYINT 0 NULL ipv4_value TEXT 0 NULL ipv6_value TEXT 0 NULL json_value JSON 0 NULL @@ -25,10 +68,16 @@ low_cardinality_date DATETIME 0 NULL map_value JSON 0 NULL nested.nested_int TEXT 0 NULL nested.nested_string TEXT 0 NULL -nullable_value INT 0 NULL +nint32 INTEGER 0 NULL +nullable_value INTEGER 0 NULL string_value BLOB 0 NULL tuple_value JSON 0 NULL +uint128 TEXT 0 NULL +uint16 SMALLINT UNSIGNED 0 NULL +uint256 TEXT 0 NULL +uint32 INTEGER UNSIGNED 0 NULL uint64 BIGINT UNSIGNED 0 PRI SOR NULL +uint8 TINYINT UNSIGNED 0 NULL uuid_value CHAR 0 NULL field type null key default extra aggregate_function TEXT 0 NULL @@ -38,12 +87,17 @@ date32_value DATE 0 NULL date_value DATE 0 NULL datetime64_value DATETIME 0 NULL datetime_value DATETIME 0 NULL -decimal_value DECIMAL 0 NULL +decimal_value DECIMAL(10, 2) 0 NULL enum_value ENUM('apple', 'banana', 'orange') 0 NULL fixed_string_value TEXT 0 NULL float32 FLOAT 0 NULL float64 DOUBLE 0 NULL -int32 INT 0 NULL +int128 TEXT 0 NULL +int16 SMALLINT 0 NULL +int256 TEXT 0 NULL +int32 INTEGER 0 NULL +int64 BIGINT 0 NULL +int8 TINYINT 0 NULL ipv4_value TEXT 0 NULL ipv6_value TEXT 0 NULL json_value JSON 0 NULL @@ -52,10 +106,16 @@ low_cardinality_date DATETIME 0 NULL map_value JSON 0 NULL nested.nested_int TEXT 0 NULL nested.nested_string TEXT 0 NULL -nullable_value INT 0 NULL +nint32 INTEGER 0 NULL +nullable_value INTEGER 0 NULL string_value BLOB 0 NULL tuple_value JSON 0 NULL +uint128 TEXT 0 NULL +uint16 SMALLINT UNSIGNED 0 NULL +uint256 TEXT 0 NULL +uint32 INTEGER UNSIGNED 0 NULL uint64 BIGINT UNSIGNED 0 PRI SOR NULL +uint8 TINYINT UNSIGNED 0 NULL uuid_value CHAR 0 NULL field type null key default extra collation comment privileges aggregate_function TEXT 0 NULL NULL @@ -65,12 +125,17 @@ date32_value DATE 0 NULL NULL date_value DATE 0 NULL NULL datetime64_value DATETIME 0 NULL NULL datetime_value DATETIME 0 NULL NULL -decimal_value DECIMAL 0 NULL NULL +decimal_value DECIMAL(10, 2) 0 NULL NULL enum_value ENUM('apple', 'banana', 'orange') 0 NULL NULL fixed_string_value TEXT 0 NULL NULL float32 FLOAT 0 NULL NULL float64 DOUBLE 0 NULL NULL -int32 INT 0 NULL NULL +int128 TEXT 0 NULL NULL +int16 SMALLINT 0 NULL NULL +int256 TEXT 0 NULL NULL +int32 INTEGER 0 NULL NULL +int64 BIGINT 0 NULL NULL +int8 TINYINT 0 NULL NULL ipv4_value TEXT 0 NULL NULL ipv6_value TEXT 0 NULL NULL json_value JSON 0 NULL NULL @@ -79,15 +144,32 @@ low_cardinality_date DATETIME 0 NULL NULL map_value JSON 0 NULL NULL nested.nested_int TEXT 0 NULL NULL nested.nested_string TEXT 0 NULL NULL -nullable_value INT 0 NULL NULL +nint32 INTEGER 0 NULL NULL +nullable_value INTEGER 0 NULL NULL string_value BLOB 0 NULL NULL tuple_value JSON 0 NULL NULL +uint128 TEXT 0 NULL NULL +uint16 SMALLINT UNSIGNED 0 NULL NULL +uint256 TEXT 0 NULL NULL +uint32 INTEGER UNSIGNED 0 NULL NULL uint64 BIGINT UNSIGNED 0 PRI SOR NULL NULL +uint8 TINYINT UNSIGNED 0 NULL NULL uuid_value CHAR 0 NULL NULL field type null key default extra -int32 INT 0 NULL +int128 TEXT 0 NULL +int16 SMALLINT 0 NULL +int256 TEXT 0 NULL +int32 INTEGER 0 NULL +int64 BIGINT 0 NULL +int8 TINYINT 0 NULL nested.nested_int TEXT 0 NULL +nint32 INTEGER 0 NULL +uint128 TEXT 0 NULL +uint16 SMALLINT UNSIGNED 0 NULL +uint256 TEXT 0 NULL +uint32 INTEGER UNSIGNED 0 NULL uint64 BIGINT UNSIGNED 0 PRI SOR NULL +uint8 TINYINT UNSIGNED 0 NULL field type null key default extra aggregate_function TEXT 0 NULL array_value TEXT 0 NULL @@ -96,7 +178,7 @@ date32_value DATE 0 NULL date_value DATE 0 NULL datetime64_value DATETIME 0 NULL datetime_value DATETIME 0 NULL -decimal_value DECIMAL 0 NULL +decimal_value DECIMAL(10, 2) 0 NULL enum_value ENUM('apple', 'banana', 'orange') 0 NULL fixed_string_value TEXT 0 NULL float32 FLOAT 0 NULL @@ -108,14 +190,25 @@ low_cardinality BLOB 0 NULL low_cardinality_date DATETIME 0 NULL map_value JSON 0 NULL nested.nested_string TEXT 0 NULL -nullable_value INT 0 NULL +nullable_value INTEGER 0 NULL string_value BLOB 0 NULL tuple_value JSON 0 NULL uuid_value CHAR 0 NULL field type null key default extra -int32 INT 0 NULL +int128 TEXT 0 NULL +int16 SMALLINT 0 NULL +int256 TEXT 0 NULL +int32 INTEGER 0 NULL +int64 BIGINT 0 NULL +int8 TINYINT 0 NULL nested.nested_int TEXT 0 NULL +nint32 INTEGER 0 NULL +uint128 TEXT 0 NULL +uint16 SMALLINT UNSIGNED 0 NULL +uint256 TEXT 0 NULL +uint32 INTEGER UNSIGNED 0 NULL uint64 BIGINT UNSIGNED 0 PRI SOR NULL +uint8 TINYINT UNSIGNED 0 NULL field type null key default extra aggregate_function TEXT 0 NULL array_value TEXT 0 NULL @@ -124,7 +217,7 @@ date32_value DATE 0 NULL date_value DATE 0 NULL datetime64_value DATETIME 0 NULL datetime_value DATETIME 0 NULL -decimal_value DECIMAL 0 NULL +decimal_value DECIMAL(10, 2) 0 NULL enum_value ENUM('apple', 'banana', 'orange') 0 NULL fixed_string_value TEXT 0 NULL float32 FLOAT 0 NULL @@ -136,14 +229,25 @@ low_cardinality BLOB 0 NULL low_cardinality_date DATETIME 0 NULL map_value JSON 0 NULL nested.nested_string TEXT 0 NULL -nullable_value INT 0 NULL +nullable_value INTEGER 0 NULL string_value BLOB 0 NULL tuple_value JSON 0 NULL uuid_value CHAR 0 NULL field type null key default extra -int32 INT 0 NULL +int128 TEXT 0 NULL +int16 SMALLINT 0 NULL +int256 TEXT 0 NULL +int32 INTEGER 0 NULL +int64 BIGINT 0 NULL +int8 TINYINT 0 NULL nested.nested_int TEXT 0 NULL +nint32 INTEGER 0 NULL +uint128 TEXT 0 NULL +uint16 SMALLINT UNSIGNED 0 NULL +uint256 TEXT 0 NULL +uint32 INTEGER UNSIGNED 0 NULL uint64 BIGINT UNSIGNED 0 PRI SOR NULL +uint8 TINYINT UNSIGNED 0 NULL field type null key default extra aggregate_function TEXT 0 NULL field type null key default extra @@ -154,12 +258,17 @@ date32_value DATE 0 NULL date_value DATE 0 NULL datetime64_value DATETIME 0 NULL datetime_value DATETIME 0 NULL -decimal_value DECIMAL 0 NULL +decimal_value DECIMAL(10, 2) 0 NULL enum_value ENUM('apple', 'banana', 'orange') 0 NULL fixed_string_value TEXT 0 NULL float32 FLOAT 0 NULL float64 DOUBLE 0 NULL -int32 INT 0 NULL +int128 TEXT 0 NULL +int16 SMALLINT 0 NULL +int256 TEXT 0 NULL +int32 INTEGER 0 NULL +int64 BIGINT 0 NULL +int8 TINYINT 0 NULL ipv4_value TEXT 0 NULL ipv6_value TEXT 0 NULL json_value JSON 0 NULL @@ -168,10 +277,16 @@ low_cardinality_date DATETIME 0 NULL map_value JSON 0 NULL nested.nested_int TEXT 0 NULL nested.nested_string TEXT 0 NULL -nullable_value INT 0 NULL +nint32 INTEGER 0 NULL +nullable_value INTEGER 0 NULL string_value BLOB 0 NULL tuple_value JSON 0 NULL +uint128 TEXT 0 NULL +uint16 SMALLINT UNSIGNED 0 NULL +uint256 TEXT 0 NULL +uint32 INTEGER UNSIGNED 0 NULL uint64 BIGINT UNSIGNED 0 PRI SOR NULL +uint8 TINYINT UNSIGNED 0 NULL uuid_value CHAR 0 NULL field type null key default extra aggregate_function TEXT 0 NULL @@ -181,12 +296,17 @@ date32_value DATE 0 NULL date_value DATE 0 NULL datetime64_value DATETIME 0 NULL datetime_value DATETIME 0 NULL -decimal_value DECIMAL 0 NULL +decimal_value DECIMAL(10, 2) 0 NULL enum_value ENUM('apple', 'banana', 'orange') 0 NULL fixed_string_value TEXT 0 NULL float32 FLOAT 0 NULL float64 DOUBLE 0 NULL -int32 INT 0 NULL +int128 TEXT 0 NULL +int16 SMALLINT 0 NULL +int256 TEXT 0 NULL +int32 INTEGER 0 NULL +int64 BIGINT 0 NULL +int8 TINYINT 0 NULL ipv4_value TEXT 0 NULL ipv6_value TEXT 0 NULL json_value JSON 0 NULL @@ -195,10 +315,16 @@ low_cardinality_date DATETIME 0 NULL map_value JSON 0 NULL nested.nested_int TEXT 0 NULL nested.nested_string TEXT 0 NULL -nullable_value INT 0 NULL +nint32 INTEGER 0 NULL +nullable_value INTEGER 0 NULL string_value BLOB 0 NULL tuple_value JSON 0 NULL +uint128 TEXT 0 NULL +uint16 SMALLINT UNSIGNED 0 NULL +uint256 TEXT 0 NULL +uint32 INTEGER UNSIGNED 0 NULL uint64 BIGINT UNSIGNED 0 PRI SOR NULL +uint8 TINYINT UNSIGNED 0 NULL uuid_value CHAR 0 NULL field type null key default extra aggregate_function TEXT 0 NULL @@ -208,12 +334,17 @@ date32_value DATE 0 NULL date_value DATE 0 NULL datetime64_value DATETIME 0 NULL datetime_value DATETIME 0 NULL -decimal_value DECIMAL 0 NULL +decimal_value DECIMAL(10, 2) 0 NULL enum_value ENUM('apple', 'banana', 'orange') 0 NULL fixed_string_value TEXT 0 NULL float32 FLOAT 0 NULL float64 DOUBLE 0 NULL -int32 INT 0 NULL +int128 TEXT 0 NULL +int16 SMALLINT 0 NULL +int256 TEXT 0 NULL +int32 INTEGER 0 NULL +int64 BIGINT 0 NULL +int8 TINYINT 0 NULL ipv4_value TEXT 0 NULL ipv6_value TEXT 0 NULL json_value JSON 0 NULL @@ -222,8 +353,14 @@ low_cardinality_date DATETIME 0 NULL map_value JSON 0 NULL nested.nested_int TEXT 0 NULL nested.nested_string TEXT 0 NULL -nullable_value INT 0 NULL +nint32 INTEGER 0 NULL +nullable_value INTEGER 0 NULL string_value BLOB 0 NULL tuple_value JSON 0 NULL +uint128 TEXT 0 NULL +uint16 SMALLINT UNSIGNED 0 NULL +uint256 TEXT 0 NULL +uint32 INTEGER UNSIGNED 0 NULL uint64 BIGINT UNSIGNED 0 PRI SOR NULL +uint8 TINYINT UNSIGNED 0 NULL uuid_value CHAR 0 NULL diff --git a/tests/queries/0_stateless/02775_show_columns_mysql_compatibility.sh b/tests/queries/0_stateless/02775_show_columns_mysql_compatibility.sh index fd1ad92f060..938102cb5fc 100755 --- a/tests/queries/0_stateless/02775_show_columns_mysql_compatibility.sh +++ b/tests/queries/0_stateless/02775_show_columns_mysql_compatibility.sh @@ -17,15 +17,25 @@ ${CLICKHOUSE_LOCAL} --query "DROP TABLE IF EXISTS tab" ${CLICKHOUSE_LOCAL} --query "DROP TABLE IF EXISTS database_123456789abcde" ${CLICKHOUSE_LOCAL} --query "DROP TABLE IF EXISTS database_123456789abcde.tab" -#${CLICKHOUSE_LOCAL} --query "SET allow_suspicious_low_cardinality_types = 1;" echo "Create tab table " ${CLICKHOUSE_LOCAL} -n -q " SET allow_suspicious_low_cardinality_types=1; - SET allow_experimental_object_type =1; + SET allow_experimental_object_type=1; CREATE TABLE tab ( + uint8 UInt8, + uint16 UInt16, + uint32 UInt32, uint64 UInt64, - int32 Nullable(Int32), + uint128 UInt128, + uint256 UInt256, + int8 Int8, + int16 Int16, + int32 Int32, + int64 Int64, + int128 Int128, + int256 Int256, + nint32 Nullable(Int32), float32 Float32, float64 Float64, decimal_value Decimal(10, 2), @@ -67,8 +77,19 @@ ${CLICKHOUSE_LOCAL} -n -q " SET allow_experimental_object_type =1; CREATE TABLE database_123456789abcde.tab ( + uint8 UInt8, + uint16 UInt16, + uint32 UInt32, uint64 UInt64, - int32 Nullable(Int32), + uint128 UInt128, + uint256 UInt256, + int8 Int8, + int16 Int16, + int32 Int32, + int64 Int64, + int128 Int128, + int256 Int256, + nint32 Nullable(Int32), float32 Float32, float64 Float64, decimal_value Decimal(10, 2), @@ -105,6 +126,8 @@ TEMP_FILE=$(mktemp) cat < $TEMP_FILE SHOW COLUMNS FROM tab; +SET output_format_mysql_types=1; +SHOW COLUMNS FROM tab; SHOW EXTENDED COLUMNS FROM tab; SHOW FULL COLUMNS FROM tab; SHOW COLUMNS FROM tab LIKE '%int%'; From cf9936ad327b4f94903e5993a1189c149c9c61dd Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Mon, 5 Jun 2023 23:58:43 +0000 Subject: [PATCH 0510/1072] Automatic style fix --- .../test_storage_azure_blob_storage/test.py | 91 ++++++++++++++----- 1 file changed, 68 insertions(+), 23 deletions(-) diff --git a/tests/integration/test_storage_azure_blob_storage/test.py b/tests/integration/test_storage_azure_blob_storage/test.py index e78fa185b17..6a3c915cdc5 100644 --- a/tests/integration/test_storage_azure_blob_storage/test.py +++ b/tests/integration/test_storage_azure_blob_storage/test.py @@ -17,6 +17,7 @@ from helpers.network import PartitionManager from helpers.mock_servers import start_mock_servers from helpers.test_tools import exec_query_with_retry + @pytest.fixture(scope="module") def cluster(): try: @@ -32,6 +33,7 @@ def cluster(): finally: cluster.shutdown() + def azure_query(node, query, try_num=3, settings={}): for i in range(try_num): try: @@ -50,6 +52,7 @@ def azure_query(node, query, try_num=3, settings={}): raise Exception(ex) continue + def get_azure_file_content(filename): container_name = "cont" connection_string = "DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;" @@ -57,43 +60,72 @@ def get_azure_file_content(filename): container_client = blob_service_client.get_container_client(container_name) blob_client = container_client.get_blob_client(filename) download_stream = blob_client.download_blob() - return download_stream.readall().decode('utf-8') + return download_stream.readall().decode("utf-8") + def test_create_table_connection_string(cluster): node = cluster.instances["node"] - azure_query(node, "CREATE TABLE test_create_table_conn_string (key UInt64, data String) Engine = Azure('DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://azurite1:10000/devstoreaccount1/;', 'cont', 'test_create_connection_string', 'CSV')") + azure_query( + node, + "CREATE TABLE test_create_table_conn_string (key UInt64, data String) Engine = Azure('DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://azurite1:10000/devstoreaccount1/;', 'cont', 'test_create_connection_string', 'CSV')", + ) + def test_create_table_account_string(cluster): node = cluster.instances["node"] - azure_query(node, "CREATE TABLE test_create_table_account_url (key UInt64, data String) Engine = Azure('http://azurite1:10000/devstoreaccount1', 'cont', 'test_create_connection_string', 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'CSV')") + azure_query( + node, + "CREATE TABLE test_create_table_account_url (key UInt64, data String) Engine = Azure('http://azurite1:10000/devstoreaccount1', 'cont', 'test_create_connection_string', 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'CSV')", + ) + def test_simple_write_account_string(cluster): node = cluster.instances["node"] - azure_query(node, "CREATE TABLE test_simple_write (key UInt64, data String) Engine = Azure('http://azurite1:10000/devstoreaccount1', 'cont', 'test_simple_write.csv', 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'CSV')") + azure_query( + node, + "CREATE TABLE test_simple_write (key UInt64, data String) Engine = Azure('http://azurite1:10000/devstoreaccount1', 'cont', 'test_simple_write.csv', 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'CSV')", + ) azure_query(node, "INSERT INTO test_simple_write VALUES (1, 'a')") - print(get_azure_file_content('test_simple_write.csv')) - assert get_azure_file_content('test_simple_write.csv') == '1,"a"\n' + print(get_azure_file_content("test_simple_write.csv")) + assert get_azure_file_content("test_simple_write.csv") == '1,"a"\n' + def test_simple_write_connection_string(cluster): node = cluster.instances["node"] - azure_query(node, "CREATE TABLE test_simple_write_connection_string (key UInt64, data String) Engine = Azure('DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://azurite1:10000/devstoreaccount1;', 'cont', 'test_simple_write_c.csv', 'CSV')") + azure_query( + node, + "CREATE TABLE test_simple_write_connection_string (key UInt64, data String) Engine = Azure('DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://azurite1:10000/devstoreaccount1;', 'cont', 'test_simple_write_c.csv', 'CSV')", + ) azure_query(node, "INSERT INTO test_simple_write_connection_string VALUES (1, 'a')") - print(get_azure_file_content('test_simple_write_c.csv')) - assert get_azure_file_content('test_simple_write_c.csv') == '1,"a"\n' + print(get_azure_file_content("test_simple_write_c.csv")) + assert get_azure_file_content("test_simple_write_c.csv") == '1,"a"\n' + def test_simple_write_named_collection_1(cluster): node = cluster.instances["node"] - azure_query(node, "CREATE TABLE test_simple_write_named_collection_1 (key UInt64, data String) Engine = Azure(azure_conf1)") - azure_query(node, "INSERT INTO test_simple_write_named_collection_1 VALUES (1, 'a')") - print(get_azure_file_content('test_simple_write_named.csv')) - assert get_azure_file_content('test_simple_write_named.csv') == '1,"a"\n' + azure_query( + node, + "CREATE TABLE test_simple_write_named_collection_1 (key UInt64, data String) Engine = Azure(azure_conf1)", + ) + azure_query( + node, "INSERT INTO test_simple_write_named_collection_1 VALUES (1, 'a')" + ) + print(get_azure_file_content("test_simple_write_named.csv")) + assert get_azure_file_content("test_simple_write_named.csv") == '1,"a"\n' + def test_simple_write_named_collection_2(cluster): node = cluster.instances["node"] - azure_query(node, "CREATE TABLE test_simple_write_named_collection_2 (key UInt64, data String) Engine = Azure(azure_conf2, container='cont', blob_path='test_simple_write_named_2.csv', format='CSV')") - azure_query(node, "INSERT INTO test_simple_write_named_collection_2 VALUES (1, 'a')") - print(get_azure_file_content('test_simple_write_named_2.csv')) - assert get_azure_file_content('test_simple_write_named_2.csv') == '1,"a"\n' + azure_query( + node, + "CREATE TABLE test_simple_write_named_collection_2 (key UInt64, data String) Engine = Azure(azure_conf2, container='cont', blob_path='test_simple_write_named_2.csv', format='CSV')", + ) + azure_query( + node, "INSERT INTO test_simple_write_named_collection_2 VALUES (1, 'a')" + ) + print(get_azure_file_content("test_simple_write_named_2.csv")) + assert get_azure_file_content("test_simple_write_named_2.csv") == '1,"a"\n' + def test_partition_by(cluster): node = cluster.instances["node"] @@ -102,7 +134,10 @@ def test_partition_by(cluster): values = "(1, 2, 3), (3, 2, 1), (78, 43, 45)" filename = "test_{_partition_id}.csv" - azure_query(node, f"CREATE TABLE test_partitioned_write ({table_format}) Engine = Azure('http://azurite1:10000/devstoreaccount1', 'cont', '{filename}', 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'CSV') PARTITION BY {partition_by}") + azure_query( + node, + f"CREATE TABLE test_partitioned_write ({table_format}) Engine = Azure('http://azurite1:10000/devstoreaccount1', 'cont', '{filename}', 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'CSV') PARTITION BY {partition_by}", + ) azure_query(node, f"INSERT INTO test_partitioned_write VALUES {values}") assert "1,2,3\n" == get_azure_file_content("test_3.csv") @@ -116,7 +151,10 @@ def test_partition_by_string_column(cluster): partition_by = "col_str" values = "(1, 'foo/bar'), (3, 'йцук'), (78, '你好')" filename = "test_{_partition_id}.csv" - azure_query(node, f"CREATE TABLE test_partitioned_string_write ({table_format}) Engine = Azure('http://azurite1:10000/devstoreaccount1', 'cont', '{filename}', 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'CSV') PARTITION BY {partition_by}") + azure_query( + node, + f"CREATE TABLE test_partitioned_string_write ({table_format}) Engine = Azure('http://azurite1:10000/devstoreaccount1', 'cont', '{filename}', 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'CSV') PARTITION BY {partition_by}", + ) azure_query(node, f"INSERT INTO test_partitioned_string_write VALUES {values}") assert '1,"foo/bar"\n' == get_azure_file_content("test_foo/bar.csv") @@ -131,15 +169,22 @@ def test_partition_by_const_column(cluster): partition_by = "'88'" values_csv = "1,2,3\n3,2,1\n78,43,45\n" filename = "test_{_partition_id}.csv" - azure_query(node, f"CREATE TABLE test_partitioned_const_write ({table_format}) Engine = Azure('http://azurite1:10000/devstoreaccount1', 'cont', '{filename}', 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'CSV') PARTITION BY {partition_by}") + azure_query( + node, + f"CREATE TABLE test_partitioned_const_write ({table_format}) Engine = Azure('http://azurite1:10000/devstoreaccount1', 'cont', '{filename}', 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'CSV') PARTITION BY {partition_by}", + ) azure_query(node, f"INSERT INTO test_partitioned_const_write VALUES {values}") assert values_csv == get_azure_file_content("test_88.csv") + def test_truncate(cluster): node = cluster.instances["node"] - azure_query(node, "CREATE TABLE test_truncate (key UInt64, data String) Engine = Azure(azure_conf2, container='cont', blob_path='test_truncate.csv', format='CSV')") + azure_query( + node, + "CREATE TABLE test_truncate (key UInt64, data String) Engine = Azure(azure_conf2, container='cont', blob_path='test_truncate.csv', format='CSV')", + ) azure_query(node, "INSERT INTO test_truncate VALUES (1, 'a')") - assert get_azure_file_content('test_truncate.csv') == '1,"a"\n' + assert get_azure_file_content("test_truncate.csv") == '1,"a"\n' azure_query(node, "TRUNCATE TABLE test_truncate") with pytest.raises(Exception): - print(get_azure_file_content('test_truncate.csv')) + print(get_azure_file_content("test_truncate.csv")) From 9f80900d6f587383780d2a40f8173093dce68a5a Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Thu, 1 Jun 2023 22:02:17 +0000 Subject: [PATCH 0511/1072] Changes related to an internal feature --- src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp | 2 +- src/Disks/IO/ReadBufferFromRemoteFSGather.cpp | 2 +- src/Disks/ObjectStorages/Cached/CachedObjectStorage.cpp | 7 +++++-- src/Disks/ObjectStorages/Cached/CachedObjectStorage.h | 4 +++- src/IO/ReadSettings.h | 2 ++ src/Interpreters/Cache/FileSegment.h | 2 +- src/Interpreters/Cache/IFileCachePriority.h | 1 + src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp | 7 +++++++ src/Storages/MergeTree/DataPartStorageOnDiskBase.h | 1 + src/Storages/MergeTree/IDataPartStorage.h | 1 + .../configs/config.d/storage_conf.xml | 1 + .../configs/config.d/users.xml | 7 +++++++ .../test_replicated_merge_tree_s3_zero_copy/test.py | 8 ++++++-- 13 files changed, 37 insertions(+), 8 deletions(-) create mode 100644 tests/integration/test_replicated_merge_tree_s3_zero_copy/configs/config.d/users.xml diff --git a/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp b/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp index 7c497baa450..877d8ff9bb7 100644 --- a/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp +++ b/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp @@ -1219,7 +1219,7 @@ off_t CachedOnDiskReadBufferFromFile::getPosition() void CachedOnDiskReadBufferFromFile::assertCorrectness() const { - if (!CachedObjectStorage::canUseReadThroughCache() + if (!CachedObjectStorage::canUseReadThroughCache(settings) && !settings.read_from_filesystem_cache_if_exists_otherwise_bypass_cache) throw Exception(ErrorCodes::LOGICAL_ERROR, "Cache usage is not allowed (query_id: {})", query_id); } diff --git a/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp b/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp index 12fbbbcf747..04030fe5f8f 100644 --- a/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp +++ b/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp @@ -36,7 +36,7 @@ ReadBufferFromRemoteFSGather::ReadBufferFromRemoteFSGather( with_cache = settings.remote_fs_cache && settings.enable_filesystem_cache - && (!query_id.empty() || settings.read_from_filesystem_cache_if_exists_otherwise_bypass_cache); + && (!query_id.empty() || settings.read_from_filesystem_cache_if_exists_otherwise_bypass_cache || !settings.avoid_readthrough_cache_outside_query_context); } SeekableReadBufferPtr ReadBufferFromRemoteFSGather::createImplementationBuffer(const StoredObject & object) diff --git a/src/Disks/ObjectStorages/Cached/CachedObjectStorage.cpp b/src/Disks/ObjectStorages/Cached/CachedObjectStorage.cpp index 1d24d9d5411..3e73e45638b 100644 --- a/src/Disks/ObjectStorages/Cached/CachedObjectStorage.cpp +++ b/src/Disks/ObjectStorages/Cached/CachedObjectStorage.cpp @@ -57,7 +57,7 @@ ReadSettings CachedObjectStorage::patchSettings(const ReadSettings & read_settin ReadSettings modified_settings{read_settings}; modified_settings.remote_fs_cache = cache; - if (!canUseReadThroughCache()) + if (!canUseReadThroughCache(read_settings)) modified_settings.read_from_filesystem_cache_if_exists_otherwise_bypass_cache = true; return object_storage->patchSettings(modified_settings); @@ -227,8 +227,11 @@ String CachedObjectStorage::getObjectsNamespace() const return object_storage->getObjectsNamespace(); } -bool CachedObjectStorage::canUseReadThroughCache() +bool CachedObjectStorage::canUseReadThroughCache(const ReadSettings & settings) { + if (!settings.avoid_readthrough_cache_outside_query_context) + return true; + return CurrentThread::isInitialized() && CurrentThread::get().getQueryContext() && !CurrentThread::getQueryId().empty(); diff --git a/src/Disks/ObjectStorages/Cached/CachedObjectStorage.h b/src/Disks/ObjectStorages/Cached/CachedObjectStorage.h index b5186d39c32..ba9fbd02d94 100644 --- a/src/Disks/ObjectStorages/Cached/CachedObjectStorage.h +++ b/src/Disks/ObjectStorages/Cached/CachedObjectStorage.h @@ -112,7 +112,9 @@ public: WriteSettings getAdjustedSettingsFromMetadataFile(const WriteSettings & settings, const std::string & path) const override; - static bool canUseReadThroughCache(); + const FileCacheSettings & getCacheSettings() const { return cache_settings; } + + static bool canUseReadThroughCache(const ReadSettings & settings); private: FileCache::Key getCacheKey(const std::string & path) const; diff --git a/src/IO/ReadSettings.h b/src/IO/ReadSettings.h index e43ecd7f275..dae4261e92c 100644 --- a/src/IO/ReadSettings.h +++ b/src/IO/ReadSettings.h @@ -99,6 +99,8 @@ struct ReadSettings bool read_from_filesystem_cache_if_exists_otherwise_bypass_cache = false; bool enable_filesystem_cache_log = false; bool is_file_cache_persistent = false; /// Some files can be made non-evictable. + /// Don't populate cache when the read is not part of query execution (e.g. background thread). + bool avoid_readthrough_cache_outside_query_context = true; size_t filesystem_cache_max_download_size = (128UL * 1024 * 1024 * 1024); bool skip_download_if_exceeds_query_cache = true; diff --git a/src/Interpreters/Cache/FileSegment.h b/src/Interpreters/Cache/FileSegment.h index 163a15fcfda..75395a671f4 100644 --- a/src/Interpreters/Cache/FileSegment.h +++ b/src/Interpreters/Cache/FileSegment.h @@ -85,7 +85,7 @@ public: EMPTY, /** * A newly created file segment never has DOWNLOADING state until call to getOrSetDownloader - * because each cache user might acquire multiple file segments and reads them one by one, + * because each cache user might acquire multiple file segments and read them one by one, * so only user which actually needs to read this segment earlier than others - becomes a downloader. */ DOWNLOADING, diff --git a/src/Interpreters/Cache/IFileCachePriority.h b/src/Interpreters/Cache/IFileCachePriority.h index ad63dcc7ea5..93343398783 100644 --- a/src/Interpreters/Cache/IFileCachePriority.h +++ b/src/Interpreters/Cache/IFileCachePriority.h @@ -85,6 +85,7 @@ public: virtual void removeAll(const CacheGuard::Lock &) = 0; + /// From lowest to highest priority. virtual void iterate(IterateFunc && func, const CacheGuard::Lock &) = 0; private: diff --git a/src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp b/src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp index cfc3ff58f81..30776a8bc50 100644 --- a/src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp +++ b/src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp @@ -202,6 +202,13 @@ bool DataPartStorageOnDiskBase::isStoredOnRemoteDisk() const return volume->getDisk()->isRemote(); } +std::optional DataPartStorageOnDiskBase::getCacheName() const +{ + if (volume->getDisk()->supportsCache()) + return volume->getDisk()->getCacheName(); + return std::nullopt; +} + bool DataPartStorageOnDiskBase::supportZeroCopyReplication() const { return volume->getDisk()->supportZeroCopyReplication(); diff --git a/src/Storages/MergeTree/DataPartStorageOnDiskBase.h b/src/Storages/MergeTree/DataPartStorageOnDiskBase.h index 6b27b7296fc..043953eb20c 100644 --- a/src/Storages/MergeTree/DataPartStorageOnDiskBase.h +++ b/src/Storages/MergeTree/DataPartStorageOnDiskBase.h @@ -36,6 +36,7 @@ public: std::string getDiskName() const override; std::string getDiskType() const override; bool isStoredOnRemoteDisk() const override; + std::optional getCacheName() const override; bool supportZeroCopyReplication() const override; bool supportParallelWrite() const override; bool isBroken() const override; diff --git a/src/Storages/MergeTree/IDataPartStorage.h b/src/Storages/MergeTree/IDataPartStorage.h index f160254350d..933c9bd9958 100644 --- a/src/Storages/MergeTree/IDataPartStorage.h +++ b/src/Storages/MergeTree/IDataPartStorage.h @@ -149,6 +149,7 @@ public: virtual std::string getDiskName() const = 0; virtual std::string getDiskType() const = 0; virtual bool isStoredOnRemoteDisk() const { return false; } + virtual std::optional getCacheName() const { return std::nullopt; } virtual bool supportZeroCopyReplication() const { return false; } virtual bool supportParallelWrite() const = 0; virtual bool isBroken() const = 0; diff --git a/tests/integration/test_replicated_merge_tree_s3_zero_copy/configs/config.d/storage_conf.xml b/tests/integration/test_replicated_merge_tree_s3_zero_copy/configs/config.d/storage_conf.xml index 15239041478..96d59d5633e 100644 --- a/tests/integration/test_replicated_merge_tree_s3_zero_copy/configs/config.d/storage_conf.xml +++ b/tests/integration/test_replicated_merge_tree_s3_zero_copy/configs/config.d/storage_conf.xml @@ -12,6 +12,7 @@ s3 100000000 ./cache_s3/ + 1 diff --git a/tests/integration/test_replicated_merge_tree_s3_zero_copy/configs/config.d/users.xml b/tests/integration/test_replicated_merge_tree_s3_zero_copy/configs/config.d/users.xml new file mode 100644 index 00000000000..5de169edc1e --- /dev/null +++ b/tests/integration/test_replicated_merge_tree_s3_zero_copy/configs/config.d/users.xml @@ -0,0 +1,7 @@ + + + + 1 + + + diff --git a/tests/integration/test_replicated_merge_tree_s3_zero_copy/test.py b/tests/integration/test_replicated_merge_tree_s3_zero_copy/test.py index eca18820016..72a01d278d8 100644 --- a/tests/integration/test_replicated_merge_tree_s3_zero_copy/test.py +++ b/tests/integration/test_replicated_merge_tree_s3_zero_copy/test.py @@ -19,6 +19,7 @@ def cluster(): cluster.add_instance( "node1", main_configs=["configs/config.d/storage_conf.xml"], + user_configs=["configs/config.d/users.xml"], macros={"replica": "1"}, with_minio=True, with_zookeeper=True, @@ -26,12 +27,14 @@ def cluster(): cluster.add_instance( "node2", main_configs=["configs/config.d/storage_conf.xml"], + user_configs=["configs/config.d/users.xml"], macros={"replica": "2"}, with_zookeeper=True, ) cluster.add_instance( "node3", main_configs=["configs/config.d/storage_conf.xml"], + user_configs=["configs/config.d/users.xml"], macros={"replica": "3"}, with_zookeeper=True, ) @@ -74,7 +77,7 @@ def generate_values(date_str, count, sign=1): def create_table(cluster, additional_settings=None): create_table_statement = """ - CREATE TABLE s3_test ON CLUSTER cluster( + CREATE TABLE s3_test ON CLUSTER cluster ( dt Date, id Int64, data String, @@ -95,7 +98,8 @@ def create_table(cluster, additional_settings=None): def drop_table(cluster): yield for node in list(cluster.instances.values()): - node.query("DROP TABLE IF EXISTS s3_test") + node.query("DROP TABLE IF EXISTS s3_test SYNC") + node.query("DROP TABLE IF EXISTS test_drop_table SYNC") minio = cluster.minio_client # Remove extra objects to prevent tests cascade failing From 6d25e5a0d75325ddeee2be0d689da6ea4395fccc Mon Sep 17 00:00:00 2001 From: Victor Krasnov Date: Tue, 6 Jun 2023 07:37:14 +0300 Subject: [PATCH 0512/1072] Substitute missing year in parseDateTimeBestEffortImpl() --- src/IO/parseDateTimeBestEffort.cpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/IO/parseDateTimeBestEffort.cpp b/src/IO/parseDateTimeBestEffort.cpp index b370bee6f3f..f753e3c0b4e 100644 --- a/src/IO/parseDateTimeBestEffort.cpp +++ b/src/IO/parseDateTimeBestEffort.cpp @@ -578,12 +578,16 @@ ReturnType parseDateTimeBestEffortImpl( if (!year && !month && !day_of_month && !has_time) return on_error(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot read DateTime: neither Date nor Time was parsed successfully"); - if (!year) - year = 2000; - if (!month) - month = 1; if (!day_of_month) day_of_month = 1; + if (!month) + month = 1; + if (!year) + { + time_t now = time(nullptr); + UInt16 curr_year = local_time_zone.toYear(now); + year = now < local_time_zone.makeDateTime(year, month, day_of_month, hour, minute, second) ? curr_year - 1 : curr_year; + } auto is_leap_year = (year % 400 == 0) || (year % 100 != 0 && year % 4 == 0); From 614451998d60d4ffcb1e06096ab64882dc97d7ea Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Tue, 6 Jun 2023 08:06:24 +0200 Subject: [PATCH 0513/1072] Fixed build by adding StorageAzureSource constructor and getHeader function implementation --- src/Storages/StorageAzure.cpp | 49 +++++++++++++++++++++++++++++++++++ src/Storages/StorageAzure.h | 16 ++++++------ 2 files changed, 57 insertions(+), 8 deletions(-) diff --git a/src/Storages/StorageAzure.cpp b/src/Storages/StorageAzure.cpp index 166c1156c49..3e29fa43383 100644 --- a/src/Storages/StorageAzure.cpp +++ b/src/Storages/StorageAzure.cpp @@ -43,6 +43,12 @@ using namespace Azure::Storage::Blobs; +namespace CurrentMetrics +{ + extern const Metric ObjectStorageAzureThreads; + extern const Metric ObjectStorageAzureThreadsActive; +} + namespace DB { @@ -1038,6 +1044,49 @@ Chunk StorageAzureSource::generate() return {}; } +Block StorageAzureSource::getHeader(Block sample_block, const std::vector & requested_virtual_columns) +{ + for (const auto & virtual_column : requested_virtual_columns) + sample_block.insert({virtual_column.type->createColumn(), virtual_column.type, virtual_column.name}); + + return sample_block; +} + +StorageAzureSource::StorageAzureSource( + const std::vector & requested_virtual_columns_, + const String & format_, + String name_, + const Block & sample_block_, + ContextPtr context_, + std::optional format_settings_, + const ColumnsDescription & columns_, + UInt64 max_block_size_, + String compression_hint_, + AzureObjectStorage * object_storage_, + const String & container_, + std::shared_ptr file_iterator_) + :ISource(getHeader(sample_block_, requested_virtual_columns_)) + , WithContext(context_) + , requested_virtual_columns(requested_virtual_columns_) + , format(format_) + , name(std::move(name_)) + , sample_block(sample_block_) + , format_settings(format_settings_) + , columns_desc(columns_) + , max_block_size(max_block_size_) + , compression_hint(compression_hint_) + , object_storage(std::move(object_storage_)) + , container(container_) + , file_iterator(file_iterator_) + , create_reader_pool(CurrentMetrics::ObjectStorageAzureThreads, CurrentMetrics::ObjectStorageAzureThreadsActive, 1) + , create_reader_scheduler(threadPoolCallbackRunner(create_reader_pool, "CreateAzureReader")) +{ + reader = createReader(); + if (reader) + reader_future = createReaderAsync(); +} + + StorageAzureSource::~StorageAzureSource() { create_reader_pool.wait(); diff --git a/src/Storages/StorageAzure.h b/src/Storages/StorageAzure.h index b7f5eba343f..168015cf5d9 100644 --- a/src/Storages/StorageAzure.h +++ b/src/Storages/StorageAzure.h @@ -198,17 +198,17 @@ public: static Block getHeader(Block sample_block, const std::vector & requested_virtual_columns); private: - String name; - String container; - String format; std::vector requested_virtual_columns; - ColumnsDescription columns_desc; - AzureObjectStorage * object_storage; - std::shared_ptr file_iterator; - UInt64 max_block_size; - String compression_hint; + String format; + String name; Block sample_block; std::optional format_settings; + ColumnsDescription columns_desc; + UInt64 max_block_size; + String compression_hint; + AzureObjectStorage * object_storage; + String container; + std::shared_ptr file_iterator; struct ReaderHolder { From 0bc31a72888da7ccf50caee845abef4141077220 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Tue, 6 Jun 2023 07:17:30 +0000 Subject: [PATCH 0514/1072] Fix jepsen runs in PRs --- tests/ci/jepsen_check.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/tests/ci/jepsen_check.py b/tests/ci/jepsen_check.py index 9d35d2d6e35..c21fafa2605 100644 --- a/tests/ci/jepsen_check.py +++ b/tests/ci/jepsen_check.py @@ -25,6 +25,7 @@ from stopwatch import Stopwatch from tee_popen import TeePopen from upload_result_helper import upload_results from version_helper import get_version_from_repo +from build_check import get_release_or_pr JEPSEN_GROUP_NAME = "jepsen_group" @@ -210,12 +211,7 @@ if __name__ == "__main__": build_name = get_build_name_for_check(check_name) - if pr_info.number == 0: - version = get_version_from_repo() - release_or_pr = f"{version.major}.{version.minor}" - else: - # PR number for anything else - release_or_pr = str(pr_info.number) + release_or_pr, _ = get_release_or_pr(pr_info, get_version_from_repo()) # This check run separately from other checks because it requires exclusive # run (see .github/workflows/jepsen.yml) So we cannot add explicit From ce8b39487e5416b018ce9d03feef4a8114e04f9b Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Tue, 6 Jun 2023 09:55:50 +0200 Subject: [PATCH 0515/1072] Update docs/en/engines/table-engines/mergetree-family/annindexes.md Co-authored-by: Nikita Taranov --- docs/en/engines/table-engines/mergetree-family/annindexes.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/engines/table-engines/mergetree-family/annindexes.md b/docs/en/engines/table-engines/mergetree-family/annindexes.md index 58655c11321..f600f9a015c 100644 --- a/docs/en/engines/table-engines/mergetree-family/annindexes.md +++ b/docs/en/engines/table-engines/mergetree-family/annindexes.md @@ -54,7 +54,7 @@ ENGINE = MergeTree ORDER BY id; ``` -ANN indexes are built during column insertion and merge and `INSERT` and `OPTIMIZE` statements will slower than for ordinary tables. ANNIndexes are ideally used only with immutable or rarely changed data, respectively comparatively many more read requests than write requests. +ANN indexes are built during column insertion and merge and `INSERT` and `OPTIMIZE` statements will be slower than for ordinary tables. ANNIndexes are ideally used only with immutable or rarely changed data, respectively there are much more read requests than write requests. Similar to regular skip indexes, ANN indexes are constructed over granules and each indexed block consists of `GRANULARITY = `-many granules. For example, if the primary index granularity of the table is 8192 (setting `index_granularity = 8192`) and `GRANULARITY = 2`, From 0b18b75bec6f29e687b6d9cce91fc2cc6c906221 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Tue, 6 Jun 2023 08:06:03 +0000 Subject: [PATCH 0516/1072] Cosmetics: Use abbreviated syntax to read settings --- .../MergeTree/ApproximateNearestNeighborIndexesCommon.cpp | 4 ++-- src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Storages/MergeTree/ApproximateNearestNeighborIndexesCommon.cpp b/src/Storages/MergeTree/ApproximateNearestNeighborIndexesCommon.cpp index c47e53788a7..bf277c55863 100644 --- a/src/Storages/MergeTree/ApproximateNearestNeighborIndexesCommon.cpp +++ b/src/Storages/MergeTree/ApproximateNearestNeighborIndexesCommon.cpp @@ -53,8 +53,8 @@ ApproximateNearestNeighborInformation::Metric stringToMetric(std::string_view me ApproximateNearestNeighborCondition::ApproximateNearestNeighborCondition(const SelectQueryInfo & query_info, ContextPtr context) : block_with_constants(KeyCondition::getBlockWithConstants(query_info.query, query_info.syntax_analyzer_result, context)) - , index_granularity(context->getMergeTreeSettings().get("index_granularity").get()) - , max_limit_for_ann_queries(context->getSettings().get("max_limit_for_ann_queries").get()) + , index_granularity(context->getMergeTreeSettings().index_granularity) + , max_limit_for_ann_queries(context->getSettings().max_limit_for_ann_queries) , index_is_useful(checkQueryStructure(query_info)) {} diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp index 72dd92ead5e..0a2df639b69 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp @@ -210,7 +210,7 @@ MergeTreeIndexConditionAnnoy::MergeTreeIndexConditionAnnoy( ContextPtr context) : ann_condition(query, context) , distance_function(distance_function_) - , search_k(context->getSettings().get("annoy_index_search_k_nodes").get()) + , search_k(context->getSettings().annoy_index_search_k_nodes) {} bool MergeTreeIndexConditionAnnoy::mayBeTrueOnGranule(MergeTreeIndexGranulePtr /*idx_granule*/) const From 2c5c0c5c9f82963903a618e7686c39a59899f020 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Tue, 6 Jun 2023 08:14:50 +0000 Subject: [PATCH 0517/1072] Cosmetics: Remove exception path for something checked elsewhere already --- src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp index 0a2df639b69..b15b1bb1a91 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp @@ -229,8 +229,7 @@ std::vector MergeTreeIndexConditionAnnoy::getUsefulRanges(MergeTreeIndex return getUsefulRangesImpl(idx_granule); else if (distance_function == "cosineDistance") return getUsefulRangesImpl(idx_granule); - else - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown distance name. Must be 'L2Distance' or 'cosineDistance'. Got {}", distance_function); + std::unreachable(); } template From 3f77b778e368c1c5e3cd3012d54107193094502b Mon Sep 17 00:00:00 2001 From: Victor Krasnov Date: Tue, 6 Jun 2023 11:42:45 +0300 Subject: [PATCH 0518/1072] Fix runtime bug --- src/IO/parseDateTimeBestEffort.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/IO/parseDateTimeBestEffort.cpp b/src/IO/parseDateTimeBestEffort.cpp index f753e3c0b4e..6bdba251c36 100644 --- a/src/IO/parseDateTimeBestEffort.cpp +++ b/src/IO/parseDateTimeBestEffort.cpp @@ -586,7 +586,7 @@ ReturnType parseDateTimeBestEffortImpl( { time_t now = time(nullptr); UInt16 curr_year = local_time_zone.toYear(now); - year = now < local_time_zone.makeDateTime(year, month, day_of_month, hour, minute, second) ? curr_year - 1 : curr_year; + year = now < local_time_zone.makeDateTime(curr_year, month, day_of_month, hour, minute, second) ? curr_year - 1 : curr_year; } auto is_leap_year = (year % 400 == 0) || (year % 100 != 0 && year % 4 == 0); From 1506545db0f062787ac779f997e30b0e07a55736 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 6 Jun 2023 11:47:29 +0200 Subject: [PATCH 0519/1072] Fix merge conflicts --- src/Storages/StorageAzure.cpp | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/src/Storages/StorageAzure.cpp b/src/Storages/StorageAzure.cpp index 3e29fa43383..1e128bfed66 100644 --- a/src/Storages/StorageAzure.cpp +++ b/src/Storages/StorageAzure.cpp @@ -230,13 +230,10 @@ StorageAzure::Configuration StorageAzure::getConfiguration(ASTs & engine_args, C configuration.blobs_paths = {configuration.blob_path}; - LOG_INFO(&Poco::Logger::get("StorageAzure"), "get_format_from_file = {}", get_format_from_file); -// if (configuration.format == "auto" && get_format_from_file) -// configuration.format = FormatFactory::instance().getFormatFromFileName(configuration.blob_path, true); - - configuration.format = "TSV"; + if (configuration.format == "auto" && get_format_from_file) + configuration.format = FormatFactory::instance().getFormatFromFileName(configuration.blob_path, true); return configuration; } @@ -367,11 +364,6 @@ AzureClientPtr StorageAzure::createClient(StorageAzure::Configuration configurat } } } - auto managed_identity_credential = std::make_shared(); - - result = std::make_unique(configuration.connection_url, managed_identity_credential); - - LOG_INFO(&Poco::Logger::get("StorageAzure"), "createClient account_name & account_key "); } return result; @@ -613,7 +605,7 @@ Pipe StorageAzure::read( size_t num_streams) { if (partition_by && configuration.withWildcard()) - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Reading from a partitioned S3 storage is not implemented yet"); + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Reading from a partitioned Azure storage is not implemented yet"); Pipes pipes; @@ -1079,7 +1071,7 @@ StorageAzureSource::StorageAzureSource( , container(container_) , file_iterator(file_iterator_) , create_reader_pool(CurrentMetrics::ObjectStorageAzureThreads, CurrentMetrics::ObjectStorageAzureThreadsActive, 1) - , create_reader_scheduler(threadPoolCallbackRunner(create_reader_pool, "CreateAzureReader")) + , create_reader_scheduler(threadPoolCallbackRunner(create_reader_pool, "AzureReader")) { reader = createReader(); if (reader) From 5b3cece42eb540f5b05d04faa7fe3de7cd1ccb86 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Tue, 6 Jun 2023 12:23:00 +0200 Subject: [PATCH 0520/1072] Fix shellcheck --- docker/test/stateless/run.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/test/stateless/run.sh b/docker/test/stateless/run.sh index df650b37cc6..c0acb0291a4 100755 --- a/docker/test/stateless/run.sh +++ b/docker/test/stateless/run.sh @@ -15,6 +15,7 @@ dpkg -i package_folder/clickhouse-client_*.deb ln -s /usr/share/clickhouse-test/clickhouse-test /usr/bin/clickhouse-test +# shellcheck disable=SC1091 source /usr/share/clickhouse-test/ci/attach_gdb.lib # install test configs From 8028184e301f58aab5c4674226abbb39c5b8b745 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 6 Jun 2023 12:27:40 +0200 Subject: [PATCH 0521/1072] Fix read --- src/Storages/StorageAzure.cpp | 41 ++----------------- src/Storages/StorageAzure.h | 1 - .../test_storage_azure_blob_storage/test.py | 13 ++++++ 3 files changed, 16 insertions(+), 39 deletions(-) diff --git a/src/Storages/StorageAzure.cpp b/src/Storages/StorageAzure.cpp index 1e128bfed66..2f0029947f5 100644 --- a/src/Storages/StorageAzure.cpp +++ b/src/Storages/StorageAzure.cpp @@ -1096,6 +1096,7 @@ StorageAzureSource::ReaderHolder StorageAzureSource::createReader() return {}; size_t object_size = info.size_bytes != 0 ? info.size_bytes : object_storage->getObjectMetadata(current_key).size_bytes; + LOG_DEBUG(log, "SIZE {}", object_size); auto compression_method = chooseCompressionMethod(current_key, compression_hint); auto read_buf = createAzureReadBuffer(current_key, object_size); @@ -1104,6 +1105,7 @@ StorageAzureSource::ReaderHolder StorageAzureSource::createReader() format_settings, std::nullopt, std::nullopt, /* is_remote_fs */ true, compression_method); + LOG_DEBUG(log, "FORMAT {}", format); QueryPipelineBuilder builder; builder.init(Pipe(input_format)); @@ -1138,50 +1140,13 @@ std::unique_ptr StorageAzureSource::createAzureReadBuffer(const Stri if (object_too_small && read_settings.remote_fs_method == RemoteFSReadMethod::threadpool) { LOG_TRACE(log, "Downloading object of size {} from S3 with initial prefetch", object_size); - return createAsyncAzureReadBuffer(key, read_settings, object_size); + return object_storage->readObjects({StoredObject(key)}, read_settings, {}, object_size); } return object_storage->readObject(StoredObject(key), read_settings, {}, object_size); } -std::unique_ptr StorageAzureSource::createAsyncAzureReadBuffer( - const String & key, const ReadSettings & read_settings, size_t object_size) -{ - auto context = getContext(); - auto read_buffer_creator = - [this, read_settings, object_size] - (const std::string & path, size_t read_until_position) -> std::unique_ptr - { - auto buffer = object_storage->readObject(StoredObject(path), read_settings, {}, object_size); - buffer->setReadUntilPosition(read_until_position); - return buffer; - }; - - auto s3_impl = std::make_unique( - std::move(read_buffer_creator), - StoredObjects{StoredObject{key, object_size}}, - read_settings, - /* cache_log */nullptr); - - auto modified_settings{read_settings}; - /// FIXME: Changing this setting to default value breaks something around parquet reading - modified_settings.remote_read_min_bytes_for_seek = modified_settings.remote_fs_buffer_size; - - auto & pool_reader = context->getThreadPoolReader(FilesystemReaderType::ASYNCHRONOUS_REMOTE_FS_READER); - - auto async_reader = std::make_unique( - std::move(s3_impl), pool_reader, modified_settings, - context->getAsyncReadCounters(), context->getFilesystemReadPrefetchesLog()); - - async_reader->setReadUntilEnd(); - if (read_settings.remote_fs_prefetch) - async_reader->prefetch(DEFAULT_PREFETCH_PRIORITY); - - return async_reader; -} - - } #endif diff --git a/src/Storages/StorageAzure.h b/src/Storages/StorageAzure.h index 168015cf5d9..f114184c336 100644 --- a/src/Storages/StorageAzure.h +++ b/src/Storages/StorageAzure.h @@ -274,7 +274,6 @@ private: std::future createReaderAsync(); std::unique_ptr createAzureReadBuffer(const String & key, size_t object_size); - std::unique_ptr createAsyncAzureReadBuffer(const String & key, const ReadSettings & read_settings, size_t object_size); }; } diff --git a/tests/integration/test_storage_azure_blob_storage/test.py b/tests/integration/test_storage_azure_blob_storage/test.py index 6a3c915cdc5..8fa87c4e286 100644 --- a/tests/integration/test_storage_azure_blob_storage/test.py +++ b/tests/integration/test_storage_azure_blob_storage/test.py @@ -188,3 +188,16 @@ def test_truncate(cluster): azure_query(node, "TRUNCATE TABLE test_truncate") with pytest.raises(Exception): print(get_azure_file_content("test_truncate.csv")) + + +def test_simple_read_write(cluster): + node = cluster.instances["node"] + azure_query( + node, + "CREATE TABLE test_simple_read_write (key UInt64, data String) Engine = Azure(azure_conf2, container='cont', blob_path='test_simple_read_write.csv', format='CSV')", + ) + + azure_query(node, "INSERT INTO test_simple_read_write VALUES (1, 'a')") + assert get_azure_file_content("test_simple_read_write.csv") == '1,"a"\n' + + print(azure_query(node, "SELECT * FROM test_simple_read_write")) From 3d99abee436b95fd16d8d4011283d12fb9aebb5f Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 6 Jun 2023 12:40:28 +0200 Subject: [PATCH 0522/1072] Remove async reads --- src/Storages/StorageAzure.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/Storages/StorageAzure.cpp b/src/Storages/StorageAzure.cpp index 2f0029947f5..804476a1842 100644 --- a/src/Storages/StorageAzure.cpp +++ b/src/Storages/StorageAzure.cpp @@ -1131,17 +1131,17 @@ std::unique_ptr StorageAzureSource::createAzureReadBuffer(const Stri { auto read_settings = getContext()->getReadSettings().adjustBufferSize(object_size); read_settings.enable_filesystem_cache = false; - auto download_buffer_size = getContext()->getSettings().max_download_buffer_size; - const bool object_too_small = object_size <= 2 * download_buffer_size; + //auto download_buffer_size = getContext()->getSettings().max_download_buffer_size; + //const bool object_too_small = object_size <= 2 * download_buffer_size; // Create a read buffer that will prefetch the first ~1 MB of the file. // When reading lots of tiny files, this prefetching almost doubles the throughput. // For bigger files, parallel reading is more useful. - if (object_too_small && read_settings.remote_fs_method == RemoteFSReadMethod::threadpool) - { - LOG_TRACE(log, "Downloading object of size {} from S3 with initial prefetch", object_size); - return object_storage->readObjects({StoredObject(key)}, read_settings, {}, object_size); - } + //if (object_too_small && read_settings.remote_fs_method == RemoteFSReadMethod::threadpool) + //{ + // LOG_TRACE(log, "Downloading object {} of size {} from S3 with initial prefetch", key, object_size); + // return object_storage->readObjects({StoredObject(key)}, read_settings, {}, object_size); + //} return object_storage->readObject(StoredObject(key), read_settings, {}, object_size); } From 8d044e8880a588a7854a6f6e2513226a6335ff48 Mon Sep 17 00:00:00 2001 From: avogar Date: Tue, 6 Jun 2023 10:57:13 +0000 Subject: [PATCH 0523/1072] Increase max array size in group bitmap --- src/AggregateFunctions/AggregateFunctionGroupBitmapData.h | 2 +- tests/queries/0_stateless/02782_bitmap_overflow.reference | 0 tests/queries/0_stateless/02782_bitmap_overflow.sql | 2 ++ 3 files changed, 3 insertions(+), 1 deletion(-) create mode 100644 tests/queries/0_stateless/02782_bitmap_overflow.reference create mode 100644 tests/queries/0_stateless/02782_bitmap_overflow.sql diff --git a/src/AggregateFunctions/AggregateFunctionGroupBitmapData.h b/src/AggregateFunctions/AggregateFunctionGroupBitmapData.h index d99f0bf16ee..7ea1ebe7749 100644 --- a/src/AggregateFunctions/AggregateFunctionGroupBitmapData.h +++ b/src/AggregateFunctions/AggregateFunctionGroupBitmapData.h @@ -122,7 +122,7 @@ public: size_t size; readVarUInt(size, in); - static constexpr size_t max_size = 1_GiB; + static constexpr size_t max_size = 100_GiB; if (size == 0) throw Exception(ErrorCodes::INCORRECT_DATA, "Incorrect size (0) in groupBitmap."); diff --git a/tests/queries/0_stateless/02782_bitmap_overflow.reference b/tests/queries/0_stateless/02782_bitmap_overflow.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02782_bitmap_overflow.sql b/tests/queries/0_stateless/02782_bitmap_overflow.sql new file mode 100644 index 00000000000..656a3e7c144 --- /dev/null +++ b/tests/queries/0_stateless/02782_bitmap_overflow.sql @@ -0,0 +1,2 @@ +select unhex('0181808080908380808000')::AggregateFunction(groupBitmap, UInt64); -- {serverError TOO_LARGE_ARRAY_SIZE} + From d497562a07b64434dc0c6b892e666776c51fa693 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 6 Jun 2023 13:01:21 +0200 Subject: [PATCH 0524/1072] Copy and paste --- .../IO/ReadBufferFromAzureBlobStorage.cpp | 60 +++++++++++++++++-- src/Disks/IO/ReadBufferFromAzureBlobStorage.h | 9 +++ src/Storages/StorageAzure.cpp | 2 +- .../test_storage_azure_blob_storage/test.py | 24 +++++++- 4 files changed, 89 insertions(+), 6 deletions(-) diff --git a/src/Disks/IO/ReadBufferFromAzureBlobStorage.cpp b/src/Disks/IO/ReadBufferFromAzureBlobStorage.cpp index 0f197c2ff06..a086eb0a6df 100644 --- a/src/Disks/IO/ReadBufferFromAzureBlobStorage.cpp +++ b/src/Disks/IO/ReadBufferFromAzureBlobStorage.cpp @@ -35,6 +35,7 @@ ReadBufferFromAzureBlobStorage::ReadBufferFromAzureBlobStorage( size_t max_single_read_retries_, size_t max_single_download_retries_, bool use_external_buffer_, + bool restricted_seek_, size_t read_until_position_) : ReadBufferFromFileBase(use_external_buffer_ ? 0 : read_settings_.remote_fs_buffer_size, nullptr, 0) , blob_container_client(blob_container_client_) @@ -44,6 +45,7 @@ ReadBufferFromAzureBlobStorage::ReadBufferFromAzureBlobStorage( , read_settings(read_settings_) , tmp_buffer_size(read_settings.remote_fs_buffer_size) , use_external_buffer(use_external_buffer_) + , restricted_seek(restricted_seek_) , read_until_position(read_until_position_) { if (!use_external_buffer) @@ -118,8 +120,17 @@ bool ReadBufferFromAzureBlobStorage::nextImpl() off_t ReadBufferFromAzureBlobStorage::seek(off_t offset_, int whence) { - if (initialized) - throw Exception(ErrorCodes::CANNOT_SEEK_THROUGH_FILE, "Seek is allowed only before first read attempt from the buffer."); + if (offset_ == getPosition() && whence == SEEK_SET) + return offset_; + + if (initialized && restricted_seek) + { + throw Exception( + ErrorCodes::CANNOT_SEEK_THROUGH_FILE, + "Seek is allowed only before first read attempt from the buffer (current offset: " + "{}, new offset: {}, reading until position: {}, available: {})", + getPosition(), offset_, read_until_position, available()); + } if (whence != SEEK_SET) throw Exception(ErrorCodes::CANNOT_SEEK_THROUGH_FILE, "Only SEEK_SET mode is allowed."); @@ -127,8 +138,36 @@ off_t ReadBufferFromAzureBlobStorage::seek(off_t offset_, int whence) if (offset_ < 0) throw Exception(ErrorCodes::SEEK_POSITION_OUT_OF_BOUND, "Seek position is out of bounds. Offset: {}", offset_); - offset = offset_; + if (!restricted_seek) + { + if (!working_buffer.empty() + && static_cast(offset_) >= offset - working_buffer.size() + && offset_ < offset) + { + pos = working_buffer.end() - (offset - offset_); + assert(pos >= working_buffer.begin()); + assert(pos < working_buffer.end()); + return getPosition(); + } + + off_t position = getPosition(); + if (initialized && offset_ > position) + { + size_t diff = offset_ - position; + if (diff < read_settings.remote_read_min_bytes_for_seek) + { + ignore(diff); + return offset_; + } + } + + resetWorkingBuffer(); + if (initialized) + initialized = false; + } + + offset = offset_; return offset; } @@ -152,7 +191,8 @@ void ReadBufferFromAzureBlobStorage::initialize() download_options.Range = {static_cast(offset), length}; - blob_client = std::make_unique(blob_container_client->GetBlobClient(path)); + if (!blob_client) + blob_client = std::make_unique(blob_container_client->GetBlobClient(path)); size_t sleep_time_with_backoff_milliseconds = 100; for (size_t i = 0; i < max_single_download_retries; ++i) @@ -182,6 +222,18 @@ void ReadBufferFromAzureBlobStorage::initialize() initialized = true; } +size_t ReadBufferFromAzureBlobStorage::getFileSize() +{ + if (!blob_client) + blob_client = std::make_unique(blob_container_client->GetBlobClient(path)); + + if (file_size.has_value()) + return *file_size; + + file_size = blob_client->GetProperties().Value.BlobSize; + return *file_size; +} + } #endif diff --git a/src/Disks/IO/ReadBufferFromAzureBlobStorage.h b/src/Disks/IO/ReadBufferFromAzureBlobStorage.h index 6164a005773..599ecba1dd1 100644 --- a/src/Disks/IO/ReadBufferFromAzureBlobStorage.h +++ b/src/Disks/IO/ReadBufferFromAzureBlobStorage.h @@ -24,6 +24,7 @@ public: size_t max_single_read_retries_, size_t max_single_download_retries_, bool use_external_buffer_ = false, + bool restricted_seek_ = false, size_t read_until_position_ = 0); off_t seek(off_t off, int whence) override; @@ -40,6 +41,8 @@ public: bool supportsRightBoundedReads() const override { return true; } + size_t getFileSize() override; + private: void initialize(); @@ -55,6 +58,12 @@ private: std::vector tmp_buffer; size_t tmp_buffer_size; bool use_external_buffer; + + /// There is different seek policy for disk seek and for non-disk seek + /// (non-disk seek is applied for seekable input formats: orc, arrow, parquet). + bool restricted_seek; + + off_t read_until_position = 0; off_t offset = 0; diff --git a/src/Storages/StorageAzure.cpp b/src/Storages/StorageAzure.cpp index 804476a1842..d83dc90e6ed 100644 --- a/src/Storages/StorageAzure.cpp +++ b/src/Storages/StorageAzure.cpp @@ -453,7 +453,7 @@ void StorageAzure::truncate(const ASTPtr &, const StorageMetadataPtr &, ContextP for (const auto & key : configuration.blobs_paths) objects.emplace_back(key); - object_storage->removeObjects(objects); + object_storage->removeObjectsIfExist(objects); } namespace diff --git a/tests/integration/test_storage_azure_blob_storage/test.py b/tests/integration/test_storage_azure_blob_storage/test.py index 8fa87c4e286..bcf5d068057 100644 --- a/tests/integration/test_storage_azure_blob_storage/test.py +++ b/tests/integration/test_storage_azure_blob_storage/test.py @@ -199,5 +199,27 @@ def test_simple_read_write(cluster): azure_query(node, "INSERT INTO test_simple_read_write VALUES (1, 'a')") assert get_azure_file_content("test_simple_read_write.csv") == '1,"a"\n' - print(azure_query(node, "SELECT * FROM test_simple_read_write")) + assert azure_query(node, "SELECT * FROM test_simple_read_write") == "1\ta\n" + + +def test_create_new_files_on_insert(cluster): + + node = cluster.instances["node"] + + azure_query(node, f"create table test_multiple_inserts(a Int32, b String) ENGINE = Azure(azure_conf2, container='cont', blob_path='test_parquet', format='Parquet')") + azure_query(node, "truncate table test_multiple_inserts") + azure_query(node, + f"insert into test_multiple_inserts select number, randomString(100) from numbers(10) settings azure_truncate_on_insert=1" + ) + azure_query(node, + f"insert into test_multiple_inserts select number, randomString(100) from numbers(20) settings azure_create_new_file_on_insert=1" + ) + azure_query(node, + f"insert into test_multiple_inserts select number, randomString(100) from numbers(30) settings azure_create_new_file_on_insert=1" + ) + + result = azure_query(node, f"select count() from test_multiple_inserts") + assert int(result) == 60 + + azure_query(node, f"drop table test_multiple_inserts") From cf886d8ced474a1a80a985587af251a39701d1b2 Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Tue, 6 Jun 2023 11:08:21 +0000 Subject: [PATCH 0525/1072] Remove IsConvertible() --- src/Functions/DateTimeTransforms.h | 46 +++++++++++++++-------------- src/Functions/FunctionsConversion.h | 41 ------------------------- 2 files changed, 24 insertions(+), 63 deletions(-) diff --git a/src/Functions/DateTimeTransforms.h b/src/Functions/DateTimeTransforms.h index d154dd9ffa2..823272e0324 100644 --- a/src/Functions/DateTimeTransforms.h +++ b/src/Functions/DateTimeTransforms.h @@ -1444,31 +1444,33 @@ struct Transformer for (size_t i = 0; i < size; ++i) { - constexpr bool transformHasIsConvertible = requires(const Transform& t) + if constexpr (std::is_same_v + || std::is_same_v) { - t.IsConvertible(vec_from[i], time_zone); - }; + bool check_range_result = true; - if constexpr (transformHasIsConvertible) - { - if constexpr (std::is_same_v - || std::is_same_v) + if constexpr (std::is_same_v) { - bool checked = transform.IsConvertible(vec_from[i], time_zone); - if (!checked) + check_range_result = vec_from[i] >= 0 && vec_from[i] <= DATE_LUT_MAX_DAY_NUM; + } + else if constexpr (std::is_same_v) + { + check_range_result = vec_from[i] >= 0 && vec_from[i] <= 0xFFFFFFFFL; + } + + if (!check_range_result) + { + if (std::is_same_v) { - if (std::is_same_v) - { - vec_to[i] = 0; - if (vec_null_map_to) - (*vec_null_map_to)[i] = true; - continue; - } - else - { - throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, "Value in column {} cannot be safely converted into type {}", - TypeName, TypeName); - } + vec_to[i] = 0; + if (vec_null_map_to) + (*vec_null_map_to)[i] = true; + continue; + } + else + { + throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, "Value in column {} cannot be safely converted into type {}", + TypeName, TypeName); } } } @@ -1488,7 +1490,7 @@ struct DateTimeTransformImpl static ColumnPtr execute( const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t /*input_rows_count*/, const Transform & transform = {}) { - using Op = Transformer; + using Op = Transformer; const ColumnPtr source_col = arguments[0].column; if (const auto * sources = checkAndGetColumn(source_col.get())) diff --git a/src/Functions/FunctionsConversion.h b/src/Functions/FunctionsConversion.h index 6aa5843ff65..3a8ddcc9094 100644 --- a/src/Functions/FunctionsConversion.h +++ b/src/Functions/FunctionsConversion.h @@ -365,22 +365,11 @@ template struct ConvertImpl -static bool CheckDateRange(const FromType & value) -{ - return value >= 0 && value <= DATE_LUT_MAX_DAY_NUM; -} - template struct ToDateTransform32Or64 { static constexpr auto name = "toDate"; - static NO_SANITIZE_UNDEFINED bool IsConvertible(const FromType & from, const DateLUTImpl &) - { - return CheckDateRange(from); - } - static NO_SANITIZE_UNDEFINED ToType execute(const FromType & from, const DateLUTImpl & time_zone) { // since converting to Date, no need in values outside of default LUT range. @@ -395,11 +384,6 @@ struct ToDateTransform32Or64Signed { static constexpr auto name = "toDate"; - static NO_SANITIZE_UNDEFINED bool IsConvertible(const FromType & from, const DateLUTImpl &) - { - return CheckDateRange(from); - } - static NO_SANITIZE_UNDEFINED ToType execute(const FromType & from, const DateLUTImpl & time_zone) { // TODO: decide narrow or extended range based on FromType @@ -417,11 +401,6 @@ struct ToDateTransform8Or16Signed { static constexpr auto name = "toDate"; - static NO_SANITIZE_UNDEFINED bool IsConvertible(const FromType & from, const DateLUTImpl &) - { - return CheckDateRange(from); - } - static NO_SANITIZE_UNDEFINED ToType execute(const FromType & from, const DateLUTImpl &) { if (from < 0) @@ -518,22 +497,12 @@ template struct ConvertImpl struct ConvertImpl : DateTimeTransformImpl> {}; -template -static bool CheckDateTimeRange(const FromType & value) -{ - return value >= 0 && value <= 0xFFFFFFFFL; -} template struct ToDateTimeTransform64 { static constexpr auto name = "toDateTime"; - static NO_SANITIZE_UNDEFINED bool IsConvertible(const FromType & from, const DateLUTImpl &) - { - return CheckDateTimeRange(from); - } - static NO_SANITIZE_UNDEFINED ToType execute(const FromType & from, const DateLUTImpl &) { return static_cast(std::min(time_t(from), time_t(0xFFFFFFFF))); @@ -545,11 +514,6 @@ struct ToDateTimeTransformSigned { static constexpr auto name = "toDateTime"; - static NO_SANITIZE_UNDEFINED bool IsConvertible(const FromType & from, const DateLUTImpl &) - { - return CheckDateTimeRange(from); - } - static NO_SANITIZE_UNDEFINED ToType execute(const FromType & from, const DateLUTImpl &) { if (from < 0) @@ -563,11 +527,6 @@ struct ToDateTimeTransform64Signed { static constexpr auto name = "toDateTime"; - static NO_SANITIZE_UNDEFINED bool IsConvertible(const FromType & from, const DateLUTImpl &) - { - return CheckDateTimeRange(from); - } - static NO_SANITIZE_UNDEFINED ToType execute(const FromType & from, const DateLUTImpl &) { if (from < 0) From 18decb090ca6f85826dcddde2d61b53fa460ee4c Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Tue, 6 Jun 2023 11:11:50 +0000 Subject: [PATCH 0526/1072] Automatic style fix --- .../test_storage_azure_blob_storage/test.py | 21 ++++++++++++------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/tests/integration/test_storage_azure_blob_storage/test.py b/tests/integration/test_storage_azure_blob_storage/test.py index bcf5d068057..7e66b6fb198 100644 --- a/tests/integration/test_storage_azure_blob_storage/test.py +++ b/tests/integration/test_storage_azure_blob_storage/test.py @@ -204,19 +204,24 @@ def test_simple_read_write(cluster): def test_create_new_files_on_insert(cluster): - node = cluster.instances["node"] - azure_query(node, f"create table test_multiple_inserts(a Int32, b String) ENGINE = Azure(azure_conf2, container='cont', blob_path='test_parquet', format='Parquet')") + azure_query( + node, + f"create table test_multiple_inserts(a Int32, b String) ENGINE = Azure(azure_conf2, container='cont', blob_path='test_parquet', format='Parquet')", + ) azure_query(node, "truncate table test_multiple_inserts") - azure_query(node, - f"insert into test_multiple_inserts select number, randomString(100) from numbers(10) settings azure_truncate_on_insert=1" + azure_query( + node, + f"insert into test_multiple_inserts select number, randomString(100) from numbers(10) settings azure_truncate_on_insert=1", ) - azure_query(node, - f"insert into test_multiple_inserts select number, randomString(100) from numbers(20) settings azure_create_new_file_on_insert=1" + azure_query( + node, + f"insert into test_multiple_inserts select number, randomString(100) from numbers(20) settings azure_create_new_file_on_insert=1", ) - azure_query(node, - f"insert into test_multiple_inserts select number, randomString(100) from numbers(30) settings azure_create_new_file_on_insert=1" + azure_query( + node, + f"insert into test_multiple_inserts select number, randomString(100) from numbers(30) settings azure_create_new_file_on_insert=1", ) result = azure_query(node, f"select count() from test_multiple_inserts") From 6b41a02f7fbdd46ee2774651fea3518d26844407 Mon Sep 17 00:00:00 2001 From: Han Fei Date: Tue, 6 Jun 2023 13:14:52 +0200 Subject: [PATCH 0527/1072] resolve tests --- src/Client/Suggest.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/Client/Suggest.cpp b/src/Client/Suggest.cpp index 4ffa828dd40..e249aa1bb04 100644 --- a/src/Client/Suggest.cpp +++ b/src/Client/Suggest.cpp @@ -101,7 +101,9 @@ static String getLoadSuggestionQuery(Int32 suggestion_limit, bool basic_suggesti add_column("name", "columns", true, suggestion_limit); } - query = "SELECT DISTINCT arrayJoin(extractAll(name, '[\\\\w_]{2,}')) AS res FROM (" + query + ") WHERE notEmpty(res)"; + /// FIXME: Forbid this query using new analyzer because of bug https://github.com/ClickHouse/ClickHouse/pull/50430#issuecomment-1576860893 + /// We should remove this restriction after resolving this bug. + query = "SELECT DISTINCT arrayJoin(extractAll(name, '[\\\\w_]{2,}')) AS res FROM (" + query + ") WHERE notEmpty(res) SETTINGS allow_experimental_analyzer=0"; return query; } From e87348010d3e77d60b8ccd85e7bd4574bec9600b Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Tue, 6 Jun 2023 14:42:56 +0200 Subject: [PATCH 0528/1072] Rework loading and removing of data parts for MergeTree tables. (#49474) Co-authored-by: Sergei Trifonov --- programs/local/LocalServer.cpp | 24 +- programs/server/Server.cpp | 53 +++- src/Backups/BackupIO_S3.cpp | 8 +- src/Common/CurrentMetrics.cpp | 2 + src/Core/ServerSettings.h | 4 +- src/Formats/FormatFactory.cpp | 2 +- src/IO/SharedThreadPools.cpp | 151 ++++++---- src/IO/SharedThreadPools.h | 74 +++-- src/Interpreters/threadPoolCallbackRunner.h | 3 + src/Storages/MergeTree/MergeTreeData.cpp | 284 +++++++----------- src/Storages/MergeTree/MergeTreeData.h | 6 +- src/Storages/MergeTree/MergeTreeSettings.h | 5 +- src/Storages/MergeTree/MergeTreeSource.cpp | 2 +- src/Storages/StorageS3.cpp | 2 +- .../System/StorageSystemDetachedParts.cpp | 2 +- .../00988_parallel_parts_removal.sql | 2 +- .../00989_parallel_parts_loading.sql | 2 +- .../01810_max_part_removal_threads_long.sh | 11 +- .../02432_s3_parallel_parts_cleanup.sql | 5 +- 19 files changed, 352 insertions(+), 290 deletions(-) diff --git a/programs/local/LocalServer.cpp b/programs/local/LocalServer.cpp index 96c1ca261b5..caca7cfb50d 100644 --- a/programs/local/LocalServer.cpp +++ b/programs/local/LocalServer.cpp @@ -130,15 +130,31 @@ void LocalServer::initialize(Poco::Util::Application & self) }); #endif - IOThreadPool::initialize( + getIOThreadPool().initialize( config().getUInt("max_io_thread_pool_size", 100), config().getUInt("max_io_thread_pool_free_size", 0), config().getUInt("io_thread_pool_queue_size", 10000)); - OutdatedPartsLoadingThreadPool::initialize( - config().getUInt("max_outdated_parts_loading_thread_pool_size", 16), + + const size_t active_parts_loading_threads = config().getUInt("max_active_parts_loading_thread_pool_size", 64); + getActivePartsLoadingThreadPool().initialize( + active_parts_loading_threads, 0, // We don't need any threads one all the parts will be loaded - config().getUInt("max_outdated_parts_loading_thread_pool_size", 16)); + active_parts_loading_threads); + + const size_t outdated_parts_loading_threads = config().getUInt("max_outdated_parts_loading_thread_pool_size", 32); + getOutdatedPartsLoadingThreadPool().initialize( + outdated_parts_loading_threads, + 0, // We don't need any threads one all the parts will be loaded + outdated_parts_loading_threads); + + getOutdatedPartsLoadingThreadPool().setMaxTurboThreads(active_parts_loading_threads); + + const size_t cleanup_threads = config().getUInt("max_parts_cleaning_thread_pool_size", 128); + getPartsCleaningThreadPool().initialize( + cleanup_threads, + 0, // We don't need any threads one all the parts will be deleted + cleanup_threads); } diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index 9eb3e6c9ebc..d0fc8aca5e8 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -683,21 +683,36 @@ try }); #endif - IOThreadPool::initialize( + getIOThreadPool().initialize( server_settings.max_io_thread_pool_size, server_settings.max_io_thread_pool_free_size, server_settings.io_thread_pool_queue_size); - BackupsIOThreadPool::initialize( + getBackupsIOThreadPool().initialize( server_settings.max_backups_io_thread_pool_size, server_settings.max_backups_io_thread_pool_free_size, server_settings.backups_io_thread_pool_queue_size); - OutdatedPartsLoadingThreadPool::initialize( + getActivePartsLoadingThreadPool().initialize( + server_settings.max_active_parts_loading_thread_pool_size, + 0, // We don't need any threads once all the parts will be loaded + server_settings.max_active_parts_loading_thread_pool_size); + + getOutdatedPartsLoadingThreadPool().initialize( server_settings.max_outdated_parts_loading_thread_pool_size, - 0, // We don't need any threads one all the parts will be loaded + 0, // We don't need any threads once all the parts will be loaded server_settings.max_outdated_parts_loading_thread_pool_size); + /// It could grow if we need to synchronously wait until all the data parts will be loaded. + getOutdatedPartsLoadingThreadPool().setMaxTurboThreads( + server_settings.max_active_parts_loading_thread_pool_size + ); + + getPartsCleaningThreadPool().initialize( + server_settings.max_parts_cleaning_thread_pool_size, + 0, // We don't need any threads one all the parts will be deleted + server_settings.max_parts_cleaning_thread_pool_size); + /// Initialize global local cache for remote filesystem. if (config().has("local_cache_for_remote_fs")) { @@ -1226,6 +1241,36 @@ try global_context->getMessageBrokerSchedulePool().increaseThreadsCount(server_settings_.background_message_broker_schedule_pool_size); global_context->getDistributedSchedulePool().increaseThreadsCount(server_settings_.background_distributed_schedule_pool_size); + getIOThreadPool().reloadConfiguration( + server_settings.max_io_thread_pool_size, + server_settings.max_io_thread_pool_free_size, + server_settings.io_thread_pool_queue_size); + + getBackupsIOThreadPool().reloadConfiguration( + server_settings.max_backups_io_thread_pool_size, + server_settings.max_backups_io_thread_pool_free_size, + server_settings.backups_io_thread_pool_queue_size); + + getActivePartsLoadingThreadPool().reloadConfiguration( + server_settings.max_active_parts_loading_thread_pool_size, + 0, // We don't need any threads once all the parts will be loaded + server_settings.max_active_parts_loading_thread_pool_size); + + getOutdatedPartsLoadingThreadPool().reloadConfiguration( + server_settings.max_outdated_parts_loading_thread_pool_size, + 0, // We don't need any threads once all the parts will be loaded + server_settings.max_outdated_parts_loading_thread_pool_size); + + /// It could grow if we need to synchronously wait until all the data parts will be loaded. + getOutdatedPartsLoadingThreadPool().setMaxTurboThreads( + server_settings.max_active_parts_loading_thread_pool_size + ); + + getPartsCleaningThreadPool().reloadConfiguration( + server_settings.max_parts_cleaning_thread_pool_size, + 0, // We don't need any threads one all the parts will be deleted + server_settings.max_parts_cleaning_thread_pool_size); + if (config->has("resources")) { global_context->getResourceManager()->updateConfiguration(*config); diff --git a/src/Backups/BackupIO_S3.cpp b/src/Backups/BackupIO_S3.cpp index f1fd276e34b..967beba4bf5 100644 --- a/src/Backups/BackupIO_S3.cpp +++ b/src/Backups/BackupIO_S3.cpp @@ -161,7 +161,7 @@ void BackupReaderS3::copyFileToDisk(const String & path_in_backup, size_t file_s /* dest_key= */ blob_path[0], request_settings, object_attributes, - threadPoolCallbackRunner(BackupsIOThreadPool::get(), "BackupReaderS3"), + threadPoolCallbackRunner(getBackupsIOThreadPool().get(), "BackupReaderS3"), /* for_disk_s3= */ true); return file_size; @@ -212,7 +212,7 @@ void BackupWriterS3::copyFileFromDisk(const String & path_in_backup, DiskPtr src fs::path(s3_uri.key) / path_in_backup, request_settings, {}, - threadPoolCallbackRunner(BackupsIOThreadPool::get(), "BackupWriterS3")); + threadPoolCallbackRunner(getBackupsIOThreadPool().get(), "BackupWriterS3")); return; /// copied! } } @@ -224,7 +224,7 @@ void BackupWriterS3::copyFileFromDisk(const String & path_in_backup, DiskPtr src void BackupWriterS3::copyDataToFile(const String & path_in_backup, const CreateReadBufferFunction & create_read_buffer, UInt64 start_pos, UInt64 length) { copyDataToS3File(create_read_buffer, start_pos, length, client, s3_uri.bucket, fs::path(s3_uri.key) / path_in_backup, request_settings, {}, - threadPoolCallbackRunner(BackupsIOThreadPool::get(), "BackupWriterS3")); + threadPoolCallbackRunner(getBackupsIOThreadPool().get(), "BackupWriterS3")); } BackupWriterS3::~BackupWriterS3() = default; @@ -258,7 +258,7 @@ std::unique_ptr BackupWriterS3::writeFile(const String & file_name) DBMS_DEFAULT_BUFFER_SIZE, request_settings, std::nullopt, - threadPoolCallbackRunner(BackupsIOThreadPool::get(), "BackupWriterS3"), + threadPoolCallbackRunner(getBackupsIOThreadPool().get(), "BackupWriterS3"), write_settings); } diff --git a/src/Common/CurrentMetrics.cpp b/src/Common/CurrentMetrics.cpp index 956487a300e..61725d079bf 100644 --- a/src/Common/CurrentMetrics.cpp +++ b/src/Common/CurrentMetrics.cpp @@ -137,6 +137,8 @@ M(ObjectStorageAzureThreadsActive, "Number of threads in the AzureObjectStorage thread pool running a task.") \ M(MergeTreePartsLoaderThreads, "Number of threads in the MergeTree parts loader thread pool.") \ M(MergeTreePartsLoaderThreadsActive, "Number of threads in the MergeTree parts loader thread pool running a task.") \ + M(MergeTreeOutdatedPartsLoaderThreads, "Number of threads in the threadpool for loading Outdated data parts.") \ + M(MergeTreeOutdatedPartsLoaderThreadsActive, "Number of active threads in the threadpool for loading Outdated data parts.") \ M(MergeTreePartsCleanerThreads, "Number of threads in the MergeTree parts cleaner thread pool.") \ M(MergeTreePartsCleanerThreadsActive, "Number of threads in the MergeTree parts cleaner thread pool running a task.") \ M(SystemReplicasThreads, "Number of threads in the system.replicas thread pool.") \ diff --git a/src/Core/ServerSettings.h b/src/Core/ServerSettings.h index cb43d62ecd1..1a9f226041b 100644 --- a/src/Core/ServerSettings.h +++ b/src/Core/ServerSettings.h @@ -21,7 +21,9 @@ namespace DB M(UInt64, max_io_thread_pool_size, 100, "The maximum number of threads that would be used for IO operations", 0) \ M(UInt64, max_io_thread_pool_free_size, 0, "Max free size for IO thread pool.", 0) \ M(UInt64, io_thread_pool_queue_size, 10000, "Queue size for IO thread pool.", 0) \ - M(UInt64, max_outdated_parts_loading_thread_pool_size, 32, "The maximum number of threads that would be used for loading outdated data parts on startup", 0) \ + M(UInt64, max_active_parts_loading_thread_pool_size, 64, "The number of threads to load active set of data parts (Active ones) at startup.", 0) \ + M(UInt64, max_outdated_parts_loading_thread_pool_size, 32, "The number of threads to load inactive set of data parts (Outdated ones) at startup.", 0) \ + M(UInt64, max_parts_cleaning_thread_pool_size, 128, "The number of threads for concurrent removal of inactive data parts.", 0) \ M(UInt64, max_replicated_fetches_network_bandwidth_for_server, 0, "The maximum speed of data exchange over the network in bytes per second for replicated fetches. Zero means unlimited.", 0) \ M(UInt64, max_replicated_sends_network_bandwidth_for_server, 0, "The maximum speed of data exchange over the network in bytes per second for replicated sends. Zero means unlimited.", 0) \ M(UInt64, max_remote_read_network_bandwidth_for_server, 0, "The maximum speed of data exchange over the network in bytes per second for read. Zero means unlimited.", 0) \ diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index 6f2974c49c6..39b28e025a6 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -364,7 +364,7 @@ std::unique_ptr FormatFactory::wrapReadBufferIfNeeded( settings.max_download_buffer_size); res = wrapInParallelReadBufferIfSupported( - buf, threadPoolCallbackRunner(IOThreadPool::get(), "ParallelRead"), + buf, threadPoolCallbackRunner(getIOThreadPool().get(), "ParallelRead"), max_download_threads, settings.max_download_buffer_size, file_size); } diff --git a/src/IO/SharedThreadPools.cpp b/src/IO/SharedThreadPools.cpp index b7b6aea1567..6a0e953f0ef 100644 --- a/src/IO/SharedThreadPools.cpp +++ b/src/IO/SharedThreadPools.cpp @@ -9,8 +9,12 @@ namespace CurrentMetrics extern const Metric IOThreadsActive; extern const Metric BackupsIOThreads; extern const Metric BackupsIOThreadsActive; - extern const Metric OutdatedPartsLoadingThreads; - extern const Metric OutdatedPartsLoadingThreadsActive; + extern const Metric MergeTreePartsLoaderThreads; + extern const Metric MergeTreePartsLoaderThreadsActive; + extern const Metric MergeTreePartsCleanerThreads; + extern const Metric MergeTreePartsCleanerThreadsActive; + extern const Metric MergeTreeOutdatedPartsLoaderThreads; + extern const Metric MergeTreeOutdatedPartsLoaderThreadsActive; } namespace DB @@ -21,88 +25,117 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } -std::unique_ptr IOThreadPool::instance; -void IOThreadPool::initialize(size_t max_threads, size_t max_free_threads, size_t queue_size) +StaticThreadPool::StaticThreadPool( + const String & name_, + CurrentMetrics::Metric threads_metric_, + CurrentMetrics::Metric threads_active_metric_) + : name(name_) + , threads_metric(threads_metric_) + , threads_active_metric(threads_active_metric_) +{ +} + +void StaticThreadPool::initialize(size_t max_threads, size_t max_free_threads, size_t queue_size) { if (instance) - { - throw Exception(ErrorCodes::LOGICAL_ERROR, "The IO thread pool is initialized twice"); - } + throw Exception(ErrorCodes::LOGICAL_ERROR, "The {} is initialized twice", name); + /// By default enabling "turbo mode" won't affect the number of threads anyhow + max_threads_turbo = max_threads; + max_threads_normal = max_threads; instance = std::make_unique( - CurrentMetrics::IOThreads, - CurrentMetrics::IOThreadsActive, + threads_metric, + threads_active_metric, max_threads, max_free_threads, queue_size, /* shutdown_on_exception= */ false); } -ThreadPool & IOThreadPool::get() +void StaticThreadPool::reloadConfiguration(size_t max_threads, size_t max_free_threads, size_t queue_size) { if (!instance) - { - throw Exception(ErrorCodes::LOGICAL_ERROR, "The IO thread pool is not initialized"); - } + throw Exception(ErrorCodes::LOGICAL_ERROR, "The {} is not initialized", name); + + instance->setMaxThreads(turbo_mode_enabled > 0 ? max_threads_turbo : max_threads); + instance->setMaxFreeThreads(max_free_threads); + instance->setQueueSize(queue_size); +} + + +ThreadPool & StaticThreadPool::get() +{ + if (!instance) + throw Exception(ErrorCodes::LOGICAL_ERROR, "The {} is not initialized", name); return *instance; } -std::unique_ptr BackupsIOThreadPool::instance; - -void BackupsIOThreadPool::initialize(size_t max_threads, size_t max_free_threads, size_t queue_size) -{ - if (instance) - { - throw Exception(ErrorCodes::LOGICAL_ERROR, "The BackupsIO thread pool is initialized twice"); - } - - instance = std::make_unique( - CurrentMetrics::BackupsIOThreads, - CurrentMetrics::BackupsIOThreadsActive, - max_threads, - max_free_threads, - queue_size, - /* shutdown_on_exception= */ false); -} - -ThreadPool & BackupsIOThreadPool::get() +void StaticThreadPool::enableTurboMode() { if (!instance) - { - throw Exception(ErrorCodes::LOGICAL_ERROR, "The BackupsIO thread pool is not initialized"); - } + throw Exception(ErrorCodes::LOGICAL_ERROR, "The {} is not initialized", name); - return *instance; + std::lock_guard lock(mutex); + + ++turbo_mode_enabled; + if (turbo_mode_enabled == 1) + instance->setMaxThreads(max_threads_turbo); } -std::unique_ptr OutdatedPartsLoadingThreadPool::instance; - -void OutdatedPartsLoadingThreadPool::initialize(size_t max_threads, size_t max_free_threads, size_t queue_size) -{ - if (instance) - { - throw Exception(ErrorCodes::LOGICAL_ERROR, "The PartsLoadingThreadPool thread pool is initialized twice"); - } - - instance = std::make_unique( - CurrentMetrics::OutdatedPartsLoadingThreads, - CurrentMetrics::OutdatedPartsLoadingThreadsActive, - max_threads, - max_free_threads, - queue_size, - /* shutdown_on_exception= */ false); -} - -ThreadPool & OutdatedPartsLoadingThreadPool::get() +void StaticThreadPool::disableTurboMode() { if (!instance) - { - throw Exception(ErrorCodes::LOGICAL_ERROR, "The PartsLoadingThreadPool thread pool is not initialized"); - } + throw Exception(ErrorCodes::LOGICAL_ERROR, "The {} is not initialized", name); - return *instance; + std::lock_guard lock(mutex); + + --turbo_mode_enabled; + if (turbo_mode_enabled == 0) + instance->setMaxThreads(max_threads_normal); +} + +void StaticThreadPool::setMaxTurboThreads(size_t max_threads_turbo_) +{ + if (!instance) + throw Exception(ErrorCodes::LOGICAL_ERROR, "The {} is not initialized", name); + + std::lock_guard lock(mutex); + + max_threads_turbo = max_threads_turbo_; + if (turbo_mode_enabled > 0) + instance->setMaxThreads(max_threads_turbo); +} + +StaticThreadPool & getIOThreadPool() +{ + static StaticThreadPool instance("IOThreadPool", CurrentMetrics::IOThreads, CurrentMetrics::IOThreadsActive); + return instance; +} + +StaticThreadPool & getBackupsIOThreadPool() +{ + static StaticThreadPool instance("BackupsIOThreadPool", CurrentMetrics::BackupsIOThreads, CurrentMetrics::BackupsIOThreadsActive); + return instance; +} + +StaticThreadPool & getActivePartsLoadingThreadPool() +{ + static StaticThreadPool instance("MergeTreePartsLoaderThreadPool", CurrentMetrics::MergeTreePartsLoaderThreads, CurrentMetrics::MergeTreePartsLoaderThreadsActive); + return instance; +} + +StaticThreadPool & getPartsCleaningThreadPool() +{ + static StaticThreadPool instance("MergeTreePartsCleanerThreadPool", CurrentMetrics::MergeTreePartsCleanerThreads, CurrentMetrics::MergeTreePartsCleanerThreadsActive); + return instance; +} + +StaticThreadPool & getOutdatedPartsLoadingThreadPool() +{ + static StaticThreadPool instance("MergeTreeOutdatedPartsLoaderThreadPool", CurrentMetrics::MergeTreeOutdatedPartsLoaderThreads, CurrentMetrics::MergeTreeOutdatedPartsLoaderThreadsActive); + return instance; } } diff --git a/src/IO/SharedThreadPools.h b/src/IO/SharedThreadPools.h index 1b43dfe778c..188a2a4f003 100644 --- a/src/IO/SharedThreadPools.h +++ b/src/IO/SharedThreadPools.h @@ -1,48 +1,64 @@ #pragma once +#include #include +#include + #include #include +#include namespace DB { -/* - * ThreadPool used for the IO. - */ -class IOThreadPool +class StaticThreadPool { - static std::unique_ptr instance; - public: - static void initialize(size_t max_threads, size_t max_free_threads, size_t queue_size); - static ThreadPool & get(); + StaticThreadPool( + const String & name_, + CurrentMetrics::Metric threads_metric_, + CurrentMetrics::Metric threads_active_metric_); + + ThreadPool & get(); + + void initialize(size_t max_threads, size_t max_free_threads, size_t queue_size); + void reloadConfiguration(size_t max_threads, size_t max_free_threads, size_t queue_size); + + /// At runtime we can increase the number of threads up the specified limit + /// This is needed to utilize as much a possible resources to accomplish some task. + void setMaxTurboThreads(size_t max_threads_turbo_); + void enableTurboMode(); + void disableTurboMode(); + +private: + const String name; + const CurrentMetrics::Metric threads_metric; + const CurrentMetrics::Metric threads_active_metric; + + std::unique_ptr instance; + std::mutex mutex; + size_t max_threads_turbo = 0; + size_t max_threads_normal = 0; + /// If this counter is > 0 - this specific mode is enabled + size_t turbo_mode_enabled = 0; }; +/// ThreadPool used for the IO. +StaticThreadPool & getIOThreadPool(); -/* - * ThreadPool used for the Backup IO. - */ -class BackupsIOThreadPool -{ - static std::unique_ptr instance; +/// ThreadPool used for the Backup IO. +StaticThreadPool & getBackupsIOThreadPool(); -public: - static void initialize(size_t max_threads, size_t max_free_threads, size_t queue_size); - static ThreadPool & get(); -}; +/// ThreadPool used for the loading of Outdated data parts for MergeTree tables. +StaticThreadPool & getActivePartsLoadingThreadPool(); +/// ThreadPool used for deleting data parts for MergeTree tables. +StaticThreadPool & getPartsCleaningThreadPool(); -/* - * ThreadPool used for the loading of Outdated data parts for MergeTree tables. - */ -class OutdatedPartsLoadingThreadPool -{ - static std::unique_ptr instance; - -public: - static void initialize(size_t max_threads, size_t max_free_threads, size_t queue_size); - static ThreadPool & get(); -}; +/// This ThreadPool is used for the loading of Outdated data parts for MergeTree tables. +/// Normally we will just load Outdated data parts concurrently in background, but in +/// case when we need to synchronously wait for the loading to be finished, we can increase +/// the number of threads by calling enableTurboMode() :-) +StaticThreadPool & getOutdatedPartsLoadingThreadPool(); } diff --git a/src/Interpreters/threadPoolCallbackRunner.h b/src/Interpreters/threadPoolCallbackRunner.h index f7324bfafe6..eb90b61cf31 100644 --- a/src/Interpreters/threadPoolCallbackRunner.h +++ b/src/Interpreters/threadPoolCallbackRunner.h @@ -44,6 +44,9 @@ ThreadPoolCallbackRunner threadPoolCallbackRunner(ThreadPool & auto future = task->get_future(); + /// ThreadPool is using "bigger is higher priority" instead of "smaller is more priority". + /// Note: calling method scheduleOrThrowOnError in intentional, because we don't want to throw exceptions + /// in critical places where this callback runner is used (e.g. loading or deletion of parts) my_pool->scheduleOrThrowOnError([my_task = std::move(task)]{ (*my_task)(); }, priority); return future; diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 32665429051..e806e1bb93f 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -130,10 +130,6 @@ namespace ProfileEvents namespace CurrentMetrics { extern const Metric DelayedInserts; - extern const Metric MergeTreePartsLoaderThreads; - extern const Metric MergeTreePartsLoaderThreadsActive; - extern const Metric MergeTreePartsCleanerThreads; - extern const Metric MergeTreePartsCleanerThreadsActive; } @@ -1425,71 +1421,17 @@ MergeTreeData::LoadPartResult MergeTreeData::loadDataPartWithRetries( UNREACHABLE(); } -std::vector MergeTreeData::loadDataPartsFromDisk( - ThreadPool & pool, - size_t num_parts, - std::queue & parts_queue, - const MergeTreeSettingsPtr & settings) +std::vector MergeTreeData::loadDataPartsFromDisk(PartLoadingTreeNodes & parts_to_load) { - /// Parallel loading of data parts. - pool.setMaxThreads(std::min(static_cast(settings->max_part_loading_threads), num_parts)); - size_t num_threads = pool.getMaxThreads(); - LOG_DEBUG(log, "Going to use {} threads to load parts", num_threads); + const size_t num_parts = parts_to_load.size(); - std::vector parts_per_thread(num_threads, num_parts / num_threads); - for (size_t i = 0ul; i < num_parts % num_threads; ++i) - ++parts_per_thread[i]; + LOG_DEBUG(log, "Will load {} number of parts using {} threads", num_parts, getActivePartsLoadingThreadPool().get().getMaxThreads()); - /// Prepare data parts for parallel loading. Threads will focus on given disk first, then steal - /// others' tasks when finish current disk part loading process. - std::vector threads_parts(num_threads); - std::set remaining_thread_parts; - std::queue threads_queue; + /// Shuffle all the parts randomly to possible speed up loading them from JBOD. + std::shuffle(parts_to_load.begin(), parts_to_load.end(), thread_local_rng); - for (size_t i = 0; i < num_threads; ++i) - { - remaining_thread_parts.insert(i); - threads_queue.push(i); - } - - while (!parts_queue.empty()) - { - assert(!threads_queue.empty()); - size_t i = threads_queue.front(); - auto & need_parts = parts_per_thread[i]; - assert(need_parts > 0); - - auto & thread_parts = threads_parts[i]; - auto & current_parts = parts_queue.front(); - assert(!current_parts.empty()); - - auto parts_to_grab = std::min(need_parts, current_parts.size()); - thread_parts.insert(thread_parts.end(), current_parts.end() - parts_to_grab, current_parts.end()); - current_parts.resize(current_parts.size() - parts_to_grab); - need_parts -= parts_to_grab; - - /// Before processing next thread, change disk if possible. - /// Different threads will likely start loading parts from different disk, - /// which may improve read parallelism for JBOD. - - /// If current disk still has some parts, push it to the tail. - if (!current_parts.empty()) - parts_queue.push(std::move(current_parts)); - - parts_queue.pop(); - - /// If current thread still want some parts, push it to the tail. - if (need_parts > 0) - threads_queue.push(i); - - threads_queue.pop(); - } - - assert(threads_queue.empty()); - assert(std::all_of(threads_parts.begin(), threads_parts.end(), [](const auto & parts) - { - return !parts.empty(); - })); + auto runner = threadPoolCallbackRunner(getActivePartsLoadingThreadPool().get(), "ActiveParts"); + std::vector> parts_futures; std::mutex part_select_mutex; std::mutex part_loading_mutex; @@ -1498,81 +1440,77 @@ std::vector MergeTreeData::loadDataPartsFromDisk( try { - for (size_t thread = 0; thread < num_threads; ++thread) + while (true) { - pool.scheduleOrThrowOnError([&, thread, thread_group = CurrentThread::getGroup()] + bool are_parts_to_load_empty = false; { - SCOPE_EXIT_SAFE( - if (thread_group) - CurrentThread::detachFromGroupIfNotDetached(); - ); - if (thread_group) - CurrentThread::attachToGroupIfDetached(thread_group); + std::lock_guard lock(part_select_mutex); + are_parts_to_load_empty = parts_to_load.empty(); + } - while (true) + if (are_parts_to_load_empty) + { + /// Wait for all scheduled tasks. + /// We have to use .get() method to rethrow any exception that could occur. + for (auto & future: parts_futures) + future.get(); + parts_futures.clear(); + /// At this point it is possible, that some other parts appeared in the queue for processing (parts_to_load), + /// because we added them from inside the pool. + /// So we need to recheck it. + } + + PartLoadingTree::NodePtr current_part; + { + std::lock_guard lock(part_select_mutex); + if (parts_to_load.empty()) + break; + + current_part = parts_to_load.back(); + parts_to_load.pop_back(); + } + + parts_futures.push_back(runner( + [&, part = std::move(current_part)]() { - PartLoadingTree::NodePtr thread_part; - size_t thread_idx = thread; - - { - std::lock_guard lock{part_select_mutex}; - - if (remaining_thread_parts.empty()) - return; - - /// Steal task if nothing to do - if (threads_parts[thread].empty()) - { - // Try random steal tasks from the next thread - std::uniform_int_distribution distribution(0, remaining_thread_parts.size() - 1); - auto it = remaining_thread_parts.begin(); - std::advance(it, distribution(thread_local_rng)); - thread_idx = *it; - } - - auto & thread_parts = threads_parts[thread_idx]; - thread_part = thread_parts.back(); - thread_parts.pop_back(); - if (thread_parts.empty()) - remaining_thread_parts.erase(thread_idx); - } - /// Pass a separate mutex to guard the set of parts, because this lambda /// is called concurrently but with already locked @data_parts_mutex. auto res = loadDataPartWithRetries( - thread_part->info, thread_part->name, thread_part->disk, + part->info, part->name, part->disk, DataPartState::Active, part_loading_mutex, loading_parts_initial_backoff_ms, loading_parts_max_backoff_ms, loading_parts_max_tries); - thread_part->is_loaded = true; + part->is_loaded = true; bool is_active_part = res.part->getState() == DataPartState::Active; /// If part is broken or duplicate or should be removed according to transaction /// and it has any covered parts then try to load them to replace this part. - if (!is_active_part && !thread_part->children.empty()) + if (!is_active_part && !part->children.empty()) { std::lock_guard lock{part_select_mutex}; - for (const auto & [_, node] : thread_part->children) - threads_parts[thread].push_back(node); - remaining_thread_parts.insert(thread); + for (const auto & [_, node] : part->children) + parts_to_load.push_back(node); } { std::lock_guard lock(part_loading_mutex); loaded_parts.push_back(std::move(res)); } - } - }); + }, Priority{0})); } } catch (...) { - /// If this is not done, then in case of an exception, tasks will be destroyed before the threads are completed, and it will be bad. - pool.wait(); + /// Wait for all scheduled tasks + /// A future becomes invalid after .get() call + /// + .wait() method is used not to throw any exception here. + for (auto & future: parts_futures) + if (future.valid()) + future.wait(); + throw; } - pool.wait(); return loaded_parts; } @@ -1679,9 +1617,12 @@ void MergeTreeData::loadDataParts(bool skip_sanity_checks) } } - ThreadPool pool(CurrentMetrics::MergeTreePartsLoaderThreads, CurrentMetrics::MergeTreePartsLoaderThreadsActive, disks.size()); + auto runner = threadPoolCallbackRunner(getActivePartsLoadingThreadPool().get(), "ActiveParts"); std::vector parts_to_load_by_disk(disks.size()); + std::vector> disks_futures; + disks_futures.reserve(disks.size()); + for (size_t i = 0; i < disks.size(); ++i) { const auto & disk_ptr = disks[i]; @@ -1690,7 +1631,7 @@ void MergeTreeData::loadDataParts(bool skip_sanity_checks) auto & disk_parts = parts_to_load_by_disk[i]; - pool.scheduleOrThrowOnError([&, disk_ptr]() + disks_futures.push_back(runner([&, disk_ptr]() { for (auto it = disk_ptr->iterateDirectory(relative_data_path); it->isValid(); it->next()) { @@ -1703,38 +1644,31 @@ void MergeTreeData::loadDataParts(bool skip_sanity_checks) if (auto part_info = MergeTreePartInfo::tryParsePartName(it->name(), format_version)) disk_parts.emplace_back(*part_info, it->name(), disk_ptr); } - }); + }, Priority{0})); } - pool.wait(); + /// For iteration to be completed + /// Any exception will be re-thrown. + for (auto & future : disks_futures) + future.get(); + disks_futures.clear(); PartLoadingTree::PartLoadingInfos parts_to_load; for (auto & disk_parts : parts_to_load_by_disk) std::move(disk_parts.begin(), disk_parts.end(), std::back_inserter(parts_to_load)); auto loading_tree = PartLoadingTree::build(std::move(parts_to_load)); - /// Collect parts by disks' names. - std::map disk_part_map; + + size_t num_parts = 0; + PartLoadingTreeNodes active_parts; /// Collect only "the most covering" parts from the top level of the tree. loading_tree.traverse(/*recursive=*/ false, [&](const auto & node) { - disk_part_map[node->disk->getName()].emplace_back(node); + active_parts.emplace_back(node); }); - size_t num_parts = 0; - std::queue parts_queue; - - for (auto & [disk_name, disk_parts] : disk_part_map) - { - LOG_INFO(log, "Found {} parts for disk '{}' to load", disk_parts.size(), disk_name); - - if (disk_parts.empty()) - continue; - - num_parts += disk_parts.size(); - parts_queue.push(std::move(disk_parts)); - } + num_parts += active_parts.size(); auto part_lock = lockParts(); LOG_TEST(log, "loadDataParts: clearing data_parts_indexes (had {} parts)", data_parts_indexes.size()); @@ -1754,7 +1688,7 @@ void MergeTreeData::loadDataParts(bool skip_sanity_checks) if (num_parts > 0) { - auto loaded_parts = loadDataPartsFromDisk(pool, num_parts, parts_queue, settings); + auto loaded_parts = loadDataPartsFromDisk(active_parts); for (const auto & res : loaded_parts) { @@ -1783,10 +1717,12 @@ void MergeTreeData::loadDataParts(bool skip_sanity_checks) if (settings->in_memory_parts_enable_wal) { - pool.setMaxThreads(disks.size()); std::vector disks_wal_parts(disks.size()); std::mutex wal_init_lock; + std::vector> wal_disks_futures; + wal_disks_futures.reserve(disks.size()); + for (size_t i = 0; i < disks.size(); ++i) { const auto & disk_ptr = disks[i]; @@ -1795,7 +1731,7 @@ void MergeTreeData::loadDataParts(bool skip_sanity_checks) auto & disk_wal_parts = disks_wal_parts[i]; - pool.scheduleOrThrowOnError([&, disk_ptr]() + wal_disks_futures.push_back(runner([&, disk_ptr]() { for (auto it = disk_ptr->iterateDirectory(relative_data_path); it->isValid(); it->next()) { @@ -1821,10 +1757,14 @@ void MergeTreeData::loadDataParts(bool skip_sanity_checks) disk_wal_parts.push_back(std::move(part)); } } - }); + }, Priority{0})); } - pool.wait(); + /// For for iteration to be completed + /// Any exception will be re-thrown. + for (auto & future : wal_disks_futures) + future.get(); + wal_disks_futures.clear(); MutableDataPartsVector parts_from_wal; for (auto & disk_wal_parts : disks_wal_parts) @@ -1925,7 +1865,7 @@ try std::atomic_size_t num_loaded_parts = 0; - auto runner = threadPoolCallbackRunner(OutdatedPartsLoadingThreadPool::get(), "OutdatedParts"); + auto runner = threadPoolCallbackRunner(getOutdatedPartsLoadingThreadPool().get(), "OutdatedParts"); std::vector> parts_futures; while (true) @@ -1938,8 +1878,10 @@ try if (is_async && outdated_data_parts_loading_canceled) { /// Wait for every scheduled task + /// In case of any exception it will be re-thrown and server will be terminated. for (auto & future : parts_futures) - future.wait(); + future.get(); + parts_futures.clear(); LOG_DEBUG(log, "Stopped loading outdated data parts because task was canceled. " @@ -1973,7 +1915,7 @@ try /// Wait for every scheduled task for (auto & future : parts_futures) - future.wait(); + future.get(); LOG_DEBUG(log, "Loaded {} outdated data parts {}", num_loaded_parts, is_async ? "asynchronously" : "synchronously"); @@ -1999,6 +1941,13 @@ void MergeTreeData::waitForOutdatedPartsToBeLoaded() const TSA_NO_THREAD_SAFETY_ if (isStaticStorage()) return; + /// We need to load parts as fast as possible + getOutdatedPartsLoadingThreadPool().enableTurboMode(); + SCOPE_EXIT({ + /// Let's lower the number of threads e.g. for later ATTACH queries to behave as usual + getOutdatedPartsLoadingThreadPool().disableTurboMode(); + }); + LOG_TRACE(log, "Will wait for outdated data parts to be loaded"); std::unique_lock lock(outdated_data_parts_mutex); @@ -2420,20 +2369,15 @@ void MergeTreeData::clearPartsFromFilesystemImpl(const DataPartsVector & parts_t } }; - if (settings->max_part_removal_threads <= 1 || parts_to_remove.size() <= settings->concurrent_part_removal_threshold) + if (parts_to_remove.size() <= settings->concurrent_part_removal_threshold) { remove_single_thread(); return; } /// Parallel parts removal. - size_t num_threads = settings->max_part_removal_threads; - if (!num_threads) - num_threads = getNumberOfPhysicalCPUCores() * 2; - num_threads = std::min(num_threads, parts_to_remove.size()); std::mutex part_names_mutex; - ThreadPool pool(CurrentMetrics::MergeTreePartsCleanerThreads, CurrentMetrics::MergeTreePartsCleanerThreadsActive, - num_threads, num_threads, /* unlimited queue size */ 0); + auto runner = threadPoolCallbackRunner(getPartsCleaningThreadPool().get(), "PartsCleaning"); /// This flag disallow straightforward concurrent parts removal. It's required only in case /// when we have parts on zero-copy disk + at least some of them were mutated. @@ -2453,27 +2397,27 @@ void MergeTreeData::clearPartsFromFilesystemImpl(const DataPartsVector & parts_t LOG_DEBUG( log, "Removing {} parts from filesystem (concurrently): Parts: [{}]", parts_to_remove.size(), fmt::join(parts_to_remove, ", ")); + std::vector> parts_to_remove_futures; + parts_to_remove_futures.reserve(parts_to_remove.size()); + for (const DataPartPtr & part : parts_to_remove) { - pool.scheduleOrThrowOnError([&part, &part_names_mutex, part_names_succeed, thread_group = CurrentThread::getGroup()] + parts_to_remove_futures.push_back(runner([&part, &part_names_mutex, part_names_succeed, thread_group = CurrentThread::getGroup()] { - SCOPE_EXIT_SAFE( - if (thread_group) - CurrentThread::detachFromGroupIfNotDetached(); - ); - if (thread_group) - CurrentThread::attachToGroupIfDetached(thread_group); - asMutableDeletingPart(part)->remove(); if (part_names_succeed) { std::lock_guard lock(part_names_mutex); part_names_succeed->insert(part->name); } - }); + }, Priority{0})); } - pool.wait(); + /// Any exception will be re-thrown. + for (auto & future : parts_to_remove_futures) + future.get(); + parts_to_remove_futures.clear(); + return; } @@ -2544,20 +2488,15 @@ void MergeTreeData::clearPartsFromFilesystemImpl(const DataPartsVector & parts_t return independent_ranges; }; - auto schedule_parts_removal = [this, &pool, &part_names_mutex, part_names_succeed]( + std::vector> part_removal_futures; + + auto schedule_parts_removal = [this, &runner, &part_names_mutex, part_names_succeed, &part_removal_futures]( const MergeTreePartInfo & range, DataPartsVector && parts_in_range) { /// Below, range should be captured by copy to avoid use-after-scope on exception from pool - pool.scheduleOrThrowOnError( - [this, range, &part_names_mutex, part_names_succeed, thread_group = CurrentThread::getGroup(), batch = std::move(parts_in_range)] + part_removal_futures.push_back(runner( + [this, range, &part_names_mutex, part_names_succeed, batch = std::move(parts_in_range)] { - SCOPE_EXIT_SAFE( - if (thread_group) - CurrentThread::detachFromGroupIfNotDetached(); - ); - if (thread_group) - CurrentThread::attachToGroupIfDetached(thread_group); - LOG_TRACE(log, "Removing {} parts in blocks range {}", batch.size(), range.getPartNameForLogs()); for (const auto & part : batch) @@ -2569,7 +2508,7 @@ void MergeTreeData::clearPartsFromFilesystemImpl(const DataPartsVector & parts_t part_names_succeed->insert(part->name); } } - }); + }, Priority{0})); }; RemovalRanges independent_ranges = split_into_independent_ranges(parts_to_remove, /* split_times */ 0); @@ -2632,7 +2571,11 @@ void MergeTreeData::clearPartsFromFilesystemImpl(const DataPartsVector & parts_t LOG_TRACE(log, "Will remove {} big parts separately: {}", excluded_parts.size(), fmt::join(excluded_parts, ", ")); independent_ranges = split_into_independent_ranges(excluded_parts, /* split_times */ 0); - pool.wait(); + + /// Any exception will be re-thrown. + for (auto & future : part_removal_futures) + future.get(); + part_removal_futures.clear(); for (size_t i = 0; i < independent_ranges.infos.size(); ++i) { @@ -2641,7 +2584,10 @@ void MergeTreeData::clearPartsFromFilesystemImpl(const DataPartsVector & parts_t schedule_parts_removal(range, std::move(parts_in_range)); } - pool.wait(); + /// Any exception will be re-thrown. + for (auto & future : part_removal_futures) + future.get(); + part_removal_futures.clear(); if (parts_to_remove.size() != sum_of_ranges + excluded_parts.size()) throw Exception(ErrorCodes::LOGICAL_ERROR, diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index 1c41de6fa19..2f254f9a787 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -1519,11 +1519,7 @@ private: size_t max_backoff_ms, size_t max_tries); - std::vector loadDataPartsFromDisk( - ThreadPool & pool, - size_t num_parts, - std::queue & parts_queue, - const MergeTreeSettingsPtr & settings); + std::vector loadDataPartsFromDisk(PartLoadingTreeNodes & parts_to_load); void loadDataPartsFromWAL(MutableDataPartsVector & parts_from_wal); diff --git a/src/Storages/MergeTree/MergeTreeSettings.h b/src/Storages/MergeTree/MergeTreeSettings.h index 5ea99009756..33aea358078 100644 --- a/src/Storages/MergeTree/MergeTreeSettings.h +++ b/src/Storages/MergeTree/MergeTreeSettings.h @@ -143,8 +143,6 @@ struct Settings; M(Bool, ttl_only_drop_parts, false, "Only drop altogether the expired parts and not partially prune them.", 0) \ M(Bool, materialize_ttl_recalculate_only, false, "Only recalculate ttl info when MATERIALIZE TTL", 0) \ M(Bool, enable_mixed_granularity_parts, true, "Enable parts with adaptive and non adaptive granularity", 0) \ - M(MaxThreads, max_part_loading_threads, 0, "The number of threads to load data parts at startup.", 0) \ - M(MaxThreads, max_part_removal_threads, 0, "The number of threads for concurrent removal of inactive data parts. One is usually enough, but in 'Google Compute Environment SSD Persistent Disks' file removal (unlink) operation is extraordinarily slow and you probably have to increase this number (recommended is up to 16).", 0) \ M(UInt64, concurrent_part_removal_threshold, 100, "Activate concurrent part removal (see 'max_part_removal_threads') only if the number of inactive data parts is at least this.", 0) \ M(UInt64, zero_copy_concurrent_part_removal_max_split_times, 5, "Max recursion depth for splitting independent Outdated parts ranges into smaller subranges (highly not recommended to change)", 0) \ M(Float, zero_copy_concurrent_part_removal_max_postpone_ratio, static_cast(0.05), "Max percentage of top level parts to postpone removal in order to get smaller independent ranges (highly not recommended to change)", 0) \ @@ -192,6 +190,9 @@ struct Settings; M(UInt64, write_ahead_log_bytes_to_fsync, 100ULL * 1024 * 1024, "Obsolete setting, does nothing.", 0) \ M(UInt64, write_ahead_log_interval_ms_to_fsync, 100, "Obsolete setting, does nothing.", 0) \ M(Bool, in_memory_parts_insert_sync, false, "Obsolete setting, does nothing.", 0) \ + M(MaxThreads, max_part_loading_threads, 0, "Obsolete setting, does nothing.", 0) \ + M(MaxThreads, max_part_removal_threads, 0, "Obsolete setting, does nothing.", 0) \ + /// Settings that should not change after the creation of a table. /// NOLINTNEXTLINE #define APPLY_FOR_IMMUTABLE_MERGE_TREE_SETTINGS(M) \ diff --git a/src/Storages/MergeTree/MergeTreeSource.cpp b/src/Storages/MergeTree/MergeTreeSource.cpp index b65f044a13b..69fbdd5a64d 100644 --- a/src/Storages/MergeTree/MergeTreeSource.cpp +++ b/src/Storages/MergeTree/MergeTreeSource.cpp @@ -105,7 +105,7 @@ struct MergeTreeSource::AsyncReadingState AsyncReadingState() { control = std::make_shared(); - callback_runner = threadPoolCallbackRunner(IOThreadPool::get(), "MergeTreeRead"); + callback_runner = threadPoolCallbackRunner(getIOThreadPool().get(), "MergeTreeRead"); } ~AsyncReadingState() diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index 2d8aaec0f07..f1a7bcb71a2 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -766,7 +766,7 @@ public: DBMS_DEFAULT_BUFFER_SIZE, configuration_.request_settings, std::nullopt, - threadPoolCallbackRunner(IOThreadPool::get(), "S3ParallelWrite"), + threadPoolCallbackRunner(getIOThreadPool().get(), "S3ParallelWrite"), context->getWriteSettings()), compression_method, 3); diff --git a/src/Storages/System/StorageSystemDetachedParts.cpp b/src/Storages/System/StorageSystemDetachedParts.cpp index 9f80b994051..97af4094e42 100644 --- a/src/Storages/System/StorageSystemDetachedParts.cpp +++ b/src/Storages/System/StorageSystemDetachedParts.cpp @@ -194,7 +194,7 @@ private: futures.push_back( scheduleFromThreadPool( std::move(worker), - IOThreadPool::get(), + getIOThreadPool().get(), "DP_BytesOnDisk")); } diff --git a/tests/queries/0_stateless/00988_parallel_parts_removal.sql b/tests/queries/0_stateless/00988_parallel_parts_removal.sql index bff9bbe6d8d..8f79276782b 100644 --- a/tests/queries/0_stateless/00988_parallel_parts_removal.sql +++ b/tests/queries/0_stateless/00988_parallel_parts_removal.sql @@ -1,6 +1,6 @@ DROP TABLE IF EXISTS mt; -CREATE TABLE mt (x UInt64) ENGINE = MergeTree ORDER BY x SETTINGS max_part_removal_threads = 16, cleanup_delay_period = 1, cleanup_delay_period_random_add = 0, old_parts_lifetime = 1, parts_to_delay_insert = 100000, parts_to_throw_insert = 100000; +CREATE TABLE mt (x UInt64) ENGINE = MergeTree ORDER BY x SETTINGS cleanup_delay_period = 1, cleanup_delay_period_random_add = 0, old_parts_lifetime = 1, parts_to_delay_insert = 100000, parts_to_throw_insert = 100000; SYSTEM STOP MERGES mt; diff --git a/tests/queries/0_stateless/00989_parallel_parts_loading.sql b/tests/queries/0_stateless/00989_parallel_parts_loading.sql index 13cd56e1924..a05515cf756 100644 --- a/tests/queries/0_stateless/00989_parallel_parts_loading.sql +++ b/tests/queries/0_stateless/00989_parallel_parts_loading.sql @@ -2,7 +2,7 @@ DROP TABLE IF EXISTS mt; -CREATE TABLE mt (x UInt64) ENGINE = MergeTree ORDER BY x SETTINGS max_part_loading_threads = 16, parts_to_delay_insert = 100000, parts_to_throw_insert = 100000; +CREATE TABLE mt (x UInt64) ENGINE = MergeTree ORDER BY x SETTINGS parts_to_delay_insert = 100000, parts_to_throw_insert = 100000; SYSTEM STOP MERGES mt; diff --git a/tests/queries/0_stateless/01810_max_part_removal_threads_long.sh b/tests/queries/0_stateless/01810_max_part_removal_threads_long.sh index f8f49816479..87153a4bd58 100755 --- a/tests/queries/0_stateless/01810_max_part_removal_threads_long.sh +++ b/tests/queries/0_stateless/01810_max_part_removal_threads_long.sh @@ -11,6 +11,9 @@ CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . "$CUR_DIR"/../shell_config.sh +# The number of threads removing data parts should be between 1 and 129. +# Because max_parts_cleaning_thread_pool_size is 128 by default + $CLICKHOUSE_CLIENT --allow_deprecated_database_ordinary=1 -nm -q "create database ordinary_$CLICKHOUSE_DATABASE engine=Ordinary" # MergeTree @@ -22,7 +25,7 @@ $CLICKHOUSE_CLIENT -nm -q """ Engine=MergeTree() order by key partition by key%100 - settings max_part_removal_threads=10, concurrent_part_removal_threshold=99, min_bytes_for_wide_part=0; + settings concurrent_part_removal_threshold=99, min_bytes_for_wide_part=0; insert into data_01810 select * from numbers(100); drop table data_01810 settings log_queries=1; @@ -30,7 +33,7 @@ $CLICKHOUSE_CLIENT -nm -q """ -- sometimes the same thread can be used to remove part, due to ThreadPool, -- hence we cannot compare strictly. - select throwIf(not(length(thread_ids) between 1 and 11)) + select throwIf(not(length(thread_ids) between 1 and 129)) from system.query_log where event_date >= yesterday() and @@ -49,7 +52,7 @@ $CLICKHOUSE_CLIENT -nm -q """ Engine=ReplicatedMergeTree('/clickhouse/tables/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/rep_data_01810', '1') order by key partition by key%100 - settings max_part_removal_threads=10, concurrent_part_removal_threshold=99, min_bytes_for_wide_part=0; + settings concurrent_part_removal_threshold=99, min_bytes_for_wide_part=0; SET insert_keeper_max_retries=1000; SET insert_keeper_retry_max_backoff_ms=10; @@ -60,7 +63,7 @@ $CLICKHOUSE_CLIENT -nm -q """ -- sometimes the same thread can be used to remove part, due to ThreadPool, -- hence we cannot compare strictly. - select throwIf(not(length(thread_ids) between 1 and 11)) + select throwIf(not(length(thread_ids) between 1 and 129)) from system.query_log where event_date >= yesterday() and diff --git a/tests/queries/0_stateless/02432_s3_parallel_parts_cleanup.sql b/tests/queries/0_stateless/02432_s3_parallel_parts_cleanup.sql index 88fb2cdf9b1..5b9342972f4 100644 --- a/tests/queries/0_stateless/02432_s3_parallel_parts_cleanup.sql +++ b/tests/queries/0_stateless/02432_s3_parallel_parts_cleanup.sql @@ -8,7 +8,7 @@ drop table if exists rmt2; -- Disable compact parts, because we need hardlinks in mutations. create table rmt (n int, m int, k int) engine=ReplicatedMergeTree('/test/02432/{database}', '1') order by tuple() settings storage_policy = 's3_cache', allow_remote_fs_zero_copy_replication=1, - max_part_removal_threads=10, concurrent_part_removal_threshold=1, cleanup_delay_period=1, cleanup_delay_period_random_add=1, + concurrent_part_removal_threshold=1, cleanup_delay_period=1, cleanup_delay_period_random_add=1, max_replicated_merges_in_queue=0, max_replicated_mutations_in_queue=0, min_bytes_for_wide_part=0, min_rows_for_wide_part=0; insert into rmt(n, m) values (1, 42); @@ -38,7 +38,7 @@ select count(), sum(n), sum(m) from rmt; -- New table can assign merges/mutations and can remove old parts create table rmt2 (n int, m int, k String) engine=ReplicatedMergeTree('/test/02432/{database}', '2') order by tuple() settings storage_policy = 's3_cache', allow_remote_fs_zero_copy_replication=1, - max_part_removal_threads=10, concurrent_part_removal_threshold=1, cleanup_delay_period=1, cleanup_delay_period_random_add=1, + concurrent_part_removal_threshold=1, cleanup_delay_period=1, cleanup_delay_period_random_add=1, min_bytes_for_wide_part=0, min_rows_for_wide_part=0, max_replicated_merges_in_queue=1, old_parts_lifetime=0; @@ -66,4 +66,3 @@ drop table rmt2; system flush logs; select count() > 0 from system.text_log where yesterday() <= event_date and logger_name like '%' || currentDatabase() || '%' and message like '%Removing % parts from filesystem (concurrently): Parts:%'; select count() > 1, countDistinct(thread_id) > 1 from system.text_log where yesterday() <= event_date and logger_name like '%' || currentDatabase() || '%' and message like '%Removing % parts in blocks range%'; - From 36d298ceef12daf6689ae648b7efdedf2ee83d79 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 6 Jun 2023 14:45:58 +0200 Subject: [PATCH 0529/1072] Fix commit for DiskObjectStorage (#50599) --- src/Disks/ObjectStorages/DiskObjectStorage.cpp | 3 ++- src/Disks/ObjectStorages/DiskObjectStorageTransaction.cpp | 2 -- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/Disks/ObjectStorages/DiskObjectStorage.cpp b/src/Disks/ObjectStorages/DiskObjectStorage.cpp index 129f1ab1ef7..005d115a277 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorage.cpp +++ b/src/Disks/ObjectStorages/DiskObjectStorage.cpp @@ -596,7 +596,8 @@ void DiskObjectStorage::writeFileUsingBlobWritingFunction(const String & path, W { LOG_TEST(log, "Write file: {}", path); auto transaction = createObjectStorageTransaction(); - return transaction->writeFileUsingBlobWritingFunction(path, mode, std::move(write_blob_function)); + transaction->writeFileUsingBlobWritingFunction(path, mode, std::move(write_blob_function)); + transaction->commit(); } void DiskObjectStorage::applyNewSettings( diff --git a/src/Disks/ObjectStorages/DiskObjectStorageTransaction.cpp b/src/Disks/ObjectStorages/DiskObjectStorageTransaction.cpp index 257a6fdf2ea..bd66ada492f 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorageTransaction.cpp +++ b/src/Disks/ObjectStorages/DiskObjectStorageTransaction.cpp @@ -710,8 +710,6 @@ void DiskObjectStorageTransaction::writeFileUsingBlobWritingFunction( metadata_transaction->createMetadataFile(path, blob_name, object_size); else metadata_transaction->addBlobToMetadata(path, blob_name, object_size); - - metadata_transaction->commit(); } From f096bfcad29b95ea02f720f0becd5931c5b7eb37 Mon Sep 17 00:00:00 2001 From: Victor Krasnov Date: Tue, 6 Jun 2023 15:47:34 +0300 Subject: [PATCH 0530/1072] Adjust 00569_parse_date_time_best_effort and 01543_parse_datetime_besteffort_or_null_empty_string to the new feature --- .../00569_parse_date_time_best_effort.reference | 8 ++++---- .../0_stateless/00569_parse_date_time_best_effort.sql | 8 ++++---- ...543_parse_datetime_besteffort_or_null_empty_string.sql | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/queries/0_stateless/00569_parse_date_time_best_effort.reference b/tests/queries/0_stateless/00569_parse_date_time_best_effort.reference index ad7c17b7717..0729a7628f2 100644 --- a/tests/queries/0_stateless/00569_parse_date_time_best_effort.reference +++ b/tests/queries/0_stateless/00569_parse_date_time_best_effort.reference @@ -2,8 +2,8 @@ 0 ᴺᵁᴸᴸ 1970-01-01 00:00:00 0000 ᴺᵁᴸᴸ 1970-01-01 00:00:00 - 00:00:00 2000-01-01 00:00:00 2000-01-01 00:00:00 - 01:00:00 2000-01-01 01:00:00 2000-01-01 01:00:00 + 2000-01-01 00:00:00 2000-01-01 00:00:00 2000-01-01 00:00:00 + 2000-01-01 01:00:00 2000-01-01 01:00:00 2000-01-01 01:00:00 02/01/17 010203 MSK 2017-01-01 22:02:03 2017-01-01 22:02:03 02/01/17 010203 MSK+0100 2017-01-01 21:02:03 2017-01-01 21:02:03 02/01/17 010203 UTC+0300 2017-01-01 22:02:03 2017-01-01 22:02:03 @@ -11,13 +11,13 @@ 02/01/1970 010203Z 1970-01-02 01:02:03 1970-01-02 01:02:03 02/01/70 010203Z 1970-01-02 01:02:03 1970-01-02 01:02:03 11 Feb 2018 06:40:50 +0300 2018-02-11 03:40:50 2018-02-11 03:40:50 - 17 Apr 2 1:2:3 2000-04-17 01:02:03 2000-04-17 01:02:03 + 17 Apr 2000 2 1:2:3 2000-04-17 01:02:03 2000-04-17 01:02:03 19700102 01:00:00 1970-01-02 01:00:00 1970-01-02 01:00:00 1970010201:00:00 ᴺᵁᴸᴸ 1970-01-01 00:00:00 19700102010203 1970-01-02 01:02:03 1970-01-02 01:02:03 19700102010203Z 1970-01-02 01:02:03 1970-01-02 01:02:03 1970/01/02 010203Z 1970-01-02 01:02:03 1970-01-02 01:02:03 - 20 2000-01-20 00:00:00 2000-01-20 00:00:00 + 20 2000 2000-01-20 00:00:00 2000-01-20 00:00:00 201 ᴺᵁᴸᴸ 1970-01-01 00:00:00 20160101 2016-01-01 00:00:00 2016-01-01 00:00:00 2016-01-01 2016-01-01 00:00:00 2016-01-01 00:00:00 diff --git a/tests/queries/0_stateless/00569_parse_date_time_best_effort.sql b/tests/queries/0_stateless/00569_parse_date_time_best_effort.sql index 5f71efa1485..511addb4e4d 100644 --- a/tests/queries/0_stateless/00569_parse_date_time_best_effort.sql +++ b/tests/queries/0_stateless/00569_parse_date_time_best_effort.sql @@ -7,8 +7,8 @@ FROM SELECT arrayJoin([ '0', '0000', -'00:00:00', -'01:00:00', +'2000-01-01 00:00:00', +'2000-01-01 01:00:00', '02/01/17 010203 MSK', '02/01/17 010203 MSK+0100', '02/01/17 010203 UTC+0300', @@ -16,13 +16,13 @@ FROM '02/01/1970 010203Z', '02/01/70 010203Z', '11 Feb 2018 06:40:50 +0300', -'17 Apr 2 1:2:3', +'17 Apr 2000 2 1:2:3', '19700102 01:00:00', '1970010201:00:00', '19700102010203', '19700102010203Z', '1970/01/02 010203Z', -'20', +'20 2000', '201', '20160101', '2016-01-01', diff --git a/tests/queries/0_stateless/01543_parse_datetime_besteffort_or_null_empty_string.sql b/tests/queries/0_stateless/01543_parse_datetime_besteffort_or_null_empty_string.sql index ad14c4ede06..7098028963d 100644 --- a/tests/queries/0_stateless/01543_parse_datetime_besteffort_or_null_empty_string.sql +++ b/tests/queries/0_stateless/01543_parse_datetime_besteffort_or_null_empty_string.sql @@ -4,7 +4,7 @@ SELECT parseDateTimeBestEffortOrNull('2020-01-01 11:01:01 am'); SELECT parseDateTimeBestEffortOrNull('2020-01-01 11:01:01 pm'); SELECT parseDateTimeBestEffortOrNull('2020-01-01 12:01:01 am'); SELECT parseDateTimeBestEffortOrNull('2020-01-01 12:01:01 pm'); -SELECT parseDateTimeBestEffortOrNull('01:01:01'); +SELECT parseDateTimeBestEffortOrNull('2000-01-01 01:01:01'); SELECT parseDateTimeBestEffortOrNull('20100'); SELECT parseDateTimeBestEffortOrNull('0100:0100:0000'); SELECT parseDateTimeBestEffortOrNull('x'); From a1f3bd9e231ade5a88e3bb53dd4c96caa583d835 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 6 Jun 2023 14:55:17 +0200 Subject: [PATCH 0531/1072] Fix reads --- .../AzureBlobStorage/AzureObjectStorage.cpp | 15 +- .../ObjectStorages/ObjectStorageIterator.cpp | 2 +- .../ObjectStorages/ObjectStorageIterator.h | 12 +- .../ObjectStorageIteratorAsync.cpp | 41 +++-- .../ObjectStorageIteratorAsync.h | 8 +- src/Storages/StorageAzure.cpp | 25 ++- .../test_storage_azure_blob_storage/test.py | 145 ++++++++++++++++++ 7 files changed, 216 insertions(+), 32 deletions(-) diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp index 23a0da39dd3..07173e65448 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp @@ -52,15 +52,18 @@ public: options.Prefix = path_prefix; options.PageSizeHint = static_cast(max_list_size); + LOG_DEBUG(&Poco::Logger::get("DEBUG"), "ITER PREFIX {}", path_prefix); } private: bool getBatchAndCheckNext(RelativePathsWithMetadata & batch) override { + batch.clear(); auto outcome = client->ListBlobs(options); auto blob_list_response = client->ListBlobs(options); auto blobs_list = blob_list_response.Blobs; + LOG_DEBUG(&Poco::Logger::get("DEBUG"), "BLOB LIST SIZE {}", blobs_list.size()); for (const auto & blob : blobs_list) { batch.emplace_back( @@ -73,11 +76,15 @@ private: {}}); } - options.ContinuationToken = blob_list_response.NextPageToken; - if (blob_list_response.HasPage()) - return true; + if (!blob_list_response.NextPageToken.HasValue() || blob_list_response.NextPageToken.Value().empty()) + { + LOG_DEBUG(&Poco::Logger::get("DEBUG"), "RETURN FALSE {}", blobs_list.size()); + return false; + } - return false; + options.ContinuationToken = blob_list_response.NextPageToken; + LOG_DEBUG(&Poco::Logger::get("DEBUG"), "RETURN TRUE {}", blobs_list.size()); + return true; } std::shared_ptr client; diff --git a/src/Disks/ObjectStorages/ObjectStorageIterator.cpp b/src/Disks/ObjectStorages/ObjectStorageIterator.cpp index 188b743958c..72ec6e0e500 100644 --- a/src/Disks/ObjectStorages/ObjectStorageIterator.cpp +++ b/src/Disks/ObjectStorages/ObjectStorageIterator.cpp @@ -9,7 +9,7 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } -RelativePathWithMetadata ObjectStorageIteratorFromList::current() const +RelativePathWithMetadata ObjectStorageIteratorFromList::current() { if (!isValid()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Trying to access invalid iterator"); diff --git a/src/Disks/ObjectStorages/ObjectStorageIterator.h b/src/Disks/ObjectStorages/ObjectStorageIterator.h index e562d92e1fb..2ff5ce60acc 100644 --- a/src/Disks/ObjectStorages/ObjectStorageIterator.h +++ b/src/Disks/ObjectStorages/ObjectStorageIterator.h @@ -11,9 +11,9 @@ class IObjectStorageIterator public: virtual void next() = 0; virtual void nextBatch() = 0; - virtual bool isValid() const = 0; - virtual RelativePathWithMetadata current() const = 0; - virtual RelativePathsWithMetadata currentBatch() const = 0; + virtual bool isValid() = 0; + virtual RelativePathWithMetadata current() = 0; + virtual RelativePathsWithMetadata currentBatch() = 0; virtual size_t getAccumulatedSize() const = 0; virtual ~IObjectStorageIterator() = default; @@ -41,14 +41,14 @@ public: batch_iterator = batch.end(); } - bool isValid() const override + bool isValid() override { return batch_iterator != batch.end(); } - RelativePathWithMetadata current() const override; + RelativePathWithMetadata current() override; - RelativePathsWithMetadata currentBatch() const override + RelativePathsWithMetadata currentBatch() override { return batch; } diff --git a/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.cpp b/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.cpp index c97a941f7be..fd6452b7c2a 100644 --- a/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.cpp +++ b/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.cpp @@ -1,5 +1,7 @@ #include +#include + namespace DB { @@ -13,17 +15,26 @@ void IObjectStorageIteratorAsync::nextBatch() std::lock_guard lock(mutex); if (!is_finished) { - if (outcome_future.valid()) + if (!is_initialized) { - BatchAndHasNext next_batch = outcome_future.get(); - current_batch = std::move(next_batch.batch); - accumulated_size.fetch_add(current_batch.size(), std::memory_order_relaxed); - current_batch_iterator = current_batch.begin(); - if (next_batch.has_next) - outcome_future = scheduleBatch(); - else - is_finished = true; + outcome_future = scheduleBatch(); + is_initialized = true; } + + BatchAndHasNext next_batch = outcome_future.get(); + current_batch = std::move(next_batch.batch); + accumulated_size.fetch_add(current_batch.size(), std::memory_order_relaxed); + current_batch_iterator = current_batch.begin(); + LOG_DEBUG(&Poco::Logger::get("DEBUG"), "HAS NEXT {}", next_batch.has_next); + if (next_batch.has_next) + outcome_future = scheduleBatch(); + else + is_finished = true; + } + else + { + current_batch.clear(); + current_batch_iterator = current_batch.begin(); } } @@ -62,12 +73,15 @@ std::future IObjectStorageIterator } -bool IObjectStorageIteratorAsync::isValid() const +bool IObjectStorageIteratorAsync::isValid() { + if (!is_initialized) + nextBatch(); + return current_batch_iterator != current_batch.end(); } -RelativePathWithMetadata IObjectStorageIteratorAsync::current() const +RelativePathWithMetadata IObjectStorageIteratorAsync::current() { if (!isValid()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Trying to access invalid iterator"); @@ -76,9 +90,12 @@ RelativePathWithMetadata IObjectStorageIteratorAsync::current() const } -RelativePathsWithMetadata IObjectStorageIteratorAsync::currentBatch() const +RelativePathsWithMetadata IObjectStorageIteratorAsync::currentBatch() { std::lock_guard lock(mutex); + if (!isValid()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Trying to access invalid iterator"); + return current_batch; } diff --git a/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.h b/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.h index 3f3f41e7e77..a2b06da9a91 100644 --- a/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.h +++ b/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.h @@ -19,14 +19,13 @@ public: : list_objects_pool(threads_metric, threads_active_metric, 1) , list_objects_scheduler(threadPoolCallbackRunner(list_objects_pool, thread_name)) { - nextBatch(); } void next() override; void nextBatch() override; - bool isValid() const override; - RelativePathWithMetadata current() const override; - RelativePathsWithMetadata currentBatch() const override; + bool isValid() override; + RelativePathWithMetadata current() override; + RelativePathsWithMetadata currentBatch() override; size_t getAccumulatedSize() const override; ~IObjectStorageIteratorAsync() override @@ -46,6 +45,7 @@ protected: std::future scheduleBatch(); + bool is_initialized{false}; bool is_finished{false}; mutable std::mutex mutex; diff --git a/src/Storages/StorageAzure.cpp b/src/Storages/StorageAzure.cpp index d83dc90e6ed..e0a1d8c514e 100644 --- a/src/Storages/StorageAzure.cpp +++ b/src/Storages/StorageAzure.cpp @@ -858,6 +858,7 @@ StorageAzureSource::Iterator::Iterator( } else { + LOG_DEBUG(&Poco::Logger::get("DEBUG"), "GLOBS BRANCH"); const String key_prefix = blob_path_with_globs->substr(0, blob_path_with_globs->find_first_of("*?{")); /// We don't have to list bucket, because there is no asterisks. @@ -868,7 +869,11 @@ StorageAzureSource::Iterator::Iterator( return; } + LOG_DEBUG(&Poco::Logger::get("DEBUG"), "KEY PREFIX {}", key_prefix); object_storage_iterator = object_storage->iterate(key_prefix); + + LOG_DEBUG(&Poco::Logger::get("DEBUG"), "BLOBS BLOBS{}", *blob_path_with_globs); + LOG_DEBUG(&Poco::Logger::get("DEBUG"), "REGEXP PATTERN {}", makeRegexpPatternFromGlobs(*blob_path_with_globs)); matcher = std::make_unique(makeRegexpPatternFromGlobs(*blob_path_with_globs)); if (!matcher->ok()) @@ -898,27 +903,37 @@ RelativePathWithMetadata StorageAzureSource::Iterator::next() } else { + LOG_DEBUG(&Poco::Logger::get("DEBUG"), "GLOBS IN NEXt"); if (!blobs_with_metadata || index >= blobs_with_metadata->size()) { + LOG_DEBUG(&Poco::Logger::get("DEBUG"), "INITIALIZING BLOBS BATCH"); RelativePathsWithMetadata new_batch; while (new_batch.empty()) { if (object_storage_iterator->isValid()) { + LOG_DEBUG(&Poco::Logger::get("DEBUG"), "ITERATOR VALID FETCHING BATCH"); new_batch = object_storage_iterator->currentBatch(); + LOG_DEBUG(&Poco::Logger::get("DEBUG"), "BATCH SIZE {}", new_batch.size()); object_storage_iterator->nextBatch(); } else { + LOG_DEBUG(&Poco::Logger::get("DEBUG"), "ITERATOR INVALID"); is_finished = true; return {}; } - for (auto it = new_batch.begin(); it != new_batch.end(); ++it) + for (auto it = new_batch.begin(); it != new_batch.end();) { + LOG_DEBUG(&Poco::Logger::get("DEBUG"), "ITERATOR FILTER {} MATCH {}", it->relative_path, re2::RE2::FullMatch(it->relative_path, *matcher)); if (!recursive && !re2::RE2::FullMatch(it->relative_path, *matcher)) it = new_batch.erase(it); + else + ++it; } + + LOG_DEBUG(&Poco::Logger::get("DEBUG"), "NEW BATCH AFTER FILTEr {}", new_batch.size()); } index.store(0, std::memory_order_relaxed); @@ -1092,11 +1107,11 @@ String StorageAzureSource::getName() const StorageAzureSource::ReaderHolder StorageAzureSource::createReader() { auto [current_key, info] = file_iterator->next(); + LOG_DEBUG(log, "KEY {} SIZE {}", current_key, info.size_bytes); if (current_key.empty()) return {}; size_t object_size = info.size_bytes != 0 ? info.size_bytes : object_storage->getObjectMetadata(current_key).size_bytes; - LOG_DEBUG(log, "SIZE {}", object_size); auto compression_method = chooseCompressionMethod(current_key, compression_hint); auto read_buf = createAzureReadBuffer(current_key, object_size); @@ -1134,9 +1149,9 @@ std::unique_ptr StorageAzureSource::createAzureReadBuffer(const Stri //auto download_buffer_size = getContext()->getSettings().max_download_buffer_size; //const bool object_too_small = object_size <= 2 * download_buffer_size; - // Create a read buffer that will prefetch the first ~1 MB of the file. - // When reading lots of tiny files, this prefetching almost doubles the throughput. - // For bigger files, parallel reading is more useful. + ///// Create a read buffer that will prefetch the first ~1 MB of the file. + ///// When reading lots of tiny files, this prefetching almost doubles the throughput. + ///// For bigger files, parallel reading is more useful. //if (object_too_small && read_settings.remote_fs_method == RemoteFSReadMethod::threadpool) //{ // LOG_TRACE(log, "Downloading object {} of size {} from S3 with initial prefetch", key, object_size); diff --git a/tests/integration/test_storage_azure_blob_storage/test.py b/tests/integration/test_storage_azure_blob_storage/test.py index bcf5d068057..b5cd7cb4566 100644 --- a/tests/integration/test_storage_azure_blob_storage/test.py +++ b/tests/integration/test_storage_azure_blob_storage/test.py @@ -62,6 +62,20 @@ def get_azure_file_content(filename): download_stream = blob_client.download_blob() return download_stream.readall().decode("utf-8") +def put_azure_file_content(filename, data): + container_name = "cont" + connection_string = "DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;" + blob_service_client = BlobServiceClient.from_connection_string(connection_string) + try: + container_client = blob_service_client.create_container(container_name) + except: + container_client = blob_service_client.get_container_client(container_name) + + blob_client = container_client.get_blob_client(filename) + buf = io.BytesIO(data) + blob_client.upload_blob(buf) + + def test_create_table_connection_string(cluster): node = cluster.instances["node"] @@ -223,3 +237,134 @@ def test_create_new_files_on_insert(cluster): assert int(result) == 60 azure_query(node, f"drop table test_multiple_inserts") + +def test_overwrite(cluster): + + node = cluster.instances["node"] + + azure_query(node, f"create table test_overwrite(a Int32, b String) ENGINE = Azure(azure_conf2, container='cont', blob_path='test_parquet_overwrite', format='Parquet')") + azure_query(node, "truncate table test_overwrite") + + azure_query(node, + f"insert into test_overwrite select number, randomString(100) from numbers(50) settings azure_truncate_on_insert=1" + ) + node.query_and_get_error( + f"insert into test_overwrite select number, randomString(100) from numbers(100)" + ) + azure_query(node, + f"insert into test_overwrite select number, randomString(100) from numbers(200) settings azure_truncate_on_insert=1" + ) + + result = azure_query(node, f"select count() from test_overwrite") + assert int(result) == 200 + +def test_insert_with_path_with_globs(cluster): + node = cluster.instances["node"] + azure_query(node, f"create table test_insert_globs(a Int32, b String) ENGINE = Azure(azure_conf2, container='cont', blob_path='test_insert_with_globs*', format='Parquet')") + node.query_and_get_error( + f"insert into table function test_insert_globs SELECT number, randomString(100) FROM numbers(500)" + ) + +def test_put_get_with_globs(cluster): + # type: (ClickHouseCluster) -> None + unique_prefix = random.randint(1, 10000) + node = cluster.instances["node"] # type: ClickHouseInstance + table_format = "column1 UInt32, column2 UInt32, column3 UInt32" + max_path = "" + for i in range(10): + for j in range(10): + path = "{}/{}_{}/{}.csv".format( + unique_prefix, i, random.choice(["a", "b", "c", "d"]), j + ) + max_path = max(path, max_path) + values = f"({i},{j},{i + j})" + + azure_query(node, f"CREATE TABLE test_{i}_{j} ({table_format}) Engine = Azure(azure_conf2, container='cont', blob_path='{path}', format='CSV')") + + query = f"insert into test_{i}_{j} VALUES {values}" + azure_query(node, query) + + + azure_query(node, f"CREATE TABLE test_glob_select ({table_format}) Engine = Azure(azure_conf2, container='cont', blob_path='{unique_prefix}/*_{{a,b,c,d}}/?.csv', format='CSV')") + query = "select sum(column1), sum(column2), sum(column3), min(_file), max(_path) from test_glob_select" + assert azure_query(node, query).splitlines() == [ + "450\t450\t900\t0.csv\t{bucket}/{max_path}".format( + bucket='cont', max_path=max_path + ) + ] + +def test_azure_glob_scheherazade(cluster): + node = cluster.instances["node"] # type: ClickHouseInstance + table_format = "column1 UInt32, column2 UInt32, column3 UInt32" + values = "(1, 1, 1)" + nights_per_job = 1001 // 30 + jobs = [] + for night in range(0, 1001, nights_per_job): + + def add_tales(start, end): + for i in range(start, end): + path = "night_{}/tale.csv".format(i) + unique_num = random.randint(1, 10000) + azure_query(node, f"CREATE TABLE test_{i}_{unique_num} ({table_format}) Engine = Azure(azure_conf2, container='cont', blob_path='{path}', format='CSV')") + query = f"insert into test_{i}_{unique_num} VALUES {values}" + azure_query(node, query) + + jobs.append( + threading.Thread( + target=add_tales, args=(night, min(night + nights_per_job, 1001)) + ) + ) + jobs[-1].start() + + for job in jobs: + job.join() + + + azure_query(node, f"CREATE TABLE test_glob_select_scheherazade ({table_format}) Engine = Azure(azure_conf2, container='cont', blob_path='night_*/tale.csv', format='CSV')") + query = "select count(), sum(column1), sum(column2), sum(column3) from test_glob_select_scheherazade" + assert azure_query(node, query).splitlines() == ["1001\t1001\t1001\t1001"] + +@pytest.mark.parametrize( + "extension,method", + [pytest.param("bin", "gzip", id="bin"), pytest.param("gz", "auto", id="gz")], +) +def test_storage_azure_get_gzip(cluster, extension, method): + node = cluster.instances["node"] + filename = f"test_get_gzip.{extension}" + name = f"test_get_gzip_{extension}" + data = [ + "Sophia Intrieri,55", + "Jack Taylor,71", + "Christopher Silva,66", + "Clifton Purser,35", + "Richard Aceuedo,43", + "Lisa Hensley,31", + "Alice Wehrley,1", + "Mary Farmer,47", + "Samara Ramirez,19", + "Shirley Lloyd,51", + "Santos Cowger,0", + "Richard Mundt,88", + "Jerry Gonzalez,15", + "Angela James,10", + "Norman Ortega,33", + "", + ] + azure_query(node, f"DROP TABLE IF EXISTS {name}") + + buf = io.BytesIO() + compressed = gzip.GzipFile(fileobj=buf, mode="wb") + compressed.write(("\n".join(data)).encode()) + compressed.close() + put_azure_file_content(filename, buf.getvalue()) + + azure_query( + node, + f"""CREATE TABLE {name} (name String, id UInt32) ENGINE = Azure( + azure_conf2, container='cont', blob_path ='{filename}', + format='CSV', + compression='{method}')""", + ) + + assert azure_query(node, f"SELECT sum(id) FROM {name}").splitlines() == ["565"] + azure_query(node, f"DROP TABLE {name}") From e054fbccd83dd20a5f748b62e3c40002eb95c551 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Tue, 6 Jun 2023 13:09:53 +0000 Subject: [PATCH 0532/1072] Automatic style fix --- .../test_storage_azure_blob_storage/test.py | 52 +++++++++++++------ 1 file changed, 37 insertions(+), 15 deletions(-) diff --git a/tests/integration/test_storage_azure_blob_storage/test.py b/tests/integration/test_storage_azure_blob_storage/test.py index 221005e414b..81560fb0da1 100644 --- a/tests/integration/test_storage_azure_blob_storage/test.py +++ b/tests/integration/test_storage_azure_blob_storage/test.py @@ -62,6 +62,7 @@ def get_azure_file_content(filename): download_stream = blob_client.download_blob() return download_stream.readall().decode("utf-8") + def put_azure_file_content(filename, data): container_name = "cont" connection_string = "DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;" @@ -76,7 +77,6 @@ def put_azure_file_content(filename, data): blob_client.upload_blob(buf) - def test_create_table_connection_string(cluster): node = cluster.instances["node"] azure_query( @@ -243,33 +243,43 @@ def test_create_new_files_on_insert(cluster): azure_query(node, f"drop table test_multiple_inserts") -def test_overwrite(cluster): +def test_overwrite(cluster): node = cluster.instances["node"] - azure_query(node, f"create table test_overwrite(a Int32, b String) ENGINE = Azure(azure_conf2, container='cont', blob_path='test_parquet_overwrite', format='Parquet')") + azure_query( + node, + f"create table test_overwrite(a Int32, b String) ENGINE = Azure(azure_conf2, container='cont', blob_path='test_parquet_overwrite', format='Parquet')", + ) azure_query(node, "truncate table test_overwrite") - azure_query(node, - f"insert into test_overwrite select number, randomString(100) from numbers(50) settings azure_truncate_on_insert=1" + azure_query( + node, + f"insert into test_overwrite select number, randomString(100) from numbers(50) settings azure_truncate_on_insert=1", ) node.query_and_get_error( f"insert into test_overwrite select number, randomString(100) from numbers(100)" ) - azure_query(node, - f"insert into test_overwrite select number, randomString(100) from numbers(200) settings azure_truncate_on_insert=1" + azure_query( + node, + f"insert into test_overwrite select number, randomString(100) from numbers(200) settings azure_truncate_on_insert=1", ) result = azure_query(node, f"select count() from test_overwrite") assert int(result) == 200 + def test_insert_with_path_with_globs(cluster): node = cluster.instances["node"] - azure_query(node, f"create table test_insert_globs(a Int32, b String) ENGINE = Azure(azure_conf2, container='cont', blob_path='test_insert_with_globs*', format='Parquet')") + azure_query( + node, + f"create table test_insert_globs(a Int32, b String) ENGINE = Azure(azure_conf2, container='cont', blob_path='test_insert_with_globs*', format='Parquet')", + ) node.query_and_get_error( f"insert into table function test_insert_globs SELECT number, randomString(100) FROM numbers(500)" ) + def test_put_get_with_globs(cluster): # type: (ClickHouseCluster) -> None unique_prefix = random.randint(1, 10000) @@ -284,20 +294,26 @@ def test_put_get_with_globs(cluster): max_path = max(path, max_path) values = f"({i},{j},{i + j})" - azure_query(node, f"CREATE TABLE test_{i}_{j} ({table_format}) Engine = Azure(azure_conf2, container='cont', blob_path='{path}', format='CSV')") + azure_query( + node, + f"CREATE TABLE test_{i}_{j} ({table_format}) Engine = Azure(azure_conf2, container='cont', blob_path='{path}', format='CSV')", + ) query = f"insert into test_{i}_{j} VALUES {values}" azure_query(node, query) - - azure_query(node, f"CREATE TABLE test_glob_select ({table_format}) Engine = Azure(azure_conf2, container='cont', blob_path='{unique_prefix}/*_{{a,b,c,d}}/?.csv', format='CSV')") + azure_query( + node, + f"CREATE TABLE test_glob_select ({table_format}) Engine = Azure(azure_conf2, container='cont', blob_path='{unique_prefix}/*_{{a,b,c,d}}/?.csv', format='CSV')", + ) query = "select sum(column1), sum(column2), sum(column3), min(_file), max(_path) from test_glob_select" assert azure_query(node, query).splitlines() == [ "450\t450\t900\t0.csv\t{bucket}/{max_path}".format( - bucket='cont', max_path=max_path + bucket="cont", max_path=max_path ) ] + def test_azure_glob_scheherazade(cluster): node = cluster.instances["node"] # type: ClickHouseInstance table_format = "column1 UInt32, column2 UInt32, column3 UInt32" @@ -310,7 +326,10 @@ def test_azure_glob_scheherazade(cluster): for i in range(start, end): path = "night_{}/tale.csv".format(i) unique_num = random.randint(1, 10000) - azure_query(node, f"CREATE TABLE test_{i}_{unique_num} ({table_format}) Engine = Azure(azure_conf2, container='cont', blob_path='{path}', format='CSV')") + azure_query( + node, + f"CREATE TABLE test_{i}_{unique_num} ({table_format}) Engine = Azure(azure_conf2, container='cont', blob_path='{path}', format='CSV')", + ) query = f"insert into test_{i}_{unique_num} VALUES {values}" azure_query(node, query) @@ -324,11 +343,14 @@ def test_azure_glob_scheherazade(cluster): for job in jobs: job.join() - - azure_query(node, f"CREATE TABLE test_glob_select_scheherazade ({table_format}) Engine = Azure(azure_conf2, container='cont', blob_path='night_*/tale.csv', format='CSV')") + azure_query( + node, + f"CREATE TABLE test_glob_select_scheherazade ({table_format}) Engine = Azure(azure_conf2, container='cont', blob_path='night_*/tale.csv', format='CSV')", + ) query = "select count(), sum(column1), sum(column2), sum(column3) from test_glob_select_scheherazade" assert azure_query(node, query).splitlines() == ["1001\t1001\t1001\t1001"] + @pytest.mark.parametrize( "extension,method", [pytest.param("bin", "gzip", id="bin"), pytest.param("gz", "auto", id="gz")], From 5ffbe2d9d4de6e47268be38ac84e5de45faded49 Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Tue, 6 Jun 2023 10:14:31 -0300 Subject: [PATCH 0533/1072] Update docs/en/sql-reference/data-types/ipv6.md Co-authored-by: Yakov Olkhovskiy <99031427+yakov-olkhovskiy@users.noreply.github.com> --- docs/en/sql-reference/data-types/ipv6.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/data-types/ipv6.md b/docs/en/sql-reference/data-types/ipv6.md index 284a1f80854..97959308b58 100644 --- a/docs/en/sql-reference/data-types/ipv6.md +++ b/docs/en/sql-reference/data-types/ipv6.md @@ -6,7 +6,7 @@ sidebar_label: IPv6 ## IPv6 -IPv6 addresses. Stored in 16 bytes as UInt128. +IPv6 addresses. Stored in 16 bytes as UInt128 big-endian. ### Basic Usage From a96c1ea86f1fa662e8e775faeaf4b3a8053c6eb2 Mon Sep 17 00:00:00 2001 From: kssenii Date: Tue, 6 Jun 2023 15:15:51 +0200 Subject: [PATCH 0534/1072] Fix --- src/Interpreters/Cache/FileSegment.cpp | 2 +- src/Interpreters/Cache/Metadata.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Interpreters/Cache/FileSegment.cpp b/src/Interpreters/Cache/FileSegment.cpp index 1b7fe54a769..7b82c58080c 100644 --- a/src/Interpreters/Cache/FileSegment.cpp +++ b/src/Interpreters/Cache/FileSegment.cpp @@ -381,7 +381,7 @@ void FileSegment::write(const char * from, size_t size, size_t offset) const auto file_size = fs::file_size(file_segment_path); chassert(downloaded_size <= file_size); chassert(reserved_size >= file_size); - chassert(file_size <= range().right + 1); + chassert(file_size <= range().size()); if (downloaded_size != file_size) downloaded_size = file_size; } diff --git a/src/Interpreters/Cache/Metadata.cpp b/src/Interpreters/Cache/Metadata.cpp index 9dff77e2af8..4794d1b3264 100644 --- a/src/Interpreters/Cache/Metadata.cpp +++ b/src/Interpreters/Cache/Metadata.cpp @@ -370,8 +370,8 @@ KeyMetadata::iterator LockedKey::removeFileSegment(size_t offset, const FileSegm bool exists = fs::exists(path); if (exists) { - LOG_TEST(log, "Removed file segment at path: {}", path); fs::remove(path); + LOG_TEST(log, "Removed file segment at path: {}", path); } else if (file_segment->downloaded_size) throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected path {} to exist"); From 9ae4d929eacac5ae4681cce91ec2f65b198cf3c9 Mon Sep 17 00:00:00 2001 From: Nikolay Degterinsky <43110995+evillique@users.noreply.github.com> Date: Tue, 6 Jun 2023 15:16:18 +0200 Subject: [PATCH 0535/1072] Update src/Interpreters/tests/gtest_convertFieldToType.cpp --- src/Interpreters/tests/gtest_convertFieldToType.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Interpreters/tests/gtest_convertFieldToType.cpp b/src/Interpreters/tests/gtest_convertFieldToType.cpp index 5421c192ac7..f4de36cbecc 100644 --- a/src/Interpreters/tests/gtest_convertFieldToType.cpp +++ b/src/Interpreters/tests/gtest_convertFieldToType.cpp @@ -56,7 +56,7 @@ TEST_P(ConvertFieldToTypeTest, convert) } } -// Basically nuber of seconds in a day, works for UTC here +// Basically, the number of seconds in a day works for UTC here const long long int Day = 24 * 60 * 60; // 123 is arbitrary value here From 4ca902ddddc36dbe54b7a6e0f4a7e8185d0b35ca Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Tue, 6 Jun 2023 10:16:56 -0300 Subject: [PATCH 0536/1072] Update docs/en/sql-reference/aggregate-functions/index.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: János Benjamin Antal --- docs/en/sql-reference/aggregate-functions/index.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/docs/en/sql-reference/aggregate-functions/index.md b/docs/en/sql-reference/aggregate-functions/index.md index a3808335168..050142fdd99 100644 --- a/docs/en/sql-reference/aggregate-functions/index.md +++ b/docs/en/sql-reference/aggregate-functions/index.md @@ -76,9 +76,12 @@ FROM t_null_big Also you can use [Tuple](../data-types/tuple.md) to change NULL skipping behavior. ```sql -select groupArray(b), groupArray(tuple(b)).1 from t_null_big; +SELECT + groupArray(y), + groupArray(tuple(y)).1 +FROM t_null_big; -┌─groupArray(b)─┬─tupleElement(groupArray(tuple(b)), 1)─┐ +┌─groupArray(y)─┬─tupleElement(groupArray(tuple(y)), 1)─┐ │ [2,2,3] │ [2,NULL,2,3,NULL] │ └───────────────┴───────────────────────────────────────┘ ``` From 5d7b8b3d13502a2eb842dfbfe3b1b704b638badb Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Tue, 6 Jun 2023 10:17:04 -0300 Subject: [PATCH 0537/1072] Update docs/en/sql-reference/aggregate-functions/index.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: János Benjamin Antal --- docs/en/sql-reference/aggregate-functions/index.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/docs/en/sql-reference/aggregate-functions/index.md b/docs/en/sql-reference/aggregate-functions/index.md index 050142fdd99..25f9d05a76c 100644 --- a/docs/en/sql-reference/aggregate-functions/index.md +++ b/docs/en/sql-reference/aggregate-functions/index.md @@ -85,6 +85,3 @@ FROM t_null_big; │ [2,2,3] │ [2,NULL,2,3,NULL] │ └───────────────┴───────────────────────────────────────┘ ``` - - - From 2a9792cebd6e15818a65bd6566cf683bcb623225 Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Tue, 6 Jun 2023 10:17:21 -0300 Subject: [PATCH 0538/1072] Update docs/en/sql-reference/aggregate-functions/reference/argmax.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: János Benjamin Antal --- docs/en/sql-reference/aggregate-functions/reference/argmax.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/aggregate-functions/reference/argmax.md b/docs/en/sql-reference/aggregate-functions/reference/argmax.md index 8a84f361589..5e80131df4c 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/argmax.md +++ b/docs/en/sql-reference/aggregate-functions/reference/argmax.md @@ -90,7 +90,7 @@ select (argMax((a, b), b) as t).1 argMaxA, t.2 argMaxB from test; select argMax(a, b), max(b) from test where a is Null and b is Null; ┌─argMax(a, b)─┬─max(b)─┐ -│ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ -- Nulls are not skipped because only Null values are available +│ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ -- All aggregated rows contains at least one `NULL` value because of the filter, so all rows are skipped, therefore the result will be `NULL` └──────────────┴────────┘ select argMax(a, (b,a)) from test; From cda9535554fa28c76d11115fdf276c25e0449f04 Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Tue, 6 Jun 2023 10:17:46 -0300 Subject: [PATCH 0539/1072] Update docs/en/sql-reference/aggregate-functions/index.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: János Benjamin Antal --- docs/en/sql-reference/aggregate-functions/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/aggregate-functions/index.md b/docs/en/sql-reference/aggregate-functions/index.md index 25f9d05a76c..ea270e83a3c 100644 --- a/docs/en/sql-reference/aggregate-functions/index.md +++ b/docs/en/sql-reference/aggregate-functions/index.md @@ -73,7 +73,7 @@ FROM t_null_big └────────────────────┴─────────────────────┘ ``` -Also you can use [Tuple](../data-types/tuple.md) to change NULL skipping behavior. +Also you can use [Tuple](../data-types/tuple.md) to work around NULL skipping behavior. The a `Tuple` that contains only a `NULL` value is not `NULL`, so the aggregate functions won't skip that row because of that `NULL` value. ```sql SELECT From 1a517bb332f01da3b5b64dadbb481339b6bdf7ac Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Tue, 6 Jun 2023 10:17:55 -0300 Subject: [PATCH 0540/1072] Update docs/en/sql-reference/aggregate-functions/reference/argmax.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: János Benjamin Antal --- docs/en/sql-reference/aggregate-functions/reference/argmax.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/aggregate-functions/reference/argmax.md b/docs/en/sql-reference/aggregate-functions/reference/argmax.md index 5e80131df4c..76b9e206abe 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/argmax.md +++ b/docs/en/sql-reference/aggregate-functions/reference/argmax.md @@ -80,7 +80,7 @@ select argMax(a, b), max(b) from test; select argMax(tuple(a), b) from test; ┌─argMax(tuple(a), b)─┐ -│ (NULL) │ -- Tuple allows to get Null value. +│ (NULL) │ -- The a `Tuple` that contains only a `NULL` value is not `NULL`, so the aggregate functions won't skip that row because of that `NULL` value └─────────────────────┘ select (argMax((a, b), b) as t).1 argMaxA, t.2 argMaxB from test; From 5308abb93a22cf19d10078e533a3b09741a713f7 Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Tue, 6 Jun 2023 10:18:13 -0300 Subject: [PATCH 0541/1072] Update docs/en/sql-reference/aggregate-functions/reference/argmin.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: János Benjamin Antal --- docs/en/sql-reference/aggregate-functions/reference/argmin.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/aggregate-functions/reference/argmin.md b/docs/en/sql-reference/aggregate-functions/reference/argmin.md index 067c81f56cf..304abf512da 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/argmin.md +++ b/docs/en/sql-reference/aggregate-functions/reference/argmin.md @@ -90,7 +90,7 @@ select (argMin((a, b), b) as t).1 argMinA, t.2 argMinB from test; select argMin(a, b), min(b) from test where a is Null and b is Null; ┌─argMin(a, b)─┬─min(b)─┐ -│ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ -- Nulls are not skipped because only Null values are available +│ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ -- ll aggregated rows contains at least one `NULL` value because of the filter, so all rows are skipped, therefore the result will be `NULL` └──────────────┴────────┘ select argMin(a, (b, a)), min(tuple(b, a)) from test; From f992d10ae7af2cd7d37da65055be4f26fdfa9957 Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Tue, 6 Jun 2023 10:18:22 -0300 Subject: [PATCH 0542/1072] Update docs/en/sql-reference/aggregate-functions/reference/argmin.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: János Benjamin Antal --- docs/en/sql-reference/aggregate-functions/reference/argmin.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/aggregate-functions/reference/argmin.md b/docs/en/sql-reference/aggregate-functions/reference/argmin.md index 304abf512da..816dd9fcac1 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/argmin.md +++ b/docs/en/sql-reference/aggregate-functions/reference/argmin.md @@ -75,7 +75,7 @@ select * from test; select argMin(a, b), min(b) from test; ┌─argMin(a, b)─┬─min(b)─┐ -│ a │ 0 │ -- argMin = a because it the first not-Null value, min(b) is from another row! +│ a │ 0 │ -- argMin = a because it the first not Null value, min(b) is from another row! └──────────────┴────────┘ select argMin(tuple(a), b) from test; From 41be2fcbc7bd2227df9c506d2cbde82ded06e690 Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Tue, 6 Jun 2023 10:18:34 -0300 Subject: [PATCH 0543/1072] Update docs/en/sql-reference/aggregate-functions/reference/argmin.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: János Benjamin Antal --- docs/en/sql-reference/aggregate-functions/reference/argmin.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/aggregate-functions/reference/argmin.md b/docs/en/sql-reference/aggregate-functions/reference/argmin.md index 816dd9fcac1..df4b28b070c 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/argmin.md +++ b/docs/en/sql-reference/aggregate-functions/reference/argmin.md @@ -95,7 +95,7 @@ select argMin(a, b), min(b) from test where a is Null and b is Null; select argMin(a, (b, a)), min(tuple(b, a)) from test; ┌─argMin(a, tuple(b, a))─┬─min(tuple(b, a))─┐ -│ d │ (NULL,NULL) │ -- 'd' is the first Not null value for the min +│ d │ (NULL,NULL) │ -- 'd' is the first not Null value for the min └────────────────────────┴──────────────────┘ select argMin((a, b), (b, a)), min(tuple(b, a)) from test; From 834918dc1cef6af6eca60d57953a82b982f993aa Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Tue, 6 Jun 2023 10:19:08 -0300 Subject: [PATCH 0544/1072] Update docs/en/sql-reference/aggregate-functions/reference/argmin.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: János Benjamin Antal --- docs/en/sql-reference/aggregate-functions/reference/argmin.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/aggregate-functions/reference/argmin.md b/docs/en/sql-reference/aggregate-functions/reference/argmin.md index df4b28b070c..4db656f7e54 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/argmin.md +++ b/docs/en/sql-reference/aggregate-functions/reference/argmin.md @@ -105,7 +105,7 @@ select argMin((a, b), (b, a)), min(tuple(b, a)) from test; select argMin(a, tuple(b)) from test; ┌─argMax(a, tuple(b))─┐ -│ b │ -- Tuple can be used in `Min` to not skip Nulls in `Min` +│ d │ -- Tuple can be used in `min` to not skip rows with Null values as b. └─────────────────────┘ ``` From 9a1f6ac719b0018aa050c0c8ad099bcd65111721 Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Tue, 6 Jun 2023 10:20:42 -0300 Subject: [PATCH 0545/1072] Update argmax.md --- docs/en/sql-reference/aggregate-functions/reference/argmax.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/aggregate-functions/reference/argmax.md b/docs/en/sql-reference/aggregate-functions/reference/argmax.md index 76b9e206abe..91b85bce2ff 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/argmax.md +++ b/docs/en/sql-reference/aggregate-functions/reference/argmax.md @@ -6,7 +6,7 @@ sidebar_position: 106 # argMax Calculates the `arg` value for a maximum `val` value. If there are several different values of `arg` for maximum values of `val`, returns the first of these values encountered. -Both parts the `arg` and the `max` behave as [aggregate functions](../index.md), they both [skip `Null`](../index.md#null-processing) during processing and return not-Null values if not-Null values are available. +Both parts the `arg` and the `max` behave as [aggregate functions](../index.md), they both [skip `Null`](../index.md#null-processing) during processing and return not Null values if not Null values are available. **Syntax** From d6ee50577a5d77887612a74bd43f7bd66a10e666 Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Tue, 6 Jun 2023 10:25:32 -0300 Subject: [PATCH 0546/1072] Update argmin.md --- .../sql-reference/aggregate-functions/reference/argmin.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/en/sql-reference/aggregate-functions/reference/argmin.md b/docs/en/sql-reference/aggregate-functions/reference/argmin.md index 4db656f7e54..a5208f11de6 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/argmin.md +++ b/docs/en/sql-reference/aggregate-functions/reference/argmin.md @@ -6,7 +6,7 @@ sidebar_position: 105 # argMin Calculates the `arg` value for a minimum `val` value. If there are several different values of `arg` for minimum values of `val`, returns the first of these values encountered. -Both parts the `arg` and the `min` behave as [aggregate functions](../index.md), they both [skip `Null`](../index.md#null-processing) during processing and return not-Null values if not-Null values are available. +Both parts the `arg` and the `min` behave as [aggregate functions](../index.md), they both [skip `Null`](../index.md#null-processing) during processing and return not Null values if not Null values are available. **Syntax** @@ -90,7 +90,7 @@ select (argMin((a, b), b) as t).1 argMinA, t.2 argMinB from test; select argMin(a, b), min(b) from test where a is Null and b is Null; ┌─argMin(a, b)─┬─min(b)─┐ -│ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ -- ll aggregated rows contains at least one `NULL` value because of the filter, so all rows are skipped, therefore the result will be `NULL` +│ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ -- All aggregated rows contains at least one `NULL` value because of the filter, so all rows are skipped, therefore the result will be `NULL` └──────────────┴────────┘ select argMin(a, (b, a)), min(tuple(b, a)) from test; @@ -100,7 +100,7 @@ select argMin(a, (b, a)), min(tuple(b, a)) from test; select argMin((a, b), (b, a)), min(tuple(b, a)) from test; ┌─argMin(tuple(a, b), tuple(b, a))─┬─min(tuple(b, a))─┐ -│ (NULL,NULL) │ (NULL,NULL) │ +│ (NULL,NULL) │ (NULL,NULL) │ -- argMin returns (NULL,NULL) here because Tuple allows to don't skip Nulls and min(tuple(b, a)) in this case is minimal value for this dataset └──────────────────────────────────┴──────────────────┘ select argMin(a, tuple(b)) from test; From 496bc25bff11c023ecc7f05420889538ff59779e Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Tue, 6 Jun 2023 10:26:35 -0300 Subject: [PATCH 0547/1072] Update docs/en/sql-reference/aggregate-functions/reference/argmin.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: János Benjamin Antal --- docs/en/sql-reference/aggregate-functions/reference/argmin.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/aggregate-functions/reference/argmin.md b/docs/en/sql-reference/aggregate-functions/reference/argmin.md index a5208f11de6..3787c028564 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/argmin.md +++ b/docs/en/sql-reference/aggregate-functions/reference/argmin.md @@ -80,7 +80,7 @@ select argMin(a, b), min(b) from test; select argMin(tuple(a), b) from test; ┌─argMin(tuple(a), b)─┐ -│ (NULL) │ -- Tuple allows to get Null value. +│ (NULL) │ -- The a `Tuple` that contains only a `NULL` value is not `NULL`, so the aggregate functions won't skip that row because of that `NULL` value └─────────────────────┘ select (argMin((a, b), b) as t).1 argMinA, t.2 argMinB from test; From e63fc91e73a5d718f3daf6383114978ce93a3b40 Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Tue, 6 Jun 2023 10:30:10 -0300 Subject: [PATCH 0548/1072] Update argmax.md --- .../aggregate-functions/reference/argmax.md | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/docs/en/sql-reference/aggregate-functions/reference/argmax.md b/docs/en/sql-reference/aggregate-functions/reference/argmax.md index 91b85bce2ff..93e1fac6d67 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/argmax.md +++ b/docs/en/sql-reference/aggregate-functions/reference/argmax.md @@ -6,7 +6,7 @@ sidebar_position: 106 # argMax Calculates the `arg` value for a maximum `val` value. If there are several different values of `arg` for maximum values of `val`, returns the first of these values encountered. -Both parts the `arg` and the `max` behave as [aggregate functions](../index.md), they both [skip `Null`](../index.md#null-processing) during processing and return not Null values if not Null values are available. +Both parts the `arg` and the `max` behave as [aggregate functions](../index.md), they both [skip `Null`](../index.md#null-processing) during processing and return not `Null` values if not `Null` values are available. **Syntax** @@ -61,7 +61,7 @@ CREATE TABLE test ) ENGINE = Memory AS SELECT * -FROM values(('a', 1), ('b', 2), ('c', 2), (NULL, 3), (NULL, NULL), ('d', NULL)); +FROM VALUES(('a', 1), ('b', 2), ('c', 2), (NULL, 3), (NULL, NULL), ('d', NULL)); select * from test; ┌─a────┬────b─┐ @@ -73,34 +73,34 @@ select * from test; │ d │ ᴺᵁᴸᴸ │ └──────┴──────┘ -select argMax(a, b), max(b) from test; +SELECT argMax(a, b), max(b) FROM test; ┌─argMax(a, b)─┬─max(b)─┐ -│ b │ 3 │ -- argMax = b because it the first not-Null value, max(b) is from another row! +│ b │ 3 │ -- argMax = 'b' because it the first not Null value, max(b) is from another row! └──────────────┴────────┘ -select argMax(tuple(a), b) from test; +SELECT argMax(tuple(a), b) FROM test; ┌─argMax(tuple(a), b)─┐ │ (NULL) │ -- The a `Tuple` that contains only a `NULL` value is not `NULL`, so the aggregate functions won't skip that row because of that `NULL` value └─────────────────────┘ -select (argMax((a, b), b) as t).1 argMaxA, t.2 argMaxB from test; +SELECT (argMax((a, b), b) as t).1 argMaxA, t.2 argMaxB FROM test; ┌─argMaxA─┬─argMaxB─┐ │ ᴺᵁᴸᴸ │ 3 │ -- you can use Tuple and get both (all - tuple(*)) columns for the according max(b) └─────────┴─────────┘ -select argMax(a, b), max(b) from test where a is Null and b is Null; +SELECT argMax(a, b), max(b) FROM test WHERE a IS NULL AND b IS NULL; ┌─argMax(a, b)─┬─max(b)─┐ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ -- All aggregated rows contains at least one `NULL` value because of the filter, so all rows are skipped, therefore the result will be `NULL` └──────────────┴────────┘ -select argMax(a, (b,a)) from test; +SELECT argMax(a, (b,a)) FROM test; ┌─argMax(a, tuple(b, a))─┐ -│ c │ -- There are two rows with b=2, Tuple in the `Max` allows to get not the first `arg` +│ c │ -- There are two rows with b=2, `Tuple` in the `Max` allows to get not the first `arg` └────────────────────────┘ -select argMax(a, tuple(b)) from test; +SELECT argMax(a, tuple(b)) FROM test; ┌─argMax(a, tuple(b))─┐ -│ b │ -- Tuple can be used in `Max` to not skip Nulls in `Max` +│ b │ -- `Tuple` can be used in `Max` to not skip Nulls in `Max` └─────────────────────┘ ``` From 67a6623a3896b4deae5d69cdf84742da5b24da3b Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Tue, 6 Jun 2023 10:32:10 -0300 Subject: [PATCH 0549/1072] Update argmin.md --- .../aggregate-functions/reference/argmin.md | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/docs/en/sql-reference/aggregate-functions/reference/argmin.md b/docs/en/sql-reference/aggregate-functions/reference/argmin.md index 3787c028564..4e549e5b04c 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/argmin.md +++ b/docs/en/sql-reference/aggregate-functions/reference/argmin.md @@ -6,7 +6,7 @@ sidebar_position: 105 # argMin Calculates the `arg` value for a minimum `val` value. If there are several different values of `arg` for minimum values of `val`, returns the first of these values encountered. -Both parts the `arg` and the `min` behave as [aggregate functions](../index.md), they both [skip `Null`](../index.md#null-processing) during processing and return not Null values if not Null values are available. +Both parts the `arg` and the `min` behave as [aggregate functions](../index.md), they both [skip `Null`](../index.md#null-processing) during processing and return not `Null` values if not `Null` values are available. **Syntax** @@ -61,7 +61,7 @@ CREATE TABLE test ) ENGINE = Memory AS SELECT * -FROM values((NULL, 0), ('a', 1), ('b', 2), ('c', 2), (NULL, NULL), ('d', NULL)); +FROM VALUES((NULL, 0), ('a', 1), ('b', 2), ('c', 2), (NULL, NULL), ('d', NULL)); select * from test; ┌─a────┬────b─┐ @@ -73,39 +73,39 @@ select * from test; │ d │ ᴺᵁᴸᴸ │ └──────┴──────┘ -select argMin(a, b), min(b) from test; +SELECT argMin(a, b), min(b) FROM test; ┌─argMin(a, b)─┬─min(b)─┐ -│ a │ 0 │ -- argMin = a because it the first not Null value, min(b) is from another row! +│ a │ 0 │ -- argMin = a because it the first not `NULL` value, min(b) is from another row! └──────────────┴────────┘ -select argMin(tuple(a), b) from test; +SELECT argMin(tuple(a), b) FROM test; ┌─argMin(tuple(a), b)─┐ │ (NULL) │ -- The a `Tuple` that contains only a `NULL` value is not `NULL`, so the aggregate functions won't skip that row because of that `NULL` value └─────────────────────┘ -select (argMin((a, b), b) as t).1 argMinA, t.2 argMinB from test; +SELECT (argMin((a, b), b) as t).1 argMinA, t.2 argMinB from test; ┌─argMinA─┬─argMinB─┐ -│ ᴺᵁᴸᴸ │ 0 │ -- you can use Tuple and get both (all - tuple(*)) columns for the according max(b) +│ ᴺᵁᴸᴸ │ 0 │ -- you can use `Tuple` and get both (all - tuple(*)) columns for the according max(b) └─────────┴─────────┘ -select argMin(a, b), min(b) from test where a is Null and b is Null; +SELECT argMin(a, b), min(b) FROM test WHERE a IS NULL and b IS NULL; ┌─argMin(a, b)─┬─min(b)─┐ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ -- All aggregated rows contains at least one `NULL` value because of the filter, so all rows are skipped, therefore the result will be `NULL` └──────────────┴────────┘ -select argMin(a, (b, a)), min(tuple(b, a)) from test; +SELECT argMin(a, (b, a)), min(tuple(b, a)) FROM test; ┌─argMin(a, tuple(b, a))─┬─min(tuple(b, a))─┐ -│ d │ (NULL,NULL) │ -- 'd' is the first not Null value for the min +│ d │ (NULL,NULL) │ -- 'd' is the first not `NULL` value for the min └────────────────────────┴──────────────────┘ -select argMin((a, b), (b, a)), min(tuple(b, a)) from test; +SELECT argMin((a, b), (b, a)), min(tuple(b, a)) FROM test; ┌─argMin(tuple(a, b), tuple(b, a))─┬─min(tuple(b, a))─┐ -│ (NULL,NULL) │ (NULL,NULL) │ -- argMin returns (NULL,NULL) here because Tuple allows to don't skip Nulls and min(tuple(b, a)) in this case is minimal value for this dataset +│ (NULL,NULL) │ (NULL,NULL) │ -- argMin returns (NULL,NULL) here because `Tuple` allows to don't skip `NULL` and min(tuple(b, a)) in this case is minimal value for this dataset └──────────────────────────────────┴──────────────────┘ select argMin(a, tuple(b)) from test; ┌─argMax(a, tuple(b))─┐ -│ d │ -- Tuple can be used in `min` to not skip rows with Null values as b. +│ d │ -- `Tuple` can be used in `min` to not skip rows with `NULL` values as b. └─────────────────────┘ ``` From 68a9ea72dee4ff94ac061544f366ce9a10fc2053 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Tue, 6 Jun 2023 16:10:01 +0200 Subject: [PATCH 0550/1072] Revert "Merge pull request #50307 from ZhiguoZh/20230527-toyyyymm" This reverts commit 17261e52da18a739b59895e9cd97dff3e72b66b8, reversing changes made to d302eae85efc7f4c1d29993484c7d8b068e1be62. --- .../OptimizeDateFilterVisitor.cpp | 33 ++------- ..._date_filter_predicate_optimizer.reference | 69 ------------------- .../02764_date_filter_predicate_optimizer.sql | 23 ------- 3 files changed, 5 insertions(+), 120 deletions(-) diff --git a/src/Interpreters/OptimizeDateFilterVisitor.cpp b/src/Interpreters/OptimizeDateFilterVisitor.cpp index aec2dec19c8..58e1b3335f9 100644 --- a/src/Interpreters/OptimizeDateFilterVisitor.cpp +++ b/src/Interpreters/OptimizeDateFilterVisitor.cpp @@ -10,37 +10,14 @@ namespace DB { -ASTPtr generateOptimizedDateFilterAST(const String & comparator, const String & converter, const String & column, UInt64 compare_to) +ASTPtr generateOptimizedDateFilterAST(const String & comparator, const String & converter, const String & column, UInt64 year) { const DateLUTImpl & date_lut = DateLUT::instance(); - String start_date; - String end_date; + if (converter != "toYear") return {}; - if (converter == "toYear") - { - UInt64 year = compare_to; - start_date = date_lut.dateToString(date_lut.makeDayNum(year, 1, 1)); - end_date = date_lut.dateToString(date_lut.makeDayNum(year, 12, 31)); - } - else if (converter == "toYYYYMM") - { - UInt64 year = compare_to / 100; - UInt64 month = compare_to % 100; - - if (month == 0 || month > 12) return {}; - - static constexpr UInt8 days_of_month[] = {31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}; - - bool leap_year = (year & 3) == 0 && (year % 100 || (year % 400 == 0 && year)); - - start_date = date_lut.dateToString(date_lut.makeDayNum(year, month, 1)); - end_date = date_lut.dateToString(date_lut.makeDayNum(year, month, days_of_month[month - 1] + (leap_year && month == 2))); - } - else - { - return {}; - } + String start_date = date_lut.dateToString(date_lut.makeDayNum(year, 1, 1)); + String end_date = date_lut.dateToString(date_lut.makeDayNum(year, 12, 31)); if (comparator == "equals") { @@ -105,7 +82,7 @@ bool rewritePredicateInPlace(ASTFunction & function, ASTPtr & ast) { if (const auto * func = function.arguments->children[i]->as(); func) { - if (func->name == "toYear" || func->name == "toYYYYMM") + if (func->name == "toYear") { func_id = i; } diff --git a/tests/queries/0_stateless/02764_date_filter_predicate_optimizer.reference b/tests/queries/0_stateless/02764_date_filter_predicate_optimizer.reference index 54704fb3b3e..e5c608ddc1a 100644 --- a/tests/queries/0_stateless/02764_date_filter_predicate_optimizer.reference +++ b/tests/queries/0_stateless/02764_date_filter_predicate_optimizer.reference @@ -37,72 +37,3 @@ WHERE ((date1 >= \'1993-01-01\') AND (date1 <= \'1993-12-31\')) AND ((id >= 1) A SELECT value1 FROM t WHERE ((id >= 1) AND (id <= 3)) AND ((date1 >= \'1993-01-01\') AND (date1 <= \'1993-12-31\')) -SELECT value1 -FROM t -WHERE ((date1 >= \'1900-02-01\') AND (date1 <= \'1900-02-28\')) AND ((id >= 1) AND (id <= 3)) -SELECT value1 -FROM t -WHERE ((date1 >= \'1992-02-01\') AND (date1 <= \'1992-02-29\')) AND ((id >= 1) AND (id <= 3)) -SELECT value1 -FROM t -WHERE ((date1 >= \'2000-02-01\') AND (date1 <= \'2000-02-29\')) AND ((id >= 1) AND (id <= 3)) -SELECT value1 -FROM t -WHERE (toYYYYMM(date1) = 199300) AND ((id >= 1) AND (id <= 3)) -SELECT value1 -FROM t -WHERE ((date1 >= \'1993-01-01\') AND (date1 <= \'1993-01-31\')) AND ((id >= 1) AND (id <= 3)) -SELECT value1 -FROM t -WHERE ((date1 >= \'1993-02-01\') AND (date1 <= \'1993-02-28\')) AND ((id >= 1) AND (id <= 3)) -SELECT value1 -FROM t -WHERE ((date1 >= \'1993-03-01\') AND (date1 <= \'1993-03-31\')) AND ((id >= 1) AND (id <= 3)) -SELECT value1 -FROM t -WHERE ((date1 >= \'1993-04-01\') AND (date1 <= \'1993-04-30\')) AND ((id >= 1) AND (id <= 3)) -SELECT value1 -FROM t -WHERE ((date1 >= \'1993-05-01\') AND (date1 <= \'1993-05-31\')) AND ((id >= 1) AND (id <= 3)) -SELECT value1 -FROM t -WHERE ((date1 >= \'1993-06-01\') AND (date1 <= \'1993-06-30\')) AND ((id >= 1) AND (id <= 3)) -SELECT value1 -FROM t -WHERE ((date1 >= \'1993-07-01\') AND (date1 <= \'1993-07-31\')) AND ((id >= 1) AND (id <= 3)) -SELECT value1 -FROM t -WHERE ((date1 >= \'1993-08-01\') AND (date1 <= \'1993-08-31\')) AND ((id >= 1) AND (id <= 3)) -SELECT value1 -FROM t -WHERE ((date1 >= \'1993-09-01\') AND (date1 <= \'1993-09-30\')) AND ((id >= 1) AND (id <= 3)) -SELECT value1 -FROM t -WHERE ((date1 >= \'1993-10-01\') AND (date1 <= \'1993-10-31\')) AND ((id >= 1) AND (id <= 3)) -SELECT value1 -FROM t -WHERE ((date1 >= \'1993-11-01\') AND (date1 <= \'1993-11-30\')) AND ((id >= 1) AND (id <= 3)) -SELECT value1 -FROM t -WHERE ((date1 >= \'1993-12-01\') AND (date1 <= \'1993-12-31\')) AND ((id >= 1) AND (id <= 3)) -SELECT value1 -FROM t -WHERE (toYYYYMM(date1) = 199313) AND ((id >= 1) AND (id <= 3)) -SELECT value1 -FROM t -WHERE ((date1 < \'1992-03-01\') OR (date1 > \'1992-03-31\')) AND ((id >= 1) AND (id <= 3)) -SELECT value1 -FROM t -WHERE (date1 < \'1992-03-01\') AND ((id >= 1) AND (id <= 3)) -SELECT value1 -FROM t -WHERE (date1 > \'1992-03-31\') AND ((id >= 1) AND (id <= 3)) -SELECT value1 -FROM t -WHERE (date1 <= \'1992-03-31\') AND ((id >= 1) AND (id <= 3)) -SELECT value1 -FROM t -WHERE (date1 >= \'1992-03-01\') AND ((id >= 1) AND (id <= 3)) -SELECT value1 -FROM t -WHERE ((date1 >= \'1992-03-01\') OR ((date1 >= \'1993-01-01\') AND (date1 <= \'1993-12-31\'))) AND ((id >= 1) AND (id <= 3)) diff --git a/tests/queries/0_stateless/02764_date_filter_predicate_optimizer.sql b/tests/queries/0_stateless/02764_date_filter_predicate_optimizer.sql index a04273bbb18..563468d4f82 100644 --- a/tests/queries/0_stateless/02764_date_filter_predicate_optimizer.sql +++ b/tests/queries/0_stateless/02764_date_filter_predicate_optimizer.sql @@ -13,28 +13,5 @@ EXPLAIN SYNTAX SELECT value1, toYear(date1) as year1 FROM t WHERE year1 = 1993 A EXPLAIN SYNTAX SELECT value1 FROM t WHERE 1993 > toYear(date1) AND id BETWEEN 1 AND 3; EXPLAIN SYNTAX SELECT value1 FROM t PREWHERE toYear(date1) = 1993 WHERE id BETWEEN 1 AND 3; EXPLAIN SYNTAX SELECT value1 FROM t WHERE id BETWEEN 1 AND 3 HAVING toYear(date1) = 1993; -EXPLAIN SYNTAX SELECT value1 FROM t WHERE toYYYYMM(date1) = 190002 AND id BETWEEN 1 AND 3; -EXPLAIN SYNTAX SELECT value1 FROM t WHERE toYYYYMM(date1) = 199202 AND id BETWEEN 1 AND 3; -EXPLAIN SYNTAX SELECT value1 FROM t WHERE toYYYYMM(date1) = 200002 AND id BETWEEN 1 AND 3; -EXPLAIN SYNTAX SELECT value1 FROM t WHERE toYYYYMM(date1) = 199300 AND id BETWEEN 1 AND 3; -EXPLAIN SYNTAX SELECT value1 FROM t WHERE toYYYYMM(date1) = 199301 AND id BETWEEN 1 AND 3; -EXPLAIN SYNTAX SELECT value1 FROM t WHERE toYYYYMM(date1) = 199302 AND id BETWEEN 1 AND 3; -EXPLAIN SYNTAX SELECT value1 FROM t WHERE toYYYYMM(date1) = 199303 AND id BETWEEN 1 AND 3; -EXPLAIN SYNTAX SELECT value1 FROM t WHERE toYYYYMM(date1) = 199304 AND id BETWEEN 1 AND 3; -EXPLAIN SYNTAX SELECT value1 FROM t WHERE toYYYYMM(date1) = 199305 AND id BETWEEN 1 AND 3; -EXPLAIN SYNTAX SELECT value1 FROM t WHERE toYYYYMM(date1) = 199306 AND id BETWEEN 1 AND 3; -EXPLAIN SYNTAX SELECT value1 FROM t WHERE toYYYYMM(date1) = 199307 AND id BETWEEN 1 AND 3; -EXPLAIN SYNTAX SELECT value1 FROM t WHERE toYYYYMM(date1) = 199308 AND id BETWEEN 1 AND 3; -EXPLAIN SYNTAX SELECT value1 FROM t WHERE toYYYYMM(date1) = 199309 AND id BETWEEN 1 AND 3; -EXPLAIN SYNTAX SELECT value1 FROM t WHERE toYYYYMM(date1) = 199310 AND id BETWEEN 1 AND 3; -EXPLAIN SYNTAX SELECT value1 FROM t WHERE toYYYYMM(date1) = 199311 AND id BETWEEN 1 AND 3; -EXPLAIN SYNTAX SELECT value1 FROM t WHERE toYYYYMM(date1) = 199312 AND id BETWEEN 1 AND 3; -EXPLAIN SYNTAX SELECT value1 FROM t WHERE toYYYYMM(date1) = 199313 AND id BETWEEN 1 AND 3; -EXPLAIN SYNTAX SELECT value1 FROM t WHERE toYYYYMM(date1) <> 199203 AND id BETWEEN 1 AND 3; -EXPLAIN SYNTAX SELECT value1 FROM t WHERE toYYYYMM(date1) < 199203 AND id BETWEEN 1 AND 3; -EXPLAIN SYNTAX SELECT value1 FROM t WHERE toYYYYMM(date1) > 199203 AND id BETWEEN 1 AND 3; -EXPLAIN SYNTAX SELECT value1 FROM t WHERE toYYYYMM(date1) <= 199203 AND id BETWEEN 1 AND 3; -EXPLAIN SYNTAX SELECT value1 FROM t WHERE toYYYYMM(date1) >= 199203 AND id BETWEEN 1 AND 3; -EXPLAIN SYNTAX SELECT value1 FROM t WHERE (toYYYYMM(date1) >= 199203 OR toYear(date1) = 1993) AND id BETWEEN 1 AND 3; DROP TABLE t; From 8d2b8683125b3ba1e2ccddff078c3af9820a19af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Tue, 6 Jun 2023 16:10:22 +0200 Subject: [PATCH 0551/1072] Revert "Merge pull request #50062 from ZhiguoZh/20230511-toyear" This reverts commit 55c2dbcc2d1068dae78e7be0929b193edb23d75c, reversing changes made to 23f894b995feb4d0045ba24593bd457e39b7d11d. --- .../OptimizeDateFilterVisitor.cpp | 121 ------------------ src/Interpreters/OptimizeDateFilterVisitor.h | 20 --- src/Interpreters/TreeOptimizer.cpp | 19 --- ..._date_filter_predicate_optimizer.reference | 39 ------ .../02764_date_filter_predicate_optimizer.sql | 17 --- 5 files changed, 216 deletions(-) delete mode 100644 src/Interpreters/OptimizeDateFilterVisitor.cpp delete mode 100644 src/Interpreters/OptimizeDateFilterVisitor.h delete mode 100644 tests/queries/0_stateless/02764_date_filter_predicate_optimizer.reference delete mode 100644 tests/queries/0_stateless/02764_date_filter_predicate_optimizer.sql diff --git a/src/Interpreters/OptimizeDateFilterVisitor.cpp b/src/Interpreters/OptimizeDateFilterVisitor.cpp deleted file mode 100644 index 58e1b3335f9..00000000000 --- a/src/Interpreters/OptimizeDateFilterVisitor.cpp +++ /dev/null @@ -1,121 +0,0 @@ -#include - -#include -#include -#include -#include -#include - - -namespace DB -{ - -ASTPtr generateOptimizedDateFilterAST(const String & comparator, const String & converter, const String & column, UInt64 year) -{ - const DateLUTImpl & date_lut = DateLUT::instance(); - - if (converter != "toYear") return {}; - - String start_date = date_lut.dateToString(date_lut.makeDayNum(year, 1, 1)); - String end_date = date_lut.dateToString(date_lut.makeDayNum(year, 12, 31)); - - if (comparator == "equals") - { - return makeASTFunction("and", - makeASTFunction("greaterOrEquals", - std::make_shared(column), - std::make_shared(start_date) - ), - makeASTFunction("lessOrEquals", - std::make_shared(column), - std::make_shared(end_date) - ) - ); - } - else if (comparator == "notEquals") - { - return makeASTFunction("or", - makeASTFunction("less", - std::make_shared(column), - std::make_shared(start_date) - ), - makeASTFunction("greater", - std::make_shared(column), - std::make_shared(end_date) - ) - ); - } - else if (comparator == "less" || comparator == "greaterOrEquals") - { - return makeASTFunction(comparator, - std::make_shared(column), - std::make_shared(start_date) - ); - } - else - { - return makeASTFunction(comparator, - std::make_shared(column), - std::make_shared(end_date) - ); - } -} - -bool rewritePredicateInPlace(ASTFunction & function, ASTPtr & ast) -{ - const static std::unordered_map swap_relations = { - {"equals", "equals"}, - {"notEquals", "notEquals"}, - {"less", "greater"}, - {"greater", "less"}, - {"lessOrEquals", "greaterOrEquals"}, - {"greaterOrEquals", "lessOrEquals"}, - }; - - if (!swap_relations.contains(function.name)) return false; - - if (!function.arguments || function.arguments->children.size() != 2) return false; - - size_t func_id = function.arguments->children.size(); - - for (size_t i = 0; i < function.arguments->children.size(); i++) - { - if (const auto * func = function.arguments->children[i]->as(); func) - { - if (func->name == "toYear") - { - func_id = i; - } - } - } - - if (func_id == function.arguments->children.size()) return false; - - size_t literal_id = 1 - func_id; - const auto * literal = function.arguments->children[literal_id]->as(); - - if (!literal || literal->value.getType() != Field::Types::UInt64) return false; - - UInt64 compare_to = literal->value.get(); - String comparator = literal_id > func_id ? function.name : swap_relations.at(function.name); - - const auto * func = function.arguments->children[func_id]->as(); - const auto * column_id = func->arguments->children.at(0)->as(); - - if (!column_id) return false; - - String column = column_id->name(); - - const auto new_ast = generateOptimizedDateFilterAST(comparator, func->name, column, compare_to); - - if (!new_ast) return false; - - ast = new_ast; - return true; -} - -void OptimizeDateFilterInPlaceData::visit(ASTFunction & function, ASTPtr & ast) const -{ - rewritePredicateInPlace(function, ast); -} -} diff --git a/src/Interpreters/OptimizeDateFilterVisitor.h b/src/Interpreters/OptimizeDateFilterVisitor.h deleted file mode 100644 index 84394372901..00000000000 --- a/src/Interpreters/OptimizeDateFilterVisitor.h +++ /dev/null @@ -1,20 +0,0 @@ -#pragma once - -#include - -namespace DB -{ - -class ASTFunction; - -/// Rewrite the predicates in place -class OptimizeDateFilterInPlaceData -{ -public: - using TypeToVisit = ASTFunction; - void visit(ASTFunction & function, ASTPtr & ast) const; -}; - -using OptimizeDateFilterInPlaceMatcher = OneTypeMatcher; -using OptimizeDateFilterInPlaceVisitor = InDepthNodeVisitor; -} diff --git a/src/Interpreters/TreeOptimizer.cpp b/src/Interpreters/TreeOptimizer.cpp index 825114b20b7..c38b3c79026 100644 --- a/src/Interpreters/TreeOptimizer.cpp +++ b/src/Interpreters/TreeOptimizer.cpp @@ -25,7 +25,6 @@ #include #include #include -#include #include #include @@ -678,21 +677,6 @@ void optimizeInjectiveFunctionsInsideUniq(ASTPtr & query, ContextPtr context) RemoveInjectiveFunctionsVisitor(data).visit(query); } -void optimizeDateFilters(ASTSelectQuery * select_query) -{ - /// Predicates in HAVING clause has been moved to WHERE clause. - if (select_query->where()) - { - OptimizeDateFilterInPlaceVisitor::Data data; - OptimizeDateFilterInPlaceVisitor(data).visit(select_query->refWhere()); - } - if (select_query->prewhere()) - { - OptimizeDateFilterInPlaceVisitor::Data data; - OptimizeDateFilterInPlaceVisitor(data).visit(select_query->refPrewhere()); - } -} - void transformIfStringsIntoEnum(ASTPtr & query) { std::unordered_set function_names = {"if", "transform"}; @@ -796,9 +780,6 @@ void TreeOptimizer::apply(ASTPtr & query, TreeRewriterResult & result, tables_with_columns, result.storage_snapshot->metadata, result.storage); } - /// Rewrite date filters to avoid the calls of converters such as toYear, toYYYYMM, toISOWeek, etc. - optimizeDateFilters(select_query); - /// GROUP BY injective function elimination. optimizeGroupBy(select_query, context); diff --git a/tests/queries/0_stateless/02764_date_filter_predicate_optimizer.reference b/tests/queries/0_stateless/02764_date_filter_predicate_optimizer.reference deleted file mode 100644 index e5c608ddc1a..00000000000 --- a/tests/queries/0_stateless/02764_date_filter_predicate_optimizer.reference +++ /dev/null @@ -1,39 +0,0 @@ -SELECT value1 -FROM t -WHERE ((date1 >= \'1993-01-01\') AND (date1 <= \'1993-12-31\')) AND ((id >= 1) AND (id <= 3)) -SELECT value1 -FROM t -WHERE ((date1 < \'1993-01-01\') OR (date1 > \'1993-12-31\')) AND ((id >= 1) AND (id <= 3)) -SELECT value1 -FROM t -WHERE (date1 < \'1993-01-01\') AND ((id >= 1) AND (id <= 3)) -SELECT value1 -FROM t -WHERE (date1 > \'1993-12-31\') AND ((id >= 1) AND (id <= 3)) -SELECT value1 -FROM t -WHERE (date1 <= \'1993-12-31\') AND ((id >= 1) AND (id <= 3)) -SELECT value1 -FROM t -WHERE (date1 >= \'1993-01-01\') AND ((id >= 1) AND (id <= 3)) -SELECT value1 -FROM t -WHERE ((date1 >= \'1993-01-01\') AND (date1 <= \'1997-12-31\')) AND ((id >= 1) AND (id <= 3)) -SELECT value1 -FROM t -WHERE (((date1 >= \'1993-01-01\') AND (date1 <= \'1993-12-31\')) OR ((date1 >= \'1994-01-01\') AND (date1 <= \'1994-12-31\'))) AND ((id >= 1) AND (id <= 3)) -SELECT - value1, - toYear(date1) AS year1 -FROM t -WHERE ((date1 >= \'1993-01-01\') AND (date1 <= \'1993-12-31\')) AND ((id >= 1) AND (id <= 3)) -SELECT value1 -FROM t -WHERE (date1 < \'1993-01-01\') AND ((id >= 1) AND (id <= 3)) -SELECT value1 -FROM t -PREWHERE (date1 >= \'1993-01-01\') AND (date1 <= \'1993-12-31\') -WHERE ((date1 >= \'1993-01-01\') AND (date1 <= \'1993-12-31\')) AND ((id >= 1) AND (id <= 3)) -SELECT value1 -FROM t -WHERE ((id >= 1) AND (id <= 3)) AND ((date1 >= \'1993-01-01\') AND (date1 <= \'1993-12-31\')) diff --git a/tests/queries/0_stateless/02764_date_filter_predicate_optimizer.sql b/tests/queries/0_stateless/02764_date_filter_predicate_optimizer.sql deleted file mode 100644 index 563468d4f82..00000000000 --- a/tests/queries/0_stateless/02764_date_filter_predicate_optimizer.sql +++ /dev/null @@ -1,17 +0,0 @@ -DROP TABLE IF EXISTS t; -CREATE TABLE t (id UInt32, value1 String, date1 Date) ENGINE ReplacingMergeTree() ORDER BY id; - -EXPLAIN SYNTAX SELECT value1 FROM t WHERE toYear(date1) = 1993 AND id BETWEEN 1 AND 3; -EXPLAIN SYNTAX SELECT value1 FROM t WHERE toYear(date1) <> 1993 AND id BETWEEN 1 AND 3; -EXPLAIN SYNTAX SELECT value1 FROM t WHERE toYear(date1) < 1993 AND id BETWEEN 1 AND 3; -EXPLAIN SYNTAX SELECT value1 FROM t WHERE toYear(date1) > 1993 AND id BETWEEN 1 AND 3; -EXPLAIN SYNTAX SELECT value1 FROM t WHERE toYear(date1) <= 1993 AND id BETWEEN 1 AND 3; -EXPLAIN SYNTAX SELECT value1 FROM t WHERE toYear(date1) >= 1993 AND id BETWEEN 1 AND 3; -EXPLAIN SYNTAX SELECT value1 FROM t WHERE toYear(date1) BETWEEN 1993 AND 1997 AND id BETWEEN 1 AND 3; -EXPLAIN SYNTAX SELECT value1 FROM t WHERE (toYear(date1) = 1993 OR toYear(date1) = 1994) AND id BETWEEN 1 AND 3; -EXPLAIN SYNTAX SELECT value1, toYear(date1) as year1 FROM t WHERE year1 = 1993 AND id BETWEEN 1 AND 3; -EXPLAIN SYNTAX SELECT value1 FROM t WHERE 1993 > toYear(date1) AND id BETWEEN 1 AND 3; -EXPLAIN SYNTAX SELECT value1 FROM t PREWHERE toYear(date1) = 1993 WHERE id BETWEEN 1 AND 3; -EXPLAIN SYNTAX SELECT value1 FROM t WHERE id BETWEEN 1 AND 3 HAVING toYear(date1) = 1993; - -DROP TABLE t; From 1910d6580e6a6cd7ad985976ed08885b3b091219 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Tue, 6 Jun 2023 16:13:51 +0200 Subject: [PATCH 0552/1072] Add test for the reverted broken optimizations --- .../02783_date_predicate_optimizations.reference | 2 ++ .../02783_date_predicate_optimizations.sql | 13 +++++++++++++ 2 files changed, 15 insertions(+) create mode 100644 tests/queries/0_stateless/02783_date_predicate_optimizations.reference create mode 100644 tests/queries/0_stateless/02783_date_predicate_optimizations.sql diff --git a/tests/queries/0_stateless/02783_date_predicate_optimizations.reference b/tests/queries/0_stateless/02783_date_predicate_optimizations.reference new file mode 100644 index 00000000000..cd689b93034 --- /dev/null +++ b/tests/queries/0_stateless/02783_date_predicate_optimizations.reference @@ -0,0 +1,2 @@ +2021-12-31 23:00:00 0 +2021-12-31 23:00:00 0 diff --git a/tests/queries/0_stateless/02783_date_predicate_optimizations.sql b/tests/queries/0_stateless/02783_date_predicate_optimizations.sql new file mode 100644 index 00000000000..abb13f1005e --- /dev/null +++ b/tests/queries/0_stateless/02783_date_predicate_optimizations.sql @@ -0,0 +1,13 @@ +CREATE TABLE source +( + `ts` DateTime('UTC'), + `n` Int32 +) +ENGINE = MergeTree +PARTITION BY toYYYYMM(ts) +ORDER BY tuple(); + +INSERT INTO source values ('2021-12-31 23:00:00', 0); + +SELECT * FROM source WHERE toYYYYMM(ts) = 202112; +SELECT * FROM source WHERE toYear(ts) = 2021; From 88f2f4f5fc9a0a8b400d74064c44997b2304d27c Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Tue, 6 Jun 2023 16:16:49 +0200 Subject: [PATCH 0553/1072] Added createAsyncAzureReadBuffer --- .../AzureBlobStorage/AzureObjectStorage.h | 2 + src/Storages/StorageAzure.cpp | 89 ++++++++++++------- src/Storages/StorageAzure.h | 2 + 3 files changed, 63 insertions(+), 30 deletions(-) diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h index b3cda54e752..f5918f9d598 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h @@ -127,6 +127,8 @@ public: bool isRemote() const override { return true; } + MultiVersion getClient() { return client; } + private: const String name; /// client used to access the files in the Blob Storage cloud diff --git a/src/Storages/StorageAzure.cpp b/src/Storages/StorageAzure.cpp index e0a1d8c514e..95d3ae95a76 100644 --- a/src/Storages/StorageAzure.cpp +++ b/src/Storages/StorageAzure.cpp @@ -40,6 +40,9 @@ #include #include +#include +#include + using namespace Azure::Storage::Blobs; @@ -154,11 +157,6 @@ StorageAzure::Configuration StorageAzure::getConfiguration(ASTs & engine_args, C configuration.container = checkAndGetLiteralArgument(engine_args[1], "container"); configuration.blob_path = checkAndGetLiteralArgument(engine_args[2], "blobpath"); - LOG_INFO(&Poco::Logger::get("StorageAzure"), "connection_url = {}", configuration.connection_url); - LOG_INFO(&Poco::Logger::get("StorageAzure"), "container = {}", configuration.container); - LOG_INFO(&Poco::Logger::get("StorageAzure"), "blobpath = {}", configuration.blob_path); - - auto is_format_arg = [] (const std::string & s) -> bool { return s == "auto" || FormatFactory::instance().getAllFormats().contains(s); @@ -230,8 +228,6 @@ StorageAzure::Configuration StorageAzure::getConfiguration(ASTs & engine_args, C configuration.blobs_paths = {configuration.blob_path}; - LOG_INFO(&Poco::Logger::get("StorageAzure"), "get_format_from_file = {}", get_format_from_file); - if (configuration.format == "auto" && get_format_from_file) configuration.format = FormatFactory::instance().getFormatFromFileName(configuration.blob_path, true); @@ -310,7 +306,6 @@ AzureClientPtr StorageAzure::createClient(StorageAzure::Configuration configurat if (configuration.is_connection_string) { - LOG_INFO(&Poco::Logger::get("StorageAzure"), "createClient is_connection_string "); result = std::make_unique(BlobContainerClient::CreateFromConnectionString(configuration.connection_url, configuration.container)); result->CreateIfNotExists(); } @@ -416,18 +411,6 @@ StorageAzure::StorageAzure( for (const auto & key : configuration.blobs_paths) objects.emplace_back(key); - for (auto obj : objects) - { - LOG_INFO(&Poco::Logger::get("StorageAzure"), "constructor obj.remote_paths = {}", obj.remote_path); - if (object_storage->exists(obj)) - { - LOG_INFO(&Poco::Logger::get("StorageAzure"), "constructor exists obj.remote_paths = {}", obj.remote_path); -// auto read_buffer = object_storage->readObject(obj); -// LOG_INFO(&Poco::Logger::get("StorageAzure"), "constructor read size obj.remote_paths = {} , size = {}", obj.remote_path, read_buffer->getFileSize()); - } - } - - auto default_virtuals = NamesAndTypesList{ {"_path", std::make_shared(std::make_shared())}, {"_file", std::make_shared(std::make_shared())}}; @@ -1146,22 +1129,68 @@ std::unique_ptr StorageAzureSource::createAzureReadBuffer(const Stri { auto read_settings = getContext()->getReadSettings().adjustBufferSize(object_size); read_settings.enable_filesystem_cache = false; - //auto download_buffer_size = getContext()->getSettings().max_download_buffer_size; - //const bool object_too_small = object_size <= 2 * download_buffer_size; + auto download_buffer_size = getContext()->getSettings().max_download_buffer_size; + const bool object_too_small = object_size <= 2 * download_buffer_size; - ///// Create a read buffer that will prefetch the first ~1 MB of the file. - ///// When reading lots of tiny files, this prefetching almost doubles the throughput. - ///// For bigger files, parallel reading is more useful. - //if (object_too_small && read_settings.remote_fs_method == RemoteFSReadMethod::threadpool) - //{ - // LOG_TRACE(log, "Downloading object {} of size {} from S3 with initial prefetch", key, object_size); - // return object_storage->readObjects({StoredObject(key)}, read_settings, {}, object_size); - //} + // Create a read buffer that will prefetch the first ~1 MB of the file. + // When reading lots of tiny files, this prefetching almost doubles the throughput. + // For bigger files, parallel reading is more useful. + if (object_too_small && read_settings.remote_fs_method == RemoteFSReadMethod::threadpool) + { + LOG_TRACE(log, "Downloading object of size {} from Azure with initial prefetch", object_size); + return createAsyncAzureReadBuffer(key, read_settings, object_size); + } return object_storage->readObject(StoredObject(key), read_settings, {}, object_size); } +std::unique_ptr StorageAzureSource::createAsyncAzureReadBuffer( + const String & key, const ReadSettings & read_settings, size_t object_size) +{ + auto context = getContext(); + + const auto & context_settings = context->getSettingsRef(); + auto max_single_part_upload_size = context_settings.azure_max_single_part_upload_size; + auto max_single_read_retries = context_settings.azure_max_single_read_retries; + + auto read_buffer_creator = + [this, read_settings, max_single_part_upload_size, max_single_read_retries] + (const std::string & path, size_t read_until_position) -> std::unique_ptr + { + return std::make_unique( + object_storage->getClient().get(), + path, + read_settings, + max_single_part_upload_size, + max_single_read_retries, + /* use_external_buffer */true, + read_until_position); + }; + + auto azure_impl = std::make_unique( + std::move(read_buffer_creator), + StoredObjects{StoredObject{key, object_size}}, + read_settings, + /* cache_log */nullptr); + + auto modified_settings{read_settings}; + /// FIXME: Changing this setting to default value breaks something around parquet reading + modified_settings.remote_read_min_bytes_for_seek = modified_settings.remote_fs_buffer_size; + + auto & pool_reader = context->getThreadPoolReader(FilesystemReaderType::ASYNCHRONOUS_REMOTE_FS_READER); + + auto async_reader = std::make_unique( + std::move(azure_impl), pool_reader, modified_settings, + context->getAsyncReadCounters(), context->getFilesystemReadPrefetchesLog()); + + async_reader->setReadUntilEnd(); + if (read_settings.remote_fs_prefetch) + async_reader->prefetch(DEFAULT_PREFETCH_PRIORITY); + + return async_reader; +} + } #endif diff --git a/src/Storages/StorageAzure.h b/src/Storages/StorageAzure.h index f114184c336..cf3ed5e2596 100644 --- a/src/Storages/StorageAzure.h +++ b/src/Storages/StorageAzure.h @@ -274,6 +274,8 @@ private: std::future createReaderAsync(); std::unique_ptr createAzureReadBuffer(const String & key, size_t object_size); + std::unique_ptr createAsyncAzureReadBuffer( + const String & key, const ReadSettings & read_settings, size_t object_size); }; } From 75d0f9fc97f284a47eb8b4e8b03c3089631c46d4 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Tue, 6 Jun 2023 16:32:05 +0200 Subject: [PATCH 0554/1072] Updated to use readObjects for async --- src/Storages/StorageAzure.cpp | 34 +--------------------------------- 1 file changed, 1 insertion(+), 33 deletions(-) diff --git a/src/Storages/StorageAzure.cpp b/src/Storages/StorageAzure.cpp index 95d3ae95a76..c98430858eb 100644 --- a/src/Storages/StorageAzure.cpp +++ b/src/Storages/StorageAzure.cpp @@ -1148,41 +1148,9 @@ std::unique_ptr StorageAzureSource::createAzureReadBuffer(const Stri std::unique_ptr StorageAzureSource::createAsyncAzureReadBuffer( const String & key, const ReadSettings & read_settings, size_t object_size) { - auto context = getContext(); - - const auto & context_settings = context->getSettingsRef(); - auto max_single_part_upload_size = context_settings.azure_max_single_part_upload_size; - auto max_single_read_retries = context_settings.azure_max_single_read_retries; - - auto read_buffer_creator = - [this, read_settings, max_single_part_upload_size, max_single_read_retries] - (const std::string & path, size_t read_until_position) -> std::unique_ptr - { - return std::make_unique( - object_storage->getClient().get(), - path, - read_settings, - max_single_part_upload_size, - max_single_read_retries, - /* use_external_buffer */true, - read_until_position); - }; - - auto azure_impl = std::make_unique( - std::move(read_buffer_creator), - StoredObjects{StoredObject{key, object_size}}, - read_settings, - /* cache_log */nullptr); - auto modified_settings{read_settings}; - /// FIXME: Changing this setting to default value breaks something around parquet reading modified_settings.remote_read_min_bytes_for_seek = modified_settings.remote_fs_buffer_size; - - auto & pool_reader = context->getThreadPoolReader(FilesystemReaderType::ASYNCHRONOUS_REMOTE_FS_READER); - - auto async_reader = std::make_unique( - std::move(azure_impl), pool_reader, modified_settings, - context->getAsyncReadCounters(), context->getFilesystemReadPrefetchesLog()); + auto async_reader = object_storage->readObjects(StoredObjects{StoredObject{key, object_size}}, modified_settings); async_reader->setReadUntilEnd(); if (read_settings.remote_fs_prefetch) From cbe4ea67aec5f285e3cb3fd905917679041ced0a Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Tue, 6 Jun 2023 16:42:56 +0200 Subject: [PATCH 0555/1072] Removed unwanted code & debug lines --- .../ObjectStorages/AzureBlobStorage/AzureObjectStorage.h | 2 -- src/TableFunctions/TableFunctionS3.cpp | 4 ---- 2 files changed, 6 deletions(-) diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h index f5918f9d598..b3cda54e752 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h @@ -127,8 +127,6 @@ public: bool isRemote() const override { return true; } - MultiVersion getClient() { return client; } - private: const String name; /// client used to access the files in the Blob Storage cloud diff --git a/src/TableFunctions/TableFunctionS3.cpp b/src/TableFunctions/TableFunctionS3.cpp index 7f283afd6b4..e63f32b1cbc 100644 --- a/src/TableFunctions/TableFunctionS3.cpp +++ b/src/TableFunctions/TableFunctionS3.cpp @@ -294,8 +294,6 @@ void TableFunctionS3::addColumnsStructureToArguments(ASTs & args, const String & ColumnsDescription TableFunctionS3::getActualTableStructure(ContextPtr context) const { - LOG_INFO(&Poco::Logger::get("TableFunctionS3"), "getActualTableStructure configuration.structure = {} ",configuration.structure); - if (configuration.structure == "auto") { context->checkAccess(getSourceAccessType()); @@ -321,8 +319,6 @@ StoragePtr TableFunctionS3::executeImpl(const ASTPtr & /*ast_function*/, Context else if (!structure_hint.empty()) columns = structure_hint; - LOG_INFO(&Poco::Logger::get("TableFunctionS3"), "executeImpl structre = {} structure_hint = {} ",configuration.structure, structure_hint.getAll().toString()); - StoragePtr storage = std::make_shared( configuration, From ebae79f7d410dd67879e041d2db4395d852bcae3 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 6 Jun 2023 16:57:51 +0200 Subject: [PATCH 0556/1072] Schema inference --- src/Core/Settings.h | 1 + src/Storages/StorageAzure.cpp | 145 ++++++++++++++++-- src/Storages/StorageAzure.h | 28 ++-- .../test_storage_azure_blob_storage/test.py | 48 ++++++ 4 files changed, 200 insertions(+), 22 deletions(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index a484e8e816d..b95dc9a26b3 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -710,6 +710,7 @@ class IColumn; \ M(Bool, schema_inference_use_cache_for_file, true, "Use cache in schema inference while using file table function", 0) \ M(Bool, schema_inference_use_cache_for_s3, true, "Use cache in schema inference while using s3 table function", 0) \ + M(Bool, schema_inference_use_cache_for_azure, true, "Use cache in schema inference while using azure table function", 0) \ M(Bool, schema_inference_use_cache_for_hdfs, true, "Use cache in schema inference while using hdfs table function", 0) \ M(Bool, schema_inference_use_cache_for_url, true, "Use cache in schema inference while using url table function", 0) \ M(Bool, schema_inference_cache_require_modification_time_for_url, true, "Use schema from cache for URL with last modification time validation (for urls with Last-Modified header)", 0) \ diff --git a/src/Storages/StorageAzure.cpp b/src/Storages/StorageAzure.cpp index e0a1d8c514e..4c47fe318c6 100644 --- a/src/Storages/StorageAzure.cpp +++ b/src/Storages/StorageAzure.cpp @@ -58,6 +58,7 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; extern const int DATABASE_ACCESS_DENIED; extern const int CANNOT_COMPILE_REGEXP; + extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; } namespace @@ -403,7 +404,8 @@ StorageAzure::StorageAzure( StorageInMemoryMetadata storage_metadata; if (columns_.empty()) { - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Schema inference is not supported yet"); + auto columns = getTableStructureFromDataImpl(context); + storage_metadata.setColumns(columns); } else storage_metadata.setColumns(columns_); @@ -624,13 +626,13 @@ Pipe StorageAzure::read( /// Iterate through disclosed globs and make a source for each file iterator_wrapper = std::make_shared( object_storage.get(), configuration.container, std::nullopt, - configuration.blob_path, query_info.query, virtual_block, local_context); + configuration.blob_path, query_info.query, virtual_block, local_context, nullptr); } else { iterator_wrapper = std::make_shared( object_storage.get(), configuration.container, configuration.blobs_paths, - std::nullopt, query_info.query, virtual_block, local_context); + std::nullopt, query_info.query, virtual_block, local_context, nullptr); } ColumnsDescription columns_description; @@ -800,7 +802,8 @@ StorageAzureSource::Iterator::Iterator( std::optional blob_path_with_globs_, ASTPtr query_, const Block & virtual_header_, - ContextPtr context_) + ContextPtr context_, + RelativePathsWithMetadata * outer_blobs_) : WithContext(context_) , object_storage(object_storage_) , container(container_) @@ -808,6 +811,7 @@ StorageAzureSource::Iterator::Iterator( , blob_path_with_globs(blob_path_with_globs_) , query(query_) , virtual_header(virtual_header_) + , outer_blobs(outer_blobs_) { if (keys.has_value() && blob_path_with_globs.has_value()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot specify keys and glob simulatenously it's a bug"); @@ -854,6 +858,8 @@ StorageAzureSource::Iterator::Iterator( ObjectMetadata object_metadata = object_storage->getObjectMetadata(key); total_size += object_metadata.size_bytes; blobs_with_metadata->emplace_back(RelativePathWithMetadata{key, object_metadata}); + if (outer_blobs) + outer_blobs->emplace_back(blobs_with_metadata->back()); } } else @@ -866,6 +872,8 @@ StorageAzureSource::Iterator::Iterator( { ObjectMetadata object_metadata = object_storage->getObjectMetadata(*blob_path_with_globs); blobs_with_metadata->emplace_back(*blob_path_with_globs, object_metadata); + if (outer_blobs) + outer_blobs->emplace_back(blobs_with_metadata->back()); return; } @@ -903,37 +911,29 @@ RelativePathWithMetadata StorageAzureSource::Iterator::next() } else { - LOG_DEBUG(&Poco::Logger::get("DEBUG"), "GLOBS IN NEXt"); if (!blobs_with_metadata || index >= blobs_with_metadata->size()) { - LOG_DEBUG(&Poco::Logger::get("DEBUG"), "INITIALIZING BLOBS BATCH"); RelativePathsWithMetadata new_batch; while (new_batch.empty()) { if (object_storage_iterator->isValid()) { - LOG_DEBUG(&Poco::Logger::get("DEBUG"), "ITERATOR VALID FETCHING BATCH"); new_batch = object_storage_iterator->currentBatch(); - LOG_DEBUG(&Poco::Logger::get("DEBUG"), "BATCH SIZE {}", new_batch.size()); object_storage_iterator->nextBatch(); } else { - LOG_DEBUG(&Poco::Logger::get("DEBUG"), "ITERATOR INVALID"); is_finished = true; return {}; } for (auto it = new_batch.begin(); it != new_batch.end();) { - LOG_DEBUG(&Poco::Logger::get("DEBUG"), "ITERATOR FILTER {} MATCH {}", it->relative_path, re2::RE2::FullMatch(it->relative_path, *matcher)); if (!recursive && !re2::RE2::FullMatch(it->relative_path, *matcher)) it = new_batch.erase(it); else ++it; } - - LOG_DEBUG(&Poco::Logger::get("DEBUG"), "NEW BATCH AFTER FILTEr {}", new_batch.size()); } index.store(0, std::memory_order_relaxed); @@ -958,10 +958,15 @@ RelativePathWithMetadata StorageAzureSource::Iterator::next() { total_size.fetch_add(new_batch[idx].metadata.size_bytes, std::memory_order_relaxed); blobs_with_metadata->emplace_back(std::move(new_batch[idx])); + if (outer_blobs) + outer_blobs->emplace_back(blobs_with_metadata->back()); } } else { + if (outer_blobs) + outer_blobs->insert(outer_blobs->end(), new_batch.begin(), new_batch.end()); + blobs_with_metadata = std::move(new_batch); for (const auto & [_, info] : *blobs_with_metadata) total_size.fetch_add(info.size_bytes, std::memory_order_relaxed); @@ -1161,6 +1166,122 @@ std::unique_ptr StorageAzureSource::createAzureReadBuffer(const Stri return object_storage->readObject(StoredObject(key), read_settings, {}, object_size); } +ColumnsDescription StorageAzure::getTableStructureFromDataImpl(ContextPtr ctx) +{ + RelativePathsWithMetadata read_keys; + std::shared_ptr file_iterator; + if (configuration.withGlobs()) + { + file_iterator = std::make_shared( + object_storage.get(), configuration.container, std::nullopt, + configuration.blob_path, nullptr, virtual_block, ctx, &read_keys); + } + else + { + file_iterator = std::make_shared( + object_storage.get(), configuration.container, configuration.blobs_paths, + std::nullopt, nullptr, virtual_block, ctx, &read_keys); + } + + std::optional columns_from_cache; + size_t prev_read_keys_size = read_keys.size(); + if (ctx->getSettingsRef().schema_inference_use_cache_for_azure) + columns_from_cache = tryGetColumnsFromCache(read_keys.begin(), read_keys.end(), ctx); + + ReadBufferIterator read_buffer_iterator = [&, first = true](ColumnsDescription & cached_columns) mutable -> std::unique_ptr + { + auto [key, metadata] = file_iterator->next(); + + if (key.empty()) + { + if (first) + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "Cannot extract table structure from {} format file, because there are no files with provided path " + "in S3. You must specify table structure manually", configuration.format); + + return nullptr; + } + + /// S3 file iterator could get new keys after new iteration, check them in schema cache. + if (ctx->getSettingsRef().schema_inference_use_cache_for_azure && read_keys.size() > prev_read_keys_size) + { + columns_from_cache = tryGetColumnsFromCache(read_keys.begin() + prev_read_keys_size, read_keys.end(), ctx); + prev_read_keys_size = read_keys.size(); + if (columns_from_cache) + { + cached_columns = *columns_from_cache; + return nullptr; + } + } + + first = false; + int zstd_window_log_max = static_cast(ctx->getSettingsRef().zstd_window_log_max); + return wrapReadBufferWithCompressionMethod( + object_storage->readObject(StoredObject(key), ctx->getReadSettings(), {}, metadata.size_bytes), + chooseCompressionMethod(key, configuration.compression_method), + zstd_window_log_max); + }; + + ColumnsDescription columns; + if (columns_from_cache) + columns = *columns_from_cache; + else + columns = readSchemaFromFormat(configuration.format, format_settings, read_buffer_iterator, configuration.withGlobs(), ctx); + + if (ctx->getSettingsRef().schema_inference_use_cache_for_azure) + addColumnsToCache(read_keys, columns, configuration.format, ctx); + + return columns; + +} + +std::optional StorageAzure::tryGetColumnsFromCache( + const RelativePathsWithMetadata::const_iterator & begin, + const RelativePathsWithMetadata::const_iterator & end, + const ContextPtr & ctx) +{ + auto & schema_cache = getSchemaCache(ctx); + for (auto it = begin; it < end; ++it) + { + auto get_last_mod_time = [&] -> time_t + { + return it->metadata.last_modified->epochTime(); + }; + + auto host_and_bucket = configuration.connection_url + '/' + configuration.container; + String source = host_and_bucket + '/' + it->relative_path; + auto cache_key = getKeyForSchemaCache(source, configuration.format, format_settings, ctx); + auto columns = schema_cache.tryGet(cache_key, get_last_mod_time); + if (columns) + return columns; + } + + return std::nullopt; + +} + +void StorageAzure::addColumnsToCache( + const RelativePathsWithMetadata & keys, + const ColumnsDescription & columns, + const String & format_name, + const ContextPtr & ctx) +{ + auto host_and_bucket = configuration.connection_url + '/' + configuration.container; + Strings sources; + sources.reserve(keys.size()); + std::transform(keys.begin(), keys.end(), std::back_inserter(sources), [&](const auto & elem){ return host_and_bucket + '/' + elem.relative_path; }); + auto cache_keys = getKeysForSchemaCache(sources, format_name, format_settings, ctx); + auto & schema_cache = getSchemaCache(ctx); + schema_cache.addMany(cache_keys, columns); +} + +SchemaCache & StorageAzure::getSchemaCache(const ContextPtr & ctx) +{ + static SchemaCache schema_cache(ctx->getConfigRef().getUInt("schema_inference_cache_max_elements_for_azure", DEFAULT_SCHEMA_CACHE_ELEMENTS)); + return schema_cache; +} + } diff --git a/src/Storages/StorageAzure.h b/src/Storages/StorageAzure.h index f114184c336..83bcb874efc 100644 --- a/src/Storages/StorageAzure.h +++ b/src/Storages/StorageAzure.h @@ -79,10 +79,6 @@ public: static StorageAzure::Configuration getConfiguration(ASTs & engine_args, ContextPtr local_context, bool get_format_from_file = true); static AzureClientPtr createClient(StorageAzure::Configuration configuration); static AzureObjectStorage::SettingsPtr createSettings(StorageAzure::Configuration configuration); - static ColumnsDescription getTableStructureFromData( - const StorageAzure::Configuration & configuration, - const std::optional & format_settings, - ContextPtr ctx); String getName() const override { @@ -127,10 +123,19 @@ private: std::optional format_settings; ASTPtr partition_by; - static ColumnsDescription getTableStructureFromDataImpl( - const Configuration & configuration, - const std::optional & format_settings, - ContextPtr ctx); + ColumnsDescription getTableStructureFromDataImpl(ContextPtr ctx); + + std::optional tryGetColumnsFromCache( + const RelativePathsWithMetadata::const_iterator & begin, + const RelativePathsWithMetadata::const_iterator & end, + const ContextPtr & ctx); + + void addColumnsToCache( + const RelativePathsWithMetadata & keys, + const ColumnsDescription & columns, + const String & format_name, + const ContextPtr & ctx); + }; @@ -147,12 +152,14 @@ public: std::optional blob_path_with_globs_, ASTPtr query_, const Block & virtual_header_, - ContextPtr context_); + ContextPtr context_, + RelativePathsWithMetadata * outer_blobs_); RelativePathWithMetadata next(); size_t getTotalSize() const; ~Iterator() = default; - private: + + private: AzureObjectStorage * object_storage; std::string container; std::optional keys; @@ -165,6 +172,7 @@ public: std::atomic total_size = 0; std::optional blobs_with_metadata; + RelativePathsWithMetadata * outer_blobs; ObjectStorageIteratorPtr object_storage_iterator; bool recursive{false}; diff --git a/tests/integration/test_storage_azure_blob_storage/test.py b/tests/integration/test_storage_azure_blob_storage/test.py index 221005e414b..83a7bf71181 100644 --- a/tests/integration/test_storage_azure_blob_storage/test.py +++ b/tests/integration/test_storage_azure_blob_storage/test.py @@ -373,3 +373,51 @@ def test_storage_azure_get_gzip(cluster, extension, method): assert azure_query(node, f"SELECT sum(id) FROM {name}").splitlines() == ["565"] azure_query(node, f"DROP TABLE {name}") + + +def test_schema_inference_no_globs(cluster): + node = cluster.instances["node"] # type: ClickHouseInstance + table_format = "column1 UInt32, column2 String, column3 UInt32" + azure_query(node, f"CREATE TABLE test_schema_inference_src ({table_format}) Engine = Azure(azure_conf2, container='cont', blob_path='test_schema_inference_no_globs.csv', format='CSVWithNames')") + + query = f"insert into test_schema_inference_src SELECT number, toString(number), number * number FROM numbers(1000)" + azure_query(node, query) + + azure_query(node, f"CREATE TABLE test_select_inference Engine = Azure(azure_conf2, container='cont', blob_path='test_schema_inference_no_globs.csv')") + + print(node.query("SHOW CREATE TABLE test_select_inference")) + + query = "select sum(column1), sum(length(column2)), sum(column3), min(_file), max(_path) from test_select_inference" + assert azure_query(node, query).splitlines() == ['499500\t2890\t332833500\ttest_schema_inference_no_globs.csv\tcont/test_schema_inference_no_globs.csv'] + + +def test_schema_inference_from_globs(cluster): + node = cluster.instances["node"] + unique_prefix = random.randint(1, 10000) + node = cluster.instances["node"] # type: ClickHouseInstance + table_format = "column1 UInt32, column2 UInt32, column3 UInt32" + max_path = "" + for i in range(10): + for j in range(10): + path = "{}/{}_{}/{}.csv".format( + unique_prefix, i, random.choice(["a", "b", "c", "d"]), j + ) + max_path = max(path, max_path) + values = f"({i},{j},{i + j})" + + azure_query(node, f"CREATE TABLE test_schema_{i}_{j} ({table_format}) Engine = Azure(azure_conf2, container='cont', blob_path='{path}', format='CSVWithNames')") + + query = f"insert into test_schema_{i}_{j} VALUES {values}" + azure_query(node, query) + + + azure_query(node, f"CREATE TABLE test_glob_select_inference Engine = Azure(azure_conf2, container='cont', blob_path='{unique_prefix}/*_{{a,b,c,d}}/?.csv')") + + print(node.query("SHOW CREATE TABLE test_glob_select_inference")) + + query = "select sum(column1), sum(column2), sum(column3), min(_file), max(_path) from test_glob_select_inference" + assert azure_query(node, query).splitlines() == [ + "450\t450\t900\t0.csv\t{bucket}/{max_path}".format( + bucket='cont', max_path=max_path + ) + ] From 3bda231203644989f0be3edcc390271f14d72cb4 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Tue, 6 Jun 2023 15:11:21 +0000 Subject: [PATCH 0557/1072] Automatic style fix --- .../test_storage_azure_blob_storage/test.py | 27 ++++++++++++++----- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/tests/integration/test_storage_azure_blob_storage/test.py b/tests/integration/test_storage_azure_blob_storage/test.py index 9b71ff1a490..621af773160 100644 --- a/tests/integration/test_storage_azure_blob_storage/test.py +++ b/tests/integration/test_storage_azure_blob_storage/test.py @@ -400,17 +400,25 @@ def test_storage_azure_get_gzip(cluster, extension, method): def test_schema_inference_no_globs(cluster): node = cluster.instances["node"] # type: ClickHouseInstance table_format = "column1 UInt32, column2 String, column3 UInt32" - azure_query(node, f"CREATE TABLE test_schema_inference_src ({table_format}) Engine = Azure(azure_conf2, container='cont', blob_path='test_schema_inference_no_globs.csv', format='CSVWithNames')") + azure_query( + node, + f"CREATE TABLE test_schema_inference_src ({table_format}) Engine = Azure(azure_conf2, container='cont', blob_path='test_schema_inference_no_globs.csv', format='CSVWithNames')", + ) query = f"insert into test_schema_inference_src SELECT number, toString(number), number * number FROM numbers(1000)" azure_query(node, query) - azure_query(node, f"CREATE TABLE test_select_inference Engine = Azure(azure_conf2, container='cont', blob_path='test_schema_inference_no_globs.csv')") + azure_query( + node, + f"CREATE TABLE test_select_inference Engine = Azure(azure_conf2, container='cont', blob_path='test_schema_inference_no_globs.csv')", + ) print(node.query("SHOW CREATE TABLE test_select_inference")) query = "select sum(column1), sum(length(column2)), sum(column3), min(_file), max(_path) from test_select_inference" - assert azure_query(node, query).splitlines() == ['499500\t2890\t332833500\ttest_schema_inference_no_globs.csv\tcont/test_schema_inference_no_globs.csv'] + assert azure_query(node, query).splitlines() == [ + "499500\t2890\t332833500\ttest_schema_inference_no_globs.csv\tcont/test_schema_inference_no_globs.csv" + ] def test_schema_inference_from_globs(cluster): @@ -427,19 +435,24 @@ def test_schema_inference_from_globs(cluster): max_path = max(path, max_path) values = f"({i},{j},{i + j})" - azure_query(node, f"CREATE TABLE test_schema_{i}_{j} ({table_format}) Engine = Azure(azure_conf2, container='cont', blob_path='{path}', format='CSVWithNames')") + azure_query( + node, + f"CREATE TABLE test_schema_{i}_{j} ({table_format}) Engine = Azure(azure_conf2, container='cont', blob_path='{path}', format='CSVWithNames')", + ) query = f"insert into test_schema_{i}_{j} VALUES {values}" azure_query(node, query) - - azure_query(node, f"CREATE TABLE test_glob_select_inference Engine = Azure(azure_conf2, container='cont', blob_path='{unique_prefix}/*_{{a,b,c,d}}/?.csv')") + azure_query( + node, + f"CREATE TABLE test_glob_select_inference Engine = Azure(azure_conf2, container='cont', blob_path='{unique_prefix}/*_{{a,b,c,d}}/?.csv')", + ) print(node.query("SHOW CREATE TABLE test_glob_select_inference")) query = "select sum(column1), sum(column2), sum(column3), min(_file), max(_path) from test_glob_select_inference" assert azure_query(node, query).splitlines() == [ "450\t450\t900\t0.csv\t{bucket}/{max_path}".format( - bucket='cont', max_path=max_path + bucket="cont", max_path=max_path ) ] From 6c82ee45e2f1886219967dbadcde8e6a59bd84b1 Mon Sep 17 00:00:00 2001 From: Nikolay Degterinsky <43110995+evillique@users.noreply.github.com> Date: Tue, 6 Jun 2023 18:27:16 +0200 Subject: [PATCH 0558/1072] Fix build --- src/Interpreters/tests/gtest_convertFieldToType.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Interpreters/tests/gtest_convertFieldToType.cpp b/src/Interpreters/tests/gtest_convertFieldToType.cpp index f4de36cbecc..e259418d1c3 100644 --- a/src/Interpreters/tests/gtest_convertFieldToType.cpp +++ b/src/Interpreters/tests/gtest_convertFieldToType.cpp @@ -9,7 +9,6 @@ #include #include "base/Decimal.h" #include "base/types.h" -#include "gtest/gtest.h" using namespace DB; From 38edd6c3e778aad7a5a8a38c294c0c1e340990a1 Mon Sep 17 00:00:00 2001 From: Nikolay Degterinsky <43110995+evillique@users.noreply.github.com> Date: Tue, 6 Jun 2023 18:28:34 +0200 Subject: [PATCH 0559/1072] Update src/Interpreters/tests/gtest_convertFieldToType.cpp --- src/Interpreters/tests/gtest_convertFieldToType.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Interpreters/tests/gtest_convertFieldToType.cpp b/src/Interpreters/tests/gtest_convertFieldToType.cpp index e259418d1c3..cda9311dcbe 100644 --- a/src/Interpreters/tests/gtest_convertFieldToType.cpp +++ b/src/Interpreters/tests/gtest_convertFieldToType.cpp @@ -56,7 +56,7 @@ TEST_P(ConvertFieldToTypeTest, convert) } // Basically, the number of seconds in a day works for UTC here -const long long int Day = 24 * 60 * 60; +const Int64 Day = 24 * 60 * 60; // 123 is arbitrary value here From 473743b49fa7f37ee869d694e9233b4678efbd98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Tue, 6 Jun 2023 18:38:32 +0200 Subject: [PATCH 0560/1072] Disable pure parallel replicas if trivial count optimization is possible (#50594) --- src/Interpreters/InterpreterSelectQuery.cpp | 116 ++++++++++-------- src/Interpreters/InterpreterSelectQuery.h | 2 + src/Planner/PlannerJoinTree.cpp | 19 ++- ...licas_trivial_count_optimization.reference | 12 ++ ...lel_replicas_trivial_count_optimization.sh | 95 ++++++++++++++ 5 files changed, 191 insertions(+), 53 deletions(-) create mode 100644 tests/queries/0_stateless/02783_parallel_replicas_trivial_count_optimization.reference create mode 100755 tests/queries/0_stateless/02783_parallel_replicas_trivial_count_optimization.sh diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index d2be48dafb3..e84a400a220 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -833,6 +833,19 @@ InterpreterSelectQuery::InterpreterSelectQuery( need_analyze_again = true; } + if (can_analyze_again + && settings.max_parallel_replicas > 1 + && settings.allow_experimental_parallel_reading_from_replicas > 0 + && settings.parallel_replicas_custom_key.value.empty() + && getTrivialCount(0).has_value()) + { + /// The query could use trivial count if it didn't use parallel replicas, so let's disable it and reanalyze + context->setSetting("allow_experimental_parallel_reading_from_replicas", Field(0)); + context->setSetting("max_parallel_replicas", UInt64{0}); + need_analyze_again = true; + LOG_TRACE(log, "Disabling parallel replicas to be able to use a trivial count optimization"); + } + if (need_analyze_again) { size_t current_query_analyze_count = context->getQueryContext()->kitchen_sink.analyze_counter.load(); @@ -2254,79 +2267,84 @@ void InterpreterSelectQuery::addPrewhereAliasActions() } } -void InterpreterSelectQuery::executeFetchColumns(QueryProcessingStage::Enum processing_stage, QueryPlan & query_plan) +/// Based on the query analysis, check if optimizing the count trivial count to use totalRows is possible +std::optional InterpreterSelectQuery::getTrivialCount(UInt64 max_parallel_replicas) { - auto & query = getSelectQuery(); const Settings & settings = context->getSettingsRef(); - - /// Optimization for trivial query like SELECT count() FROM table. bool optimize_trivial_count = syntax_analyzer_result->optimize_trivial_count - && (settings.max_parallel_replicas <= 1) + && (max_parallel_replicas <= 1) && !settings.allow_experimental_query_deduplication && !settings.empty_result_for_aggregation_by_empty_set && storage && storage->getName() != "MaterializedMySQL" && !storage->hasLightweightDeletedMask() && query_info.filter_asts.empty() - && processing_stage == QueryProcessingStage::FetchColumns && query_analyzer->hasAggregation() && (query_analyzer->aggregates().size() == 1) && typeid_cast(query_analyzer->aggregates()[0].function.get()); - if (optimize_trivial_count) + if (!optimize_trivial_count) + return {}; + + auto & query = getSelectQuery(); + if (!query.prewhere() && !query.where() && !context->getCurrentTransaction()) + { + return storage->totalRows(settings); + } + else + { + // It's possible to optimize count() given only partition predicates + SelectQueryInfo temp_query_info; + temp_query_info.query = query_ptr; + temp_query_info.syntax_analyzer_result = syntax_analyzer_result; + temp_query_info.prepared_sets = query_analyzer->getPreparedSets(); + + return storage->totalRowsByPartitionPredicate(temp_query_info, context); + } +} + +void InterpreterSelectQuery::executeFetchColumns(QueryProcessingStage::Enum processing_stage, QueryPlan & query_plan) +{ + auto & query = getSelectQuery(); + const Settings & settings = context->getSettingsRef(); + std::optional num_rows; + + /// Optimization for trivial query like SELECT count() FROM table. + if (processing_stage == QueryProcessingStage::FetchColumns && (num_rows = getTrivialCount(settings.max_parallel_replicas))) { const auto & desc = query_analyzer->aggregates()[0]; const auto & func = desc.function; - std::optional num_rows{}; + const AggregateFunctionCount & agg_count = static_cast(*func); - if (!query.prewhere() && !query.where() && !context->getCurrentTransaction()) - { - num_rows = storage->totalRows(settings); - } - else // It's possible to optimize count() given only partition predicates - { - SelectQueryInfo temp_query_info; - temp_query_info.query = query_ptr; - temp_query_info.syntax_analyzer_result = syntax_analyzer_result; - temp_query_info.prepared_sets = query_analyzer->getPreparedSets(); + /// We will process it up to "WithMergeableState". + std::vector state(agg_count.sizeOfData()); + AggregateDataPtr place = state.data(); - num_rows = storage->totalRowsByPartitionPredicate(temp_query_info, context); - } + agg_count.create(place); + SCOPE_EXIT_MEMORY_SAFE(agg_count.destroy(place)); - if (num_rows) - { - const AggregateFunctionCount & agg_count = static_cast(*func); + agg_count.set(place, *num_rows); - /// We will process it up to "WithMergeableState". - std::vector state(agg_count.sizeOfData()); - AggregateDataPtr place = state.data(); + auto column = ColumnAggregateFunction::create(func); + column->insertFrom(place); - agg_count.create(place); - SCOPE_EXIT_MEMORY_SAFE(agg_count.destroy(place)); + Block header = analysis_result.before_aggregation->getResultColumns(); + size_t arguments_size = desc.argument_names.size(); + DataTypes argument_types(arguments_size); + for (size_t j = 0; j < arguments_size; ++j) + argument_types[j] = header.getByName(desc.argument_names[j]).type; - agg_count.set(place, *num_rows); + Block block_with_count{ + {std::move(column), std::make_shared(func, argument_types, desc.parameters), desc.column_name}}; - auto column = ColumnAggregateFunction::create(func); - column->insertFrom(place); - - Block header = analysis_result.before_aggregation->getResultColumns(); - size_t arguments_size = desc.argument_names.size(); - DataTypes argument_types(arguments_size); - for (size_t j = 0; j < arguments_size; ++j) - argument_types[j] = header.getByName(desc.argument_names[j]).type; - - Block block_with_count{ - {std::move(column), std::make_shared(func, argument_types, desc.parameters), desc.column_name}}; - - auto source = std::make_shared(block_with_count); - auto prepared_count = std::make_unique(Pipe(std::move(source))); - prepared_count->setStepDescription("Optimized trivial count"); - query_plan.addStep(std::move(prepared_count)); - from_stage = QueryProcessingStage::WithMergeableState; - analysis_result.first_stage = false; - return; - } + auto source = std::make_shared(block_with_count); + auto prepared_count = std::make_unique(Pipe(std::move(source))); + prepared_count->setStepDescription("Optimized trivial count"); + query_plan.addStep(std::move(prepared_count)); + from_stage = QueryProcessingStage::WithMergeableState; + analysis_result.first_stage = false; + return; } /// Limitation on the number of columns to read. diff --git a/src/Interpreters/InterpreterSelectQuery.h b/src/Interpreters/InterpreterSelectQuery.h index e39dd675136..0739e818cd6 100644 --- a/src/Interpreters/InterpreterSelectQuery.h +++ b/src/Interpreters/InterpreterSelectQuery.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include @@ -187,6 +188,7 @@ private: void executeExtremes(QueryPlan & query_plan); void executeSubqueriesInSetsAndJoins(QueryPlan & query_plan); bool autoFinalOnQuery(ASTSelectQuery & select_query); + std::optional getTrivialCount(UInt64 max_parallel_replicas); enum class Modificator { diff --git a/src/Planner/PlannerJoinTree.cpp b/src/Planner/PlannerJoinTree.cpp index 4f091f73187..9672738ae6b 100644 --- a/src/Planner/PlannerJoinTree.cpp +++ b/src/Planner/PlannerJoinTree.cpp @@ -170,7 +170,7 @@ bool applyTrivialCountIfPossible( QueryPlan & query_plan, const TableNode & table_node, const QueryTreeNodePtr & query_tree, - const ContextPtr & query_context, + ContextMutablePtr & query_context, const Names & columns_names) { const auto & settings = query_context->getSettingsRef(); @@ -208,8 +208,7 @@ bool applyTrivialCountIfPossible( if (storage->hasLightweightDeletedMask()) return false; - if (settings.max_parallel_replicas > 1 || - settings.allow_experimental_query_deduplication + if (settings.allow_experimental_query_deduplication || settings.empty_result_for_aggregation_by_empty_set) return false; @@ -228,6 +227,18 @@ bool applyTrivialCountIfPossible( if (!num_rows) return false; + if (settings.max_parallel_replicas > 1) + { + if (!settings.parallel_replicas_custom_key.value.empty() || settings.allow_experimental_parallel_reading_from_replicas == 0) + return false; + + /// The query could use trivial count if it didn't use parallel replicas, so let's disable it + query_context->setSetting("allow_experimental_parallel_reading_from_replicas", Field(0)); + query_context->setSetting("max_parallel_replicas", UInt64{0}); + LOG_TRACE(&Poco::Logger::get("Planner"), "Disabling parallel replicas to be able to use a trivial count optimization"); + + } + /// Set aggregation state const AggregateFunctionCount & agg_count = *count_func; std::vector state(agg_count.sizeOfData()); @@ -619,7 +630,7 @@ JoinTreeQueryPlan buildQueryPlanForTableExpression(QueryTreeNodePtr table_expres is_single_table_expression && table_node && select_query_info.has_aggregates && - applyTrivialCountIfPossible(query_plan, *table_node, select_query_info.query_tree, planner_context->getQueryContext(), table_expression_data.getColumnNames()); + applyTrivialCountIfPossible(query_plan, *table_node, select_query_info.query_tree, planner_context->getMutableQueryContext(), table_expression_data.getColumnNames()); if (is_trivial_count_applied) { diff --git a/tests/queries/0_stateless/02783_parallel_replicas_trivial_count_optimization.reference b/tests/queries/0_stateless/02783_parallel_replicas_trivial_count_optimization.reference new file mode 100644 index 00000000000..48795e2cd39 --- /dev/null +++ b/tests/queries/0_stateless/02783_parallel_replicas_trivial_count_optimization.reference @@ -0,0 +1,12 @@ +100000 +100000 +100000 +100000 +100000 +100000 +02783_count-default_0_disabled Not parallel 1 16 +02783_count-default_0_pure Not parallel 1 16 +02783_count-default_0_pure_analyzer Not parallel 1 16 +02783_count-default_1_disabled Not parallel 1 16 +02783_count-default_1_pure Not parallel 1 16 +02783_count-default_1_pure_analyzer Not parallel 1 16 diff --git a/tests/queries/0_stateless/02783_parallel_replicas_trivial_count_optimization.sh b/tests/queries/0_stateless/02783_parallel_replicas_trivial_count_optimization.sh new file mode 100755 index 00000000000..4c29e513183 --- /dev/null +++ b/tests/queries/0_stateless/02783_parallel_replicas_trivial_count_optimization.sh @@ -0,0 +1,95 @@ +#!/usr/bin/env bash + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +function has_used_parallel_replicas () { + $CLICKHOUSE_CLIENT --query " + SELECT + initial_query_id, + if(count() != 2, 'Used parallel', 'Not parallel'), + sumIf(read_rows, is_initial_query) as read_rows, + sumIf(read_bytes, is_initial_query) as read_bytes + FROM system.query_log + WHERE event_date >= yesterday() and initial_query_id LIKE '$1%' + GROUP BY initial_query_id + ORDER BY min(event_time_microseconds) ASC + FORMAT TSV" +} + +function run_query_with_pure_parallel_replicas () { + $CLICKHOUSE_CLIENT \ + --query "$2" \ + --query_id "${1}_disabled" \ + --max_parallel_replicas 0 + + $CLICKHOUSE_CLIENT \ + --query "$2" \ + --query_id "${1}_pure" \ + --max_parallel_replicas 3 \ + --prefer_localhost_replica 1 \ + --use_hedged_requests 0 \ + --cluster_for_parallel_replicas 'test_cluster_one_shard_three_replicas_localhost' \ + --allow_experimental_parallel_reading_from_replicas 1 \ + --allow_experimental_analyzer 0 + + # Not implemented yet + $CLICKHOUSE_CLIENT \ + --query "$2" \ + --query_id "${1}_pure_analyzer" \ + --max_parallel_replicas 3 \ + --prefer_localhost_replica 1 \ + --use_hedged_requests 0 \ + --cluster_for_parallel_replicas 'test_cluster_one_shard_three_replicas_localhost' \ + --allow_experimental_parallel_reading_from_replicas 1 \ + --allow_experimental_analyzer 1 +} + +function run_query_with_custom_key_parallel_replicas () { + $CLICKHOUSE_CLIENT \ + --query "$2" \ + --query_id "${1}_disabled" \ + --max_parallel_replicas 0 + + $CLICKHOUSE_CLIENT \ + --query "$2" \ + --query_id "${1}_custom_key" \ + --max_parallel_replicas 3 \ + --use_hedged_requests 0 \ + --parallel_replicas_custom_key_filter_type 'default' \ + --parallel_replicas_custom_key "$2" \ + --allow_experimental_analyzer 0 + + $CLICKHOUSE_CLIENT \ + --query "$2" \ + --query_id "${1}_custom_key_analyzer" \ + --max_parallel_replicas 3 \ + --use_hedged_requests 0 \ + --parallel_replicas_custom_key_filter_type 'default' \ + --parallel_replicas_custom_key "$2" \ + --allow_experimental_analyzer 1 +} + +$CLICKHOUSE_CLIENT --query " + CREATE TABLE replicated_numbers + ( + number Int64, + ) + ENGINE=ReplicatedMergeTree('/clickhouse/tables/{database}/replicated_numbers', 'r1') + ORDER BY (number) + AS SELECT number FROM numbers(100000); +" + +query_id_base="02783_count-$CLICKHOUSE_DATABASE" + +run_query_with_pure_parallel_replicas "${query_id_base}_0" "SELECT count() FROM replicated_numbers" +run_query_with_pure_parallel_replicas "${query_id_base}_1" "SELECT * FROM (SELECT count() FROM replicated_numbers) LIMIT 20" + +# Not implemented yet as the query fails to execute correctly to begin with +#run_query_with_custom_key_parallel_replicas "${query_id_base}_2" "SELECT count() FROM cluster(test_cluster_one_shard_three_replicas_localhost, currentDatabase(), replicated_numbers)" "sipHash64(number)" +#run_query_with_custom_key_parallel_replicas "${query_id_base}_3" "SELECT * FROM (SELECT count() FROM cluster(test_cluster_one_shard_three_replicas_localhost, currentDatabase(), replicated_numbers)) LIMIT 20" "sipHash64(number)" + + +$CLICKHOUSE_CLIENT --query "SYSTEM FLUSH LOGS" +has_used_parallel_replicas "${query_id_base}" From c910f0034b20fcdbe3c336d6851fdd1c7bb21138 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 6 Jun 2023 18:48:20 +0200 Subject: [PATCH 0561/1072] Some code for table function --- src/Storages/StorageAzure.cpp | 52 ++++--- src/Storages/StorageAzure.h | 22 ++- src/TableFunctions/TableFunctionAzure.cpp | 175 ++++++++++++++++++++-- src/TableFunctions/TableFunctionAzure.h | 8 +- 4 files changed, 214 insertions(+), 43 deletions(-) diff --git a/src/Storages/StorageAzure.cpp b/src/Storages/StorageAzure.cpp index e725aa17dd6..e3051236118 100644 --- a/src/Storages/StorageAzure.cpp +++ b/src/Storages/StorageAzure.cpp @@ -67,12 +67,12 @@ namespace ErrorCodes namespace { -static const std::unordered_set required_configuration_keys = { +const std::unordered_set required_configuration_keys = { "blob_path", "container", }; -static const std::unordered_set optional_configuration_keys = { +const std::unordered_set optional_configuration_keys = { "format", "compression", "compression_method", @@ -87,8 +87,9 @@ bool isConnectionString(const std::string & candidate) return candidate.starts_with("DefaultEndpointsProtocol"); } +} -void processNamedCollectionResult(StorageAzure::Configuration & configuration, const NamedCollection & collection) +void StorageAzure::processNamedCollectionResult(StorageAzure::Configuration & configuration, const NamedCollection & collection) { validateNamedCollection(collection, required_configuration_keys, optional_configuration_keys); @@ -113,11 +114,11 @@ void processNamedCollectionResult(StorageAzure::Configuration & configuration, c if (collection.has("account_key")) configuration.account_key = collection.get("account_key"); + configuration.structure = collection.getOrDefault("structure", "auto"); configuration.format = collection.getOrDefault("format", configuration.format); configuration.compression_method = collection.getOrDefault("compression_method", collection.getOrDefault("compression", "auto")); } -} StorageAzure::Configuration StorageAzure::getConfiguration(ASTs & engine_args, ContextPtr local_context, bool get_format_from_file) { @@ -236,6 +237,17 @@ StorageAzure::Configuration StorageAzure::getConfiguration(ASTs & engine_args, C } +AzureObjectStorage::SettingsPtr StorageAzure::createSettings(ContextPtr local_context) +{ + const auto & context_settings = local_context->getSettingsRef(); + auto settings_ptr = std::make_unique(); + settings_ptr->max_single_part_upload_size = context_settings.azure_max_single_part_upload_size; + settings_ptr->max_single_read_retries = context_settings.azure_max_single_read_retries; + settings_ptr->list_object_keys_size = static_cast(context_settings.azure_list_object_keys_size); + + return settings_ptr; +} + void registerStorageAzure(StorageFactory & factory) { factory.registerStorage("Azure", [](const StorageFactory::Arguments & args) @@ -276,11 +288,7 @@ void registerStorageAzure(StorageFactory & factory) if (args.storage_def->partition_by) partition_by = args.storage_def->partition_by->clone(); - const auto & context_settings = args.getContext()->getSettingsRef(); - auto settings = std::make_unique(); - settings->max_single_part_upload_size = context_settings.azure_max_single_part_upload_size; - settings->max_single_read_retries = context_settings.azure_max_single_read_retries; - settings->list_object_keys_size = static_cast(context_settings.azure_list_object_keys_size); + auto settings = StorageAzure::createSettings(args.getContext()); return std::make_shared( std::move(configuration), @@ -399,7 +407,7 @@ StorageAzure::StorageAzure( StorageInMemoryMetadata storage_metadata; if (columns_.empty()) { - auto columns = getTableStructureFromDataImpl(context); + auto columns = getTableStructureFromData(object_storage.get(), configuration, format_settings, context); storage_metadata.setColumns(columns); } else @@ -1149,27 +1157,31 @@ std::unique_ptr StorageAzureSource::createAzureReadBuffer(const Stri return object_storage->readObject(StoredObject(key), read_settings, {}, object_size); } -ColumnsDescription StorageAzure::getTableStructureFromDataImpl(ContextPtr ctx) +ColumnsDescription StorageAzure::getTableStructureFromData( + AzureObjectStorage * object_storage, + const Configuration & configuration, + const std::optional & format_settings, + ContextPtr ctx) { RelativePathsWithMetadata read_keys; std::shared_ptr file_iterator; if (configuration.withGlobs()) { file_iterator = std::make_shared( - object_storage.get(), configuration.container, std::nullopt, - configuration.blob_path, nullptr, virtual_block, ctx, &read_keys); + object_storage, configuration.container, std::nullopt, + configuration.blob_path, nullptr, Block{}, ctx, &read_keys); } else { file_iterator = std::make_shared( - object_storage.get(), configuration.container, configuration.blobs_paths, - std::nullopt, nullptr, virtual_block, ctx, &read_keys); + object_storage, configuration.container, configuration.blobs_paths, + std::nullopt, nullptr, Block{}, ctx, &read_keys); } std::optional columns_from_cache; size_t prev_read_keys_size = read_keys.size(); if (ctx->getSettingsRef().schema_inference_use_cache_for_azure) - columns_from_cache = tryGetColumnsFromCache(read_keys.begin(), read_keys.end(), ctx); + columns_from_cache = tryGetColumnsFromCache(read_keys.begin(), read_keys.end(), configuration, format_settings, ctx); ReadBufferIterator read_buffer_iterator = [&, first = true](ColumnsDescription & cached_columns) mutable -> std::unique_ptr { @@ -1189,7 +1201,7 @@ ColumnsDescription StorageAzure::getTableStructureFromDataImpl(ContextPtr ctx) /// S3 file iterator could get new keys after new iteration, check them in schema cache. if (ctx->getSettingsRef().schema_inference_use_cache_for_azure && read_keys.size() > prev_read_keys_size) { - columns_from_cache = tryGetColumnsFromCache(read_keys.begin() + prev_read_keys_size, read_keys.end(), ctx); + columns_from_cache = tryGetColumnsFromCache(read_keys.begin() + prev_read_keys_size, read_keys.end(), configuration, format_settings, ctx); prev_read_keys_size = read_keys.size(); if (columns_from_cache) { @@ -1213,7 +1225,7 @@ ColumnsDescription StorageAzure::getTableStructureFromDataImpl(ContextPtr ctx) columns = readSchemaFromFormat(configuration.format, format_settings, read_buffer_iterator, configuration.withGlobs(), ctx); if (ctx->getSettingsRef().schema_inference_use_cache_for_azure) - addColumnsToCache(read_keys, columns, configuration.format, ctx); + addColumnsToCache(read_keys, columns, configuration, format_settings, configuration.format, ctx); return columns; @@ -1222,6 +1234,8 @@ ColumnsDescription StorageAzure::getTableStructureFromDataImpl(ContextPtr ctx) std::optional StorageAzure::tryGetColumnsFromCache( const RelativePathsWithMetadata::const_iterator & begin, const RelativePathsWithMetadata::const_iterator & end, + const StorageAzure::Configuration & configuration, + const std::optional & format_settings, const ContextPtr & ctx) { auto & schema_cache = getSchemaCache(ctx); @@ -1247,6 +1261,8 @@ std::optional StorageAzure::tryGetColumnsFromCache( void StorageAzure::addColumnsToCache( const RelativePathsWithMetadata & keys, const ColumnsDescription & columns, + const StorageAzure::Configuration & configuration, + const std::optional & format_settings, const String & format_name, const ContextPtr & ctx) { diff --git a/src/Storages/StorageAzure.h b/src/Storages/StorageAzure.h index f20da74c2a8..8341026b624 100644 --- a/src/Storages/StorageAzure.h +++ b/src/Storages/StorageAzure.h @@ -10,6 +10,7 @@ #include #include #include +#include namespace DB { @@ -31,6 +32,7 @@ using AzureCredentials = std::variant; @@ -78,7 +80,10 @@ public: static StorageAzure::Configuration getConfiguration(ASTs & engine_args, ContextPtr local_context, bool get_format_from_file = true); static AzureClientPtr createClient(StorageAzure::Configuration configuration); - static AzureObjectStorage::SettingsPtr createSettings(StorageAzure::Configuration configuration); + + static AzureObjectStorage::SettingsPtr createSettings(ContextPtr local_context); + + static void processNamedCollectionResult(StorageAzure::Configuration & configuration, const NamedCollection & collection); String getName() const override { @@ -112,6 +117,12 @@ public: static SchemaCache & getSchemaCache(const ContextPtr & ctx); + static ColumnsDescription getTableStructureFromData( + AzureObjectStorage * object_storage, + const Configuration & configuration, + const std::optional & format_settings, + ContextPtr ctx); + private: std::string name; Configuration configuration; @@ -123,16 +134,19 @@ private: std::optional format_settings; ASTPtr partition_by; - ColumnsDescription getTableStructureFromDataImpl(ContextPtr ctx); - std::optional tryGetColumnsFromCache( + static std::optional tryGetColumnsFromCache( const RelativePathsWithMetadata::const_iterator & begin, const RelativePathsWithMetadata::const_iterator & end, + const StorageAzure::Configuration & configuration, + const std::optional & format_settings, const ContextPtr & ctx); - void addColumnsToCache( + static void addColumnsToCache( const RelativePathsWithMetadata & keys, const ColumnsDescription & columns, + const Configuration & configuration, + const std::optional & format_settings, const String & format_name, const ContextPtr & ctx); diff --git a/src/TableFunctions/TableFunctionAzure.cpp b/src/TableFunctions/TableFunctionAzure.cpp index f565a365a13..ac3fa1cc8af 100644 --- a/src/TableFunctions/TableFunctionAzure.cpp +++ b/src/TableFunctions/TableFunctionAzure.cpp @@ -36,20 +36,148 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } - -void TableFunctionAzure::parseArgumentsImpl(ASTs & args, const ContextPtr & context) +namespace { - if (args.size() != 5) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "The signature of table function {} shall be the following:\n{}", getName(), getSignature()); - for (auto & arg : args) - arg = evaluateConstantExpressionOrIdentifierAsLiteral(arg, context); +bool isConnectionString(const std::string & candidate) +{ + return candidate.starts_with("DefaultEndpointsProtocol"); +} - configuration.connection_url = checkAndGetLiteralArgument(args[0], "connection_url"); - configuration.container = checkAndGetLiteralArgument(args[1], "container"); - configuration.blob_path = checkAndGetLiteralArgument(args[2], "blob_path"); - configuration.format = checkAndGetLiteralArgument(args[3], "format"); - configuration.structure = checkAndGetLiteralArgument(args[4], "structure"); +} + +StorageAzure::Configuration TableFunctionAzure::parseArgumentsImpl(ASTs & engine_args, const ContextPtr & local_context, bool get_format_from_file) +{ + StorageAzure::Configuration configuration; + + /// Supported signatures: + /// + /// Azure(connection_string|storage_account_url, container_name, blobpath, [account_name, account_key, format, compression, structure]) + /// + + if (auto named_collection = tryGetNamedCollectionWithOverrides(engine_args, local_context)) + { + StorageAzure::processNamedCollectionResult(configuration, *named_collection); + + configuration.blobs_paths = {configuration.blob_path}; + + if (configuration.format == "auto" && get_format_from_file) + configuration.format = FormatFactory::instance().getFormatFromFileName(configuration.blob_path, true); + + return configuration; + } + + if (engine_args.size() < 3 || engine_args.size() > 8) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Storage Azure requires 3 to 7 arguments: " + "Azure(connection_string|storage_account_url, container_name, blobpath, [account_name, account_key, format, compression, structure])"); + + for (auto & engine_arg : engine_args) + engine_arg = evaluateConstantExpressionOrIdentifierAsLiteral(engine_arg, local_context); + + std::unordered_map engine_args_to_idx; + + configuration.connection_url = checkAndGetLiteralArgument(engine_args[0], "connection_string/storage_account_url"); + configuration.is_connection_string = isConnectionString(configuration.connection_url); + + configuration.container = checkAndGetLiteralArgument(engine_args[1], "container"); + configuration.blob_path = checkAndGetLiteralArgument(engine_args[2], "blobpath"); + + auto is_format_arg = [] (const std::string & s) -> bool + { + return s == "auto" || FormatFactory::instance().getAllFormats().contains(s); + }; + + if (engine_args.size() == 4) + { + auto fourth_arg = checkAndGetLiteralArgument(engine_args[3], "format/account_name/structure"); + if (is_format_arg(fourth_arg)) + { + configuration.format = fourth_arg; + } + else + { + configuration.structure = fourth_arg; + } + } + else if (engine_args.size() == 5) + { + auto fourth_arg = checkAndGetLiteralArgument(engine_args[3], "format/account_name"); + if (is_format_arg(fourth_arg)) + { + configuration.format = fourth_arg; + configuration.compression_method = checkAndGetLiteralArgument(engine_args[4], "compression"); + } + else + { + configuration.account_name = fourth_arg; + configuration.account_key = checkAndGetLiteralArgument(engine_args[4], "account_key"); + } + } + else if (engine_args.size() == 6) + { + auto fourth_arg = checkAndGetLiteralArgument(engine_args[3], "format/account_name"); + if (is_format_arg(fourth_arg)) + { + configuration.format = fourth_arg; + configuration.compression_method = checkAndGetLiteralArgument(engine_args[4], "compression"); + configuration.structure = checkAndGetLiteralArgument(engine_args[5], "structure"); + } + else + { + configuration.account_name = fourth_arg; + configuration.account_key = checkAndGetLiteralArgument(engine_args[4], "account_key"); + auto sixth_arg = checkAndGetLiteralArgument(engine_args[5], "format/account_name"); + if (!is_format_arg(sixth_arg)) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown format {}", sixth_arg); + configuration.format = sixth_arg; + } + } + else if (engine_args.size() == 7) + { + auto fourth_arg = checkAndGetLiteralArgument(engine_args[3], "format/account_name"); + if (is_format_arg(fourth_arg)) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Format, compression and structure must be last arguments"); + } + else + { + configuration.account_name = fourth_arg; + configuration.account_key = checkAndGetLiteralArgument(engine_args[4], "account_key"); + auto sixth_arg = checkAndGetLiteralArgument(engine_args[5], "format/account_name"); + if (!is_format_arg(sixth_arg)) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown format {}", sixth_arg); + configuration.format = sixth_arg; + configuration.compression_method = checkAndGetLiteralArgument(engine_args[6], "compression"); + } + } + else if (engine_args.size() == 8) + { + + auto fourth_arg = checkAndGetLiteralArgument(engine_args[3], "format/account_name"); + if (is_format_arg(fourth_arg)) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Format and compression must be last arguments"); + } + else + { + configuration.account_name = fourth_arg; + configuration.account_key = checkAndGetLiteralArgument(engine_args[4], "account_key"); + auto sixth_arg = checkAndGetLiteralArgument(engine_args[5], "format/account_name"); + if (!is_format_arg(sixth_arg)) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown format {}", sixth_arg); + configuration.format = sixth_arg; + configuration.compression_method = checkAndGetLiteralArgument(engine_args[6], "compression"); + configuration.structure = checkAndGetLiteralArgument(engine_args[7], "structure"); + } + } + + configuration.blobs_paths = {configuration.blob_path}; + + if (configuration.format == "auto" && get_format_from_file) + configuration.format = FormatFactory::instance().getFormatFromFileName(configuration.blob_path, true); + + return configuration; } void TableFunctionAzure::parseArguments(const ASTPtr & ast_function, ContextPtr context) @@ -69,6 +197,16 @@ void TableFunctionAzure::parseArguments(const ASTPtr & ast_function, ContextPtr ColumnsDescription TableFunctionAzure::getActualTableStructure(ContextPtr context) const { + if (configuration.structure == "auto") + { + context->checkAccess(getSourceAccessType()); + auto client = StorageAzure::createClient(configuration); + auto settings = StorageAzure::createSettings(context); + + auto object_storage = std::make_unique("AzureTableFunction", std::move(client), std::move(settings)); + return StorageAzure::getTableStructureFromData(object_storage.get(), configuration, std::nullopt, context); + } + return parseColumnsListFromString(configuration.structure, context); } @@ -79,24 +217,29 @@ bool TableFunctionAzure::supportsReadingSubsetOfColumns() StoragePtr TableFunctionAzure::executeImpl(const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/) const { - ColumnsDescription columns; - columns = parseColumnsListFromString(configuration.structure, context); - configuration.is_connection_string = true; configuration.blobs_paths = {configuration.blob_path}; auto client = StorageAzure::createClient(configuration); + auto settings = StorageAzure::createSettings(context); + + ColumnsDescription columns; + if (configuration.structure != "auto") + columns = parseColumnsListFromString(configuration.structure, context); + else if (!structure_hint.empty()) + columns = structure_hint; StoragePtr storage = std::make_shared( configuration, - std::make_unique(table_name, std::move(client), std::make_unique()), + std::make_unique(table_name, std::move(client), std::move(settings)), context, StorageID(getDatabaseName(), table_name), columns, ConstraintsDescription{}, String{}, /// No format_settings for table function Azure - std::nullopt, nullptr); + std::nullopt, + nullptr); storage->startup(); diff --git a/src/TableFunctions/TableFunctionAzure.h b/src/TableFunctions/TableFunctionAzure.h index a6fb5415113..b3508e7f95c 100644 --- a/src/TableFunctions/TableFunctionAzure.h +++ b/src/TableFunctions/TableFunctionAzure.h @@ -19,9 +19,9 @@ class TableFunctionAzure : public ITableFunction { public: static constexpr auto name = "azure_blob"; - static constexpr auto signature = "- connection_url, container, blob, format, structure\n"; + static constexpr auto signature = "- connection_string|storage_account_url, container_name, blobpath, [account_name, account_key, format, compression, structure]\n"; - static size_t getMaxNumberOfArguments() { return 5; } + static size_t getMaxNumberOfArguments() { return 8; } String getName() const override { @@ -46,9 +46,7 @@ public: return {"_path", "_file"}; } - virtual void parseArgumentsImpl(ASTs & args, const ContextPtr & context); - - static void addColumnsStructureToArguments(ASTs & args, const String & structure, const ContextPtr & context); + static StorageAzure::Configuration parseArgumentsImpl(ASTs & args, const ContextPtr & context, bool get_format_from_file = true); protected: From e76a7022fcc26f02cb98214e72f039b12b04423f Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 6 Jun 2023 18:50:45 +0200 Subject: [PATCH 0562/1072] Add some tests --- .../test_storage_azure_blob_storage/test.py | 43 +++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/tests/integration/test_storage_azure_blob_storage/test.py b/tests/integration/test_storage_azure_blob_storage/test.py index 9b71ff1a490..d4a5f6e24bb 100644 --- a/tests/integration/test_storage_azure_blob_storage/test.py +++ b/tests/integration/test_storage_azure_blob_storage/test.py @@ -443,3 +443,46 @@ def test_schema_inference_from_globs(cluster): bucket='cont', max_path=max_path ) ] + +def test_simple_write_account_string_table_function(cluster): + node = cluster.instances["node"] + azure_query(node, "INSERT INTO azure_blob('http://azurite1:10000/devstoreaccount1', 'cont', 'test_simple_write_tf.csv', 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'CSV', 'auto', 'key UInt64, data String') VALUES (1, 'a')") + print(get_azure_file_content("test_simple_write_tf.csv")) + assert get_azure_file_content("test_simple_write_tf.csv") == '1,"a"\n' + + +def test_simple_write_connection_string_table_function(cluster): + node = cluster.instances["node"] + azure_query( + node, + "CREATE TABLE test_simple_write_connection_string (key UInt64, data String) Engine = Azure('DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://azurite1:10000/devstoreaccount1;', 'cont', 'test_simple_write_c.csv', 'CSV')", + ) + azure_query(node, "INSERT INTO test_simple_write_connection_string VALUES (1, 'a')") + print(get_azure_file_content("test_simple_write_c.csv")) + assert get_azure_file_content("test_simple_write_c.csv") == '1,"a"\n' + + +def test_simple_write_named_collection_1_table_function(cluster): + node = cluster.instances["node"] + azure_query( + node, + "CREATE TABLE test_simple_write_named_collection_1 (key UInt64, data String) Engine = Azure(azure_conf1)", + ) + azure_query( + node, "INSERT INTO test_simple_write_named_collection_1 VALUES (1, 'a')" + ) + print(get_azure_file_content("test_simple_write_named.csv")) + assert get_azure_file_content("test_simple_write_named.csv") == '1,"a"\n' + + +def test_simple_write_named_collection_2_table_function(cluster): + node = cluster.instances["node"] + azure_query( + node, + "CREATE TABLE test_simple_write_named_collection_2 (key UInt64, data String) Engine = Azure(azure_conf2, container='cont', blob_path='test_simple_write_named_2.csv', format='CSV')", + ) + azure_query( + node, "INSERT INTO test_simple_write_named_collection_2 VALUES (1, 'a')" + ) + print(get_azure_file_content("test_simple_write_named_2.csv")) + assert get_azure_file_content("test_simple_write_named_2.csv") == '1,"a"\n' From 2a756a7e6c7d564aae6e4895b53e4b2d6f18cfbd Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Tue, 6 Jun 2023 17:05:23 +0000 Subject: [PATCH 0563/1072] Cosmetics: Make default tree count / distance function constants --- src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp index b15b1bb1a91..e6c11c839fe 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp @@ -314,12 +314,14 @@ MergeTreeIndexConditionPtr MergeTreeIndexAnnoy::createIndexCondition(const Selec MergeTreeIndexPtr annoyIndexCreator(const IndexDescription & index) { - uint64_t trees = 100; - String distance_function = "L2Distance"; + static constexpr auto default_trees = 100uz; + static constexpr auto default_distance_function = "L2Distance"; + String distance_function = default_distance_function; if (!index.arguments.empty()) distance_function = index.arguments[0].get(); + uint64_t trees = default_trees; if (index.arguments.size() > 1) trees = index.arguments[1].get(); From a74d3ca3289806eccf6cd66ac29725bc3655bbe8 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Tue, 6 Jun 2023 17:06:47 +0000 Subject: [PATCH 0564/1072] Fix style check --- src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp index e6c11c839fe..1a28f28f746 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp @@ -23,7 +23,6 @@ namespace ErrorCodes extern const int INCORRECT_NUMBER_OF_COLUMNS; extern const int INCORRECT_QUERY; extern const int LOGICAL_ERROR; - extern const int BAD_ARGUMENTS; } From ae97f45c1c9bc8e3f55c01fc57bab5179964b36b Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Tue, 6 Jun 2023 17:10:48 +0000 Subject: [PATCH 0565/1072] Automatic style fix --- tests/integration/test_storage_azure_blob_storage/test.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_storage_azure_blob_storage/test.py b/tests/integration/test_storage_azure_blob_storage/test.py index 23d1b44daf2..b431837bccb 100644 --- a/tests/integration/test_storage_azure_blob_storage/test.py +++ b/tests/integration/test_storage_azure_blob_storage/test.py @@ -457,9 +457,13 @@ def test_schema_inference_from_globs(cluster): ) ] + def test_simple_write_account_string_table_function(cluster): node = cluster.instances["node"] - azure_query(node, "INSERT INTO azure_blob('http://azurite1:10000/devstoreaccount1', 'cont', 'test_simple_write_tf.csv', 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'CSV', 'auto', 'key UInt64, data String') VALUES (1, 'a')") + azure_query( + node, + "INSERT INTO azure_blob('http://azurite1:10000/devstoreaccount1', 'cont', 'test_simple_write_tf.csv', 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'CSV', 'auto', 'key UInt64, data String') VALUES (1, 'a')", + ) print(get_azure_file_content("test_simple_write_tf.csv")) assert get_azure_file_content("test_simple_write_tf.csv") == '1,"a"\n' From 6a96cf441e24a5994aa38839ea69dad60b2ecd83 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Tue, 6 Jun 2023 19:19:17 +0200 Subject: [PATCH 0566/1072] Renamed to azure_blob_storage --- tests/integration/test_storage_azure_blob_storage/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_storage_azure_blob_storage/test.py b/tests/integration/test_storage_azure_blob_storage/test.py index b431837bccb..b81730fb24a 100644 --- a/tests/integration/test_storage_azure_blob_storage/test.py +++ b/tests/integration/test_storage_azure_blob_storage/test.py @@ -462,7 +462,7 @@ def test_simple_write_account_string_table_function(cluster): node = cluster.instances["node"] azure_query( node, - "INSERT INTO azure_blob('http://azurite1:10000/devstoreaccount1', 'cont', 'test_simple_write_tf.csv', 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'CSV', 'auto', 'key UInt64, data String') VALUES (1, 'a')", + "INSERT INTO azure_blob_storage('http://azurite1:10000/devstoreaccount1', 'cont', 'test_simple_write_tf.csv', 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'CSV', 'auto', 'key UInt64, data String') VALUES (1, 'a')", ) print(get_azure_file_content("test_simple_write_tf.csv")) assert get_azure_file_content("test_simple_write_tf.csv") == '1,"a"\n' From b2db6b47896c03aa728c4fe66d9271eb1a34c529 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Tue, 6 Jun 2023 19:19:56 +0200 Subject: [PATCH 0567/1072] Renamed to azure_blob_storage --- src/TableFunctions/TableFunctionAzure.cpp | 2 +- src/TableFunctions/TableFunctionAzure.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/TableFunctions/TableFunctionAzure.cpp b/src/TableFunctions/TableFunctionAzure.cpp index ac3fa1cc8af..27137bda9ff 100644 --- a/src/TableFunctions/TableFunctionAzure.cpp +++ b/src/TableFunctions/TableFunctionAzure.cpp @@ -251,7 +251,7 @@ void registerTableFunctionAzure(TableFunctionFactory & factory) factory.registerFunction( {.documentation = {.description=R"(The table function can be used to read the data stored on Azure Blob Storage.)", - .examples{{"azure_blob", "SELECT * FROM azure_blob(connection, container, blob_path, format, structure)", ""}}}, + .examples{{"azure_blob_storage", "SELECT * FROM azure_blob_storage(connection, container, blob_path, format, structure)", ""}}}, .allow_readonly = false}); } diff --git a/src/TableFunctions/TableFunctionAzure.h b/src/TableFunctions/TableFunctionAzure.h index b3508e7f95c..e2815973010 100644 --- a/src/TableFunctions/TableFunctionAzure.h +++ b/src/TableFunctions/TableFunctionAzure.h @@ -18,7 +18,7 @@ class Context; class TableFunctionAzure : public ITableFunction { public: - static constexpr auto name = "azure_blob"; + static constexpr auto name = "azure_blob_storage"; static constexpr auto signature = "- connection_string|storage_account_url, container_name, blobpath, [account_name, account_key, format, compression, structure]\n"; static size_t getMaxNumberOfArguments() { return 8; } From 7100bc51526c5d88cf3aa84b932ede4e9f1ad15f Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 6 Jun 2023 19:31:50 +0200 Subject: [PATCH 0568/1072] Fixes for azure table function --- src/Storages/StorageAzure.cpp | 3 +- src/TableFunctions/TableFunctionAzure.cpp | 7 +- .../configs/named_collections.xml | 1 + .../test_storage_azure_blob_storage/test.py | 107 ++++++++++++++---- 4 files changed, 89 insertions(+), 29 deletions(-) diff --git a/src/Storages/StorageAzure.cpp b/src/Storages/StorageAzure.cpp index e3051236118..c6001f59b6f 100644 --- a/src/Storages/StorageAzure.cpp +++ b/src/Storages/StorageAzure.cpp @@ -75,6 +75,7 @@ const std::unordered_set required_configuration_keys = { const std::unordered_set optional_configuration_keys = { "format", "compression", + "structure", "compression_method", "account_name", "account_key", @@ -1193,7 +1194,7 @@ ColumnsDescription StorageAzure::getTableStructureFromData( throw Exception( ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Cannot extract table structure from {} format file, because there are no files with provided path " - "in S3. You must specify table structure manually", configuration.format); + "in AzureBlobStorage. You must specify table structure manually", configuration.format); return nullptr; } diff --git a/src/TableFunctions/TableFunctionAzure.cpp b/src/TableFunctions/TableFunctionAzure.cpp index ac3fa1cc8af..8b18ed42d71 100644 --- a/src/TableFunctions/TableFunctionAzure.cpp +++ b/src/TableFunctions/TableFunctionAzure.cpp @@ -78,6 +78,7 @@ StorageAzure::Configuration TableFunctionAzure::parseArgumentsImpl(ASTs & engine std::unordered_map engine_args_to_idx; configuration.connection_url = checkAndGetLiteralArgument(engine_args[0], "connection_string/storage_account_url"); + LOG_DEBUG(&Poco::Logger::get("DEBUG"), "CONFIGURATION {}", configuration.connection_url); configuration.is_connection_string = isConnectionString(configuration.connection_url); configuration.container = checkAndGetLiteralArgument(engine_args[1], "container"); @@ -192,7 +193,8 @@ void TableFunctionAzure::parseArguments(const ASTPtr & ast_function, ContextPtr auto & args = args_func.at(0)->children; - parseArgumentsImpl(args, context); + configuration = parseArgumentsImpl(args, context); + LOG_DEBUG(&Poco::Logger::get("DEBUG"), "CONFIGURATION {}", configuration.connection_url); } ColumnsDescription TableFunctionAzure::getActualTableStructure(ContextPtr context) const @@ -217,9 +219,6 @@ bool TableFunctionAzure::supportsReadingSubsetOfColumns() StoragePtr TableFunctionAzure::executeImpl(const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/) const { - configuration.is_connection_string = true; - configuration.blobs_paths = {configuration.blob_path}; - auto client = StorageAzure::createClient(configuration); auto settings = StorageAzure::createSettings(context); diff --git a/tests/integration/test_storage_azure_blob_storage/configs/named_collections.xml b/tests/integration/test_storage_azure_blob_storage/configs/named_collections.xml index dc70895bc05..e0c18d11940 100644 --- a/tests/integration/test_storage_azure_blob_storage/configs/named_collections.xml +++ b/tests/integration/test_storage_azure_blob_storage/configs/named_collections.xml @@ -4,6 +4,7 @@ DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://azurite1:10000/devstoreaccount1; cont test_simple_write_named.csv + key UInt64, data String CSV diff --git a/tests/integration/test_storage_azure_blob_storage/test.py b/tests/integration/test_storage_azure_blob_storage/test.py index 23d1b44daf2..ad96d2d304a 100644 --- a/tests/integration/test_storage_azure_blob_storage/test.py +++ b/tests/integration/test_storage_azure_blob_storage/test.py @@ -459,43 +459,102 @@ def test_schema_inference_from_globs(cluster): def test_simple_write_account_string_table_function(cluster): node = cluster.instances["node"] - azure_query(node, "INSERT INTO azure_blob('http://azurite1:10000/devstoreaccount1', 'cont', 'test_simple_write_tf.csv', 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'CSV', 'auto', 'key UInt64, data String') VALUES (1, 'a')") + azure_query(node, "INSERT INTO TABLE FUNCTION azure_blob('http://azurite1:10000/devstoreaccount1', 'cont', 'test_simple_write_tf.csv', 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'CSV', 'auto', 'key UInt64, data String') VALUES (1, 'a')") print(get_azure_file_content("test_simple_write_tf.csv")) assert get_azure_file_content("test_simple_write_tf.csv") == '1,"a"\n' def test_simple_write_connection_string_table_function(cluster): node = cluster.instances["node"] - azure_query( - node, - "CREATE TABLE test_simple_write_connection_string (key UInt64, data String) Engine = Azure('DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://azurite1:10000/devstoreaccount1;', 'cont', 'test_simple_write_c.csv', 'CSV')", - ) - azure_query(node, "INSERT INTO test_simple_write_connection_string VALUES (1, 'a')") - print(get_azure_file_content("test_simple_write_c.csv")) - assert get_azure_file_content("test_simple_write_c.csv") == '1,"a"\n' + azure_query(node, "INSERT INTO TABLE FUNCTION azure_blob('DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://azurite1:10000/devstoreaccount1;', 'cont', 'test_simple_write_connection_tf.csv', 'CSV', 'auto', 'key UInt64, data String') VALUES (1, 'a')") + print(get_azure_file_content("test_simple_write_connection_tf.csv")) + assert get_azure_file_content("test_simple_write_connection_tf.csv") == '1,"a"\n' def test_simple_write_named_collection_1_table_function(cluster): node = cluster.instances["node"] - azure_query( - node, - "CREATE TABLE test_simple_write_named_collection_1 (key UInt64, data String) Engine = Azure(azure_conf1)", - ) - azure_query( - node, "INSERT INTO test_simple_write_named_collection_1 VALUES (1, 'a')" - ) + azure_query(node, "INSERT INTO TABLE FUNCTION azure_blob(azure_conf1) VALUES (1, 'a')") print(get_azure_file_content("test_simple_write_named.csv")) assert get_azure_file_content("test_simple_write_named.csv") == '1,"a"\n' def test_simple_write_named_collection_2_table_function(cluster): node = cluster.instances["node"] - azure_query( - node, - "CREATE TABLE test_simple_write_named_collection_2 (key UInt64, data String) Engine = Azure(azure_conf2, container='cont', blob_path='test_simple_write_named_2.csv', format='CSV')", - ) - azure_query( - node, "INSERT INTO test_simple_write_named_collection_2 VALUES (1, 'a')" - ) - print(get_azure_file_content("test_simple_write_named_2.csv")) - assert get_azure_file_content("test_simple_write_named_2.csv") == '1,"a"\n' + + azure_query(node, "INSERT INTO TABLE FUNCTION azure_blob(azure_conf2, container='cont', blob_path='test_simple_write_named_2_tf.csv', format='CSV', structure='key UInt64, data String') VALUES (1, 'a')") + print(get_azure_file_content("test_simple_write_named_2_tf.csv")) + assert get_azure_file_content("test_simple_write_named_2_tf.csv") == '1,"a"\n' + +def test_put_get_with_globs_tf(cluster): + # type: (ClickHouseCluster) -> None + unique_prefix = random.randint(1, 10000) + node = cluster.instances["node"] # type: ClickHouseInstance + table_format = "column1 UInt32, column2 UInt32, column3 UInt32" + max_path = "" + for i in range(10): + for j in range(10): + path = "{}/{}_{}/{}.csv".format( + unique_prefix, i, random.choice(["a", "b", "c", "d"]), j + ) + max_path = max(path, max_path) + values = f"({i},{j},{i + j})" + + azure_query( + node, + f"INSERT INTO TABLE FUNCTION azure_blob(azure_conf2, container='cont', blob_path='{path}', format='CSV', compression='auto', structure='{table_format}') VALUES {values}", + ) + query = f"select sum(column1), sum(column2), sum(column3), min(_file), max(_path) from azure_blob(azure_conf2, container='cont', blob_path='{unique_prefix}/*_{{a,b,c,d}}/?.csv', format='CSV', structure='{table_format}')" + assert azure_query(node, query).splitlines() == [ + "450\t450\t900\t0.csv\t{bucket}/{max_path}".format( + bucket="cont", max_path=max_path + ) + ] + +def test_schema_inference_no_globs_tf(cluster): + node = cluster.instances["node"] # type: ClickHouseInstance + table_format = "column1 UInt32, column2 String, column3 UInt32" + + query = f"insert into table function azure_blob(azure_conf2, container='cont', blob_path='test_schema_inference_no_globs_tf.csv', format='CSVWithNames', structure='{table_format}') SELECT number, toString(number), number * number FROM numbers(1000)" + azure_query(node, query) + + query = "select sum(column1), sum(length(column2)), sum(column3), min(_file), max(_path) from azure_blob(azure_conf2, container='cont', blob_path='test_schema_inference_no_globs_tf.csv')" + assert azure_query(node, query).splitlines() == [ + "499500\t2890\t332833500\ttest_schema_inference_no_globs_tf.csv\tcont/test_schema_inference_no_globs_tf.csv" + ] + +def test_schema_inference_from_globs_tf(cluster): + node = cluster.instances["node"] + unique_prefix = random.randint(1, 10000) + node = cluster.instances["node"] # type: ClickHouseInstance + table_format = "column1 UInt32, column2 UInt32, column3 UInt32" + max_path = "" + for i in range(10): + for j in range(10): + path = "{}/{}_{}/{}.csv".format( + unique_prefix, i, random.choice(["a", "b", "c", "d"]), j + ) + max_path = max(path, max_path) + values = f"({i},{j},{i + j})" + + query = f"insert into table function azure_blob(azure_conf2, container='cont', blob_path='{path}', format='CSVWithNames', structure='{table_format}') VALUES {values}" + azure_query(node, query) + + query = f"select sum(column1), sum(column2), sum(column3), min(_file), max(_path) from azure_blob(azure_conf2, container='cont', blob_path='{unique_prefix}/*_{{a,b,c,d}}/?.csv')" + assert azure_query(node, query).splitlines() == [ + "450\t450\t900\t0.csv\t{bucket}/{max_path}".format( + bucket="cont", max_path=max_path + ) + ] + +def test_partition_by_tf(cluster): + node = cluster.instances["node"] + table_format = "column1 UInt32, column2 UInt32, column3 UInt32" + partition_by = "column3" + values = "(1, 2, 3), (3, 2, 1), (78, 43, 45)" + filename = "test_tf_{_partition_id}.csv" + + azure_query(node, f"INSERT INTO TABLE FUNCTION azure_blob('http://azurite1:10000/devstoreaccount1', 'cont', '{filename}', 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'CSV', 'auto', '{table_format}') PARTITION BY {partition_by} VALUES {values}") + + assert "1,2,3\n" == get_azure_file_content("test_tf_3.csv") + assert "3,2,1\n" == get_azure_file_content("test_tf_1.csv") + assert "78,43,45\n" == get_azure_file_content("test_tf_45.csv") From df50833b709ea862ea49f24763c74cf898b10907 Mon Sep 17 00:00:00 2001 From: avogar Date: Tue, 6 Jun 2023 17:33:05 +0000 Subject: [PATCH 0569/1072] Allow to skip trailing empty lines in CSV/TSV/CustomeSeparated formats --- docs/en/interfaces/formats.md | 6 +++++- .../en/operations/settings/settings-formats.md | 18 ++++++++++++++++++ src/Core/Settings.h | 3 +++ src/Formats/FormatFactory.cpp | 3 +++ src/Formats/FormatSettings.h | 3 +++ .../Formats/Impl/CSVRowInputFormat.cpp | 14 ++++++++++++++ .../Formats/Impl/CSVRowInputFormat.h | 1 + .../Impl/CustomSeparatedRowInputFormat.cpp | 4 ++++ .../Impl/TabSeparatedRowInputFormat.cpp | 14 ++++++++++++++ .../Formats/Impl/TabSeparatedRowInputFormat.h | 2 ++ ..._custom_skip_trailing_empty_lines.reference | 3 +++ ...sv_csv_custom_skip_trailing_empty_lines.sql | 12 ++++++++++++ 12 files changed, 82 insertions(+), 1 deletion(-) create mode 100644 tests/queries/0_stateless/02771_tsv_csv_custom_skip_trailing_empty_lines.reference create mode 100644 tests/queries/0_stateless/02771_tsv_csv_custom_skip_trailing_empty_lines.sql diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index 2ab9e8caec4..70479b8ac71 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -193,6 +193,7 @@ SELECT * FROM nestedt FORMAT TSV - [output_format_tsv_crlf_end_of_line](/docs/en/operations/settings/settings-formats.md/#output_format_tsv_crlf_end_of_line) - if it is set true, end of line in TSV output format will be `\r\n` instead of `\n`. Default value - `false`. - [input_format_tsv_skip_first_lines](/docs/en/operations/settings/settings-formats.md/#input_format_tsv_skip_first_lines) - skip specified number of lines at the beginning of data. Default value - `0`. - [input_format_tsv_detect_header](/docs/en/operations/settings/settings-formats.md/#input_format_tsv_detect_header) - automatically detect header with names and types in TSV format. Default value - `true`. +- [input_format_tsv_skip_trailing_empty_lines](/docs/en/operations/settings/settings-formats.md/#input_format_tsv_skip_trailing_empty_lines) - skip trailing empty lines at the end of data. Default value - `false`. ## TabSeparatedRaw {#tabseparatedraw} @@ -467,6 +468,7 @@ The CSV format supports the output of totals and extremes the same way as `TabSe - [output_format_csv_crlf_end_of_line](/docs/en/operations/settings/settings-formats.md/#output_format_csv_crlf_end_of_line) - if it is set to true, end of line in CSV output format will be `\r\n` instead of `\n`. Default value - `false`. - [input_format_csv_skip_first_lines](/docs/en/operations/settings/settings-formats.md/#input_format_csv_skip_first_lines) - skip the specified number of lines at the beginning of data. Default value - `0`. - [input_format_csv_detect_header](/docs/en/operations/settings/settings-formats.md/#input_format_csv_detect_header) - automatically detect header with names and types in CSV format. Default value - `true`. +- [input_format_csv_skip_trailing_empty_lines](/docs/en/operations/settings/settings-formats.md/#input_format_csv_skip_trailing_empty_lines) - skip trailing empty lines at the end of data. Default value - `false`. ## CSVWithNames {#csvwithnames} @@ -494,7 +496,9 @@ the types from input data will be compared with the types of the corresponding c Similar to [Template](#format-template), but it prints or reads all names and types of columns and uses escaping rule from [format_custom_escaping_rule](/docs/en/operations/settings/settings-formats.md/#format_custom_escaping_rule) setting and delimiters from [format_custom_field_delimiter](/docs/en/operations/settings/settings-formats.md/#format_custom_field_delimiter), [format_custom_row_before_delimiter](/docs/en/operations/settings/settings-formats.md/#format_custom_row_before_delimiter), [format_custom_row_after_delimiter](/docs/en/operations/settings/settings-formats.md/#format_custom_row_after_delimiter), [format_custom_row_between_delimiter](/docs/en/operations/settings/settings-formats.md/#format_custom_row_between_delimiter), [format_custom_result_before_delimiter](/docs/en/operations/settings/settings-formats.md/#format_custom_result_before_delimiter) and [format_custom_result_after_delimiter](/docs/en/operations/settings/settings-formats.md/#format_custom_result_after_delimiter) settings, not from format strings. -If setting [input_format_custom_detect_header](/docs/en/operations/settings/settings.md/#input_format_custom_detect_header) is enabled, ClickHouse will automatically detect header with names and types if any. +If setting [input_format_custom_detect_header](/docs/en/operations/settings/settings-formats.md/#input_format_custom_detect_header) is enabled, ClickHouse will automatically detect header with names and types if any. + +If setting [input_format_tsv_skip_trailing_empty_lines](/docs/en/operations/settings/settings-formats.md/#input_format_custom_detect_header) is enabled, trailing empty lines at the end of file will be skipped. There is also `CustomSeparatedIgnoreSpaces` format, which is similar to [TemplateIgnoreSpaces](#templateignorespaces). diff --git a/docs/en/operations/settings/settings-formats.md b/docs/en/operations/settings/settings-formats.md index 65038d3a256..a1a75446c37 100644 --- a/docs/en/operations/settings/settings-formats.md +++ b/docs/en/operations/settings/settings-formats.md @@ -728,6 +728,12 @@ My NULL My NULL ``` +### input_format_tsv_skip_trailing_empty_lines {input_format_tsv_skip_trailing_empty_lines} + +When enabled, trailing empty lines at the end of TSV file will be skipped. + +Disabled by default. + ## CSV format settings {#csv-format-settings} ### format_csv_delimiter {#format_csv_delimiter} @@ -882,6 +888,12 @@ My NULL My NULL ``` +### input_format_csv_skip_trailing_empty_lines {input_format_csv_skip_trailing_empty_lines} + +When enabled, trailing empty lines at the end of CSV file will be skipped. + +Disabled by default. + ## Values format settings {#values-format-settings} ### input_format_values_interpret_expressions {#input_format_values_interpret_expressions} @@ -1443,6 +1455,12 @@ Sets the character that is interpreted as a suffix after the result set for [Cus Default value: `''`. +### input_format_custom_skip_trailing_empty_lines {input_format_custom_skip_trailing_empty_lines} + +When enabled, trailing empty lines at the end of file in CustomSeparated format will be skipped. + +Disabled by default. + ## Regexp format settings {#regexp-format-settings} ### format_regexp_escaping_rule {#format_regexp_escaping_rule} diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 67c92a0be8b..f688811028e 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -873,6 +873,9 @@ class IColumn; M(Bool, output_format_protobuf_nullables_with_google_wrappers, false, "When serializing Nullable columns with Google wrappers, serialize default values as empty wrappers. If turned off, default and null values are not serialized", 0) \ M(UInt64, input_format_csv_skip_first_lines, 0, "Skip specified number of lines at the beginning of data in CSV format", 0) \ M(UInt64, input_format_tsv_skip_first_lines, 0, "Skip specified number of lines at the beginning of data in TSV format", 0) \ + M(Bool, input_format_csv_skip_trailing_empty_lines, false, "Skip trailing empty lines in CSV format", 0) \ + M(Bool, input_format_tsv_skip_trailing_empty_lines, false, "Skip trailing empty lines in TSV format", 0) \ + M(Bool, input_format_custom_skip_trailing_empty_lines, false, "Skip trailing empty lines in CustomSeparated format", 0) \ \ M(Bool, input_format_native_allow_types_conversion, true, "Allow data types conversion in Native input format", 0) \ \ diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index 586e1bb7251..021ccd35602 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -69,6 +69,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.csv.use_best_effort_in_schema_inference = settings.input_format_csv_use_best_effort_in_schema_inference; format_settings.csv.skip_first_lines = settings.input_format_csv_skip_first_lines; format_settings.csv.try_detect_header = settings.input_format_csv_detect_header; + format_settings.csv.skip_trailing_empty_lines = settings.input_format_csv_skip_trailing_empty_lines; format_settings.hive_text.fields_delimiter = settings.input_format_hive_text_fields_delimiter; format_settings.hive_text.collection_items_delimiter = settings.input_format_hive_text_collection_items_delimiter; format_settings.hive_text.map_keys_delimiter = settings.input_format_hive_text_map_keys_delimiter; @@ -80,6 +81,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.custom.row_before_delimiter = settings.format_custom_row_before_delimiter; format_settings.custom.row_between_delimiter = settings.format_custom_row_between_delimiter; format_settings.custom.try_detect_header = settings.input_format_custom_detect_header; + format_settings.custom.skip_trailing_empty_lines = settings.input_format_custom_skip_trailing_empty_lines; format_settings.date_time_input_format = settings.date_time_input_format; format_settings.date_time_output_format = settings.date_time_output_format; format_settings.input_format_ipv4_default_on_conversion_error = settings.input_format_ipv4_default_on_conversion_error; @@ -149,6 +151,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.tsv.use_best_effort_in_schema_inference = settings.input_format_tsv_use_best_effort_in_schema_inference; format_settings.tsv.skip_first_lines = settings.input_format_tsv_skip_first_lines; format_settings.tsv.try_detect_header = settings.input_format_tsv_detect_header; + format_settings.tsv.skip_trailing_empty_lines = settings.input_format_tsv_skip_trailing_empty_lines; format_settings.values.accurate_types_of_literals = settings.input_format_values_accurate_types_of_literals; format_settings.values.deduce_templates_of_expressions = settings.input_format_values_deduce_templates_of_expressions; format_settings.values.interpret_expressions = settings.input_format_values_interpret_expressions; diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index e332bd749a1..9d6cd384b68 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -136,6 +136,7 @@ struct FormatSettings UInt64 skip_first_lines = 0; String custom_delimiter; bool try_detect_header = true; + bool skip_trailing_empty_lines = false; } csv; struct HiveText @@ -156,6 +157,7 @@ struct FormatSettings std::string field_delimiter; EscapingRule escaping_rule = EscapingRule::Escaped; bool try_detect_header = true; + bool skip_trailing_empty_lines = false; } custom; struct @@ -291,6 +293,7 @@ struct FormatSettings bool use_best_effort_in_schema_inference = true; UInt64 skip_first_lines = 0; bool try_detect_header = true; + bool skip_trailing_empty_lines = false; } tsv; struct diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp index de955d81651..f01f20a0a3c 100644 --- a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp @@ -322,6 +322,20 @@ void CSVFormatReader::setReadBuffer(ReadBuffer & in_) FormatWithNamesAndTypesReader::setReadBuffer(*buf); } +bool CSVFormatReader::checkForSuffix() +{ + if (!format_settings.csv.skip_trailing_empty_lines) + return buf->eof(); + + PeekableReadBufferCheckpoint checkpoint(*buf); + while (checkChar('\n', *buf) || checkChar('\r', *buf)); + if (buf->eof()) + return true; + + buf->rollbackToCheckpoint(); + return false; +} + CSVSchemaReader::CSVSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, const FormatSettings & format_settings_) : FormatWithNamesAndTypesSchemaReader( buf, diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.h b/src/Processors/Formats/Impl/CSVRowInputFormat.h index f51f674e4af..0c8099a216c 100644 --- a/src/Processors/Formats/Impl/CSVRowInputFormat.h +++ b/src/Processors/Formats/Impl/CSVRowInputFormat.h @@ -75,6 +75,7 @@ public: std::vector readRow() { return readRowImpl(); } std::vector readRowForHeaderDetection() override { return readHeaderRow(); } + bool checkForSuffix() override; template std::vector readRowImpl(); diff --git a/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp b/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp index 1c2efe3a41d..1e67db79a2c 100644 --- a/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp @@ -283,6 +283,8 @@ bool CustomSeparatedFormatReader::checkForSuffixImpl(bool check_eof) /// Allow optional \n before eof. checkChar('\n', *buf); + if (format_settings.custom.skip_trailing_empty_lines) + while (checkChar('\n', *buf) || checkChar('\r', *buf)); return buf->eof(); } @@ -294,6 +296,8 @@ bool CustomSeparatedFormatReader::checkForSuffixImpl(bool check_eof) /// Allow optional \n before eof. checkChar('\n', *buf); + if (format_settings.custom.skip_trailing_empty_lines) + while (checkChar('\n', *buf) || checkChar('\r', *buf)); if (buf->eof()) return true; } diff --git a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp index af5f1f90732..2239c8539e3 100644 --- a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp @@ -286,6 +286,20 @@ void TabSeparatedFormatReader::setReadBuffer(ReadBuffer & in_) FormatWithNamesAndTypesReader::setReadBuffer(*buf); } +bool TabSeparatedFormatReader::checkForSuffix() +{ + if (!format_settings.tsv.skip_trailing_empty_lines) + return buf->eof(); + + PeekableReadBufferCheckpoint checkpoint(*buf); + while (checkChar('\n', *buf) || checkChar('\r', *buf)); + if (buf->eof()) + return true; + + buf->rollbackToCheckpoint(); + return false; +} + TabSeparatedSchemaReader::TabSeparatedSchemaReader( ReadBuffer & in_, bool with_names_, bool with_types_, bool is_raw_, const FormatSettings & format_settings_) : FormatWithNamesAndTypesSchemaReader( diff --git a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h index 0f4bff8d7d0..8df57675cf5 100644 --- a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h +++ b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h @@ -75,6 +75,8 @@ public: void setReadBuffer(ReadBuffer & in_) override; + bool checkForSuffix() override; + private: template std::vector readRowImpl(); diff --git a/tests/queries/0_stateless/02771_tsv_csv_custom_skip_trailing_empty_lines.reference b/tests/queries/0_stateless/02771_tsv_csv_custom_skip_trailing_empty_lines.reference new file mode 100644 index 00000000000..37e32ce62ee --- /dev/null +++ b/tests/queries/0_stateless/02771_tsv_csv_custom_skip_trailing_empty_lines.reference @@ -0,0 +1,3 @@ +1 2 +1 2 +1 2 diff --git a/tests/queries/0_stateless/02771_tsv_csv_custom_skip_trailing_empty_lines.sql b/tests/queries/0_stateless/02771_tsv_csv_custom_skip_trailing_empty_lines.sql new file mode 100644 index 00000000000..917a434cd58 --- /dev/null +++ b/tests/queries/0_stateless/02771_tsv_csv_custom_skip_trailing_empty_lines.sql @@ -0,0 +1,12 @@ +select * from format(TSV, 'x UInt32, y UInt32', '1\t2\n\n') settings input_format_tsv_skip_trailing_empty_lines=0; -- {serverError CANNOT_PARSE_INPUT_ASSERTION_FAILED} +select * from format(TSV, 'x UInt32, y UInt32', '1\t2\n\n') settings input_format_tsv_skip_trailing_empty_lines=1; +select * from format(TSV, 'x UInt32, y UInt32', '1\t2\n\n1\t2\n') settings input_format_tsv_skip_trailing_empty_lines=1; -- {serverError CANNOT_PARSE_INPUT_ASSERTION_FAILED} + +select * from format(CSV, 'x UInt32, y UInt32', '1,2\n\n') settings input_format_csv_skip_trailing_empty_lines=0; -- {serverError CANNOT_PARSE_INPUT_ASSERTION_FAILED} +select * from format(CSV, 'x UInt32, y UInt32', '1,2\n\n') settings input_format_csv_skip_trailing_empty_lines=1; +select * from format(CSV, 'x UInt32, y UInt32', '1,2\n\n1,2\n') settings input_format_csv_skip_trailing_empty_lines=1; -- {serverError CANNOT_PARSE_INPUT_ASSERTION_FAILED} + +select * from format(CustomSeparated, 'x UInt32, y UInt32', '1\t2\n\n\n') settings input_format_custom_skip_trailing_empty_lines=0; -- {serverError CANNOT_PARSE_INPUT_ASSERTION_FAILED} +select * from format(CustomSeparated, 'x UInt32, y UInt32', '1\t2\n\n\n') settings input_format_custom_skip_trailing_empty_lines=1; +select * from format(CustomSeparated, 'x UInt32, y UInt32', '1\t2\n\n\n1\t2\n\n\n') settings input_format_custom_skip_trailing_empty_lines=1; -- {serverError CANNOT_PARSE_INPUT_ASSERTION_FAILED} + From b6c8ce30ec5d8eeff846b273ed8aa2e6b00241f0 Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Tue, 6 Jun 2023 19:38:11 +0200 Subject: [PATCH 0570/1072] Disable 01676_clickhouse_client_autocomplete under UBSan --- .../queries/0_stateless/01676_clickhouse_client_autocomplete.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/01676_clickhouse_client_autocomplete.sh b/tests/queries/0_stateless/01676_clickhouse_client_autocomplete.sh index 42ae5e84f44..db62dedb5b4 100755 --- a/tests/queries/0_stateless/01676_clickhouse_client_autocomplete.sh +++ b/tests/queries/0_stateless/01676_clickhouse_client_autocomplete.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Tags: long +# Tags: long, no-ubsan CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh From d902592703757c161c8502dd07793c7da62f9b17 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 6 Jun 2023 19:38:15 +0200 Subject: [PATCH 0571/1072] Fix new tests --- .../test_storage_azure_blob_storage/test.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/integration/test_storage_azure_blob_storage/test.py b/tests/integration/test_storage_azure_blob_storage/test.py index 17817ca4e8e..e4e459428d0 100644 --- a/tests/integration/test_storage_azure_blob_storage/test.py +++ b/tests/integration/test_storage_azure_blob_storage/test.py @@ -467,14 +467,14 @@ def test_simple_write_account_string_table_function(cluster): def test_simple_write_connection_string_table_function(cluster): node = cluster.instances["node"] - azure_query(node, "INSERT INTO TABLE FUNCTION azure_blob('DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://azurite1:10000/devstoreaccount1;', 'cont', 'test_simple_write_connection_tf.csv', 'CSV', 'auto', 'key UInt64, data String') VALUES (1, 'a')") + azure_query(node, "INSERT INTO TABLE FUNCTION azure_blob_storage('DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://azurite1:10000/devstoreaccount1;', 'cont', 'test_simple_write_connection_tf.csv', 'CSV', 'auto', 'key UInt64, data String') VALUES (1, 'a')") print(get_azure_file_content("test_simple_write_connection_tf.csv")) assert get_azure_file_content("test_simple_write_connection_tf.csv") == '1,"a"\n' def test_simple_write_named_collection_1_table_function(cluster): node = cluster.instances["node"] - azure_query(node, "INSERT INTO TABLE FUNCTION azure_blob(azure_conf1) VALUES (1, 'a')") + azure_query(node, "INSERT INTO TABLE FUNCTION azure_blob_storage(azure_conf1) VALUES (1, 'a')") print(get_azure_file_content("test_simple_write_named.csv")) assert get_azure_file_content("test_simple_write_named.csv") == '1,"a"\n' @@ -482,7 +482,7 @@ def test_simple_write_named_collection_1_table_function(cluster): def test_simple_write_named_collection_2_table_function(cluster): node = cluster.instances["node"] - azure_query(node, "INSERT INTO TABLE FUNCTION azure_blob(azure_conf2, container='cont', blob_path='test_simple_write_named_2_tf.csv', format='CSV', structure='key UInt64, data String') VALUES (1, 'a')") + azure_query(node, "INSERT INTO TABLE FUNCTION azure_blob_storage(azure_conf2, container='cont', blob_path='test_simple_write_named_2_tf.csv', format='CSV', structure='key UInt64, data String') VALUES (1, 'a')") print(get_azure_file_content("test_simple_write_named_2_tf.csv")) assert get_azure_file_content("test_simple_write_named_2_tf.csv") == '1,"a"\n' @@ -502,9 +502,9 @@ def test_put_get_with_globs_tf(cluster): azure_query( node, - f"INSERT INTO TABLE FUNCTION azure_blob(azure_conf2, container='cont', blob_path='{path}', format='CSV', compression='auto', structure='{table_format}') VALUES {values}", + f"INSERT INTO TABLE FUNCTION azure_blob_storage(azure_conf2, container='cont', blob_path='{path}', format='CSV', compression='auto', structure='{table_format}') VALUES {values}", ) - query = f"select sum(column1), sum(column2), sum(column3), min(_file), max(_path) from azure_blob(azure_conf2, container='cont', blob_path='{unique_prefix}/*_{{a,b,c,d}}/?.csv', format='CSV', structure='{table_format}')" + query = f"select sum(column1), sum(column2), sum(column3), min(_file), max(_path) from azure_blob_storage(azure_conf2, container='cont', blob_path='{unique_prefix}/*_{{a,b,c,d}}/?.csv', format='CSV', structure='{table_format}')" assert azure_query(node, query).splitlines() == [ "450\t450\t900\t0.csv\t{bucket}/{max_path}".format( bucket="cont", max_path=max_path @@ -515,10 +515,10 @@ def test_schema_inference_no_globs_tf(cluster): node = cluster.instances["node"] # type: ClickHouseInstance table_format = "column1 UInt32, column2 String, column3 UInt32" - query = f"insert into table function azure_blob(azure_conf2, container='cont', blob_path='test_schema_inference_no_globs_tf.csv', format='CSVWithNames', structure='{table_format}') SELECT number, toString(number), number * number FROM numbers(1000)" + query = f"insert into table function azure_blob_storage(azure_conf2, container='cont', blob_path='test_schema_inference_no_globs_tf.csv', format='CSVWithNames', structure='{table_format}') SELECT number, toString(number), number * number FROM numbers(1000)" azure_query(node, query) - query = "select sum(column1), sum(length(column2)), sum(column3), min(_file), max(_path) from azure_blob(azure_conf2, container='cont', blob_path='test_schema_inference_no_globs_tf.csv')" + query = "select sum(column1), sum(length(column2)), sum(column3), min(_file), max(_path) from azure_blob_storage(azure_conf2, container='cont', blob_path='test_schema_inference_no_globs_tf.csv')" assert azure_query(node, query).splitlines() == [ "499500\t2890\t332833500\ttest_schema_inference_no_globs_tf.csv\tcont/test_schema_inference_no_globs_tf.csv" ] @@ -537,10 +537,10 @@ def test_schema_inference_from_globs_tf(cluster): max_path = max(path, max_path) values = f"({i},{j},{i + j})" - query = f"insert into table function azure_blob(azure_conf2, container='cont', blob_path='{path}', format='CSVWithNames', structure='{table_format}') VALUES {values}" + query = f"insert into table function azure_blob_storage(azure_conf2, container='cont', blob_path='{path}', format='CSVWithNames', structure='{table_format}') VALUES {values}" azure_query(node, query) - query = f"select sum(column1), sum(column2), sum(column3), min(_file), max(_path) from azure_blob(azure_conf2, container='cont', blob_path='{unique_prefix}/*_{{a,b,c,d}}/?.csv')" + query = f"select sum(column1), sum(column2), sum(column3), min(_file), max(_path) from azure_blob_storage(azure_conf2, container='cont', blob_path='{unique_prefix}/*_{{a,b,c,d}}/?.csv')" assert azure_query(node, query).splitlines() == [ "450\t450\t900\t0.csv\t{bucket}/{max_path}".format( bucket="cont", max_path=max_path @@ -554,7 +554,7 @@ def test_partition_by_tf(cluster): values = "(1, 2, 3), (3, 2, 1), (78, 43, 45)" filename = "test_tf_{_partition_id}.csv" - azure_query(node, f"INSERT INTO TABLE FUNCTION azure_blob('http://azurite1:10000/devstoreaccount1', 'cont', '{filename}', 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'CSV', 'auto', '{table_format}') PARTITION BY {partition_by} VALUES {values}") + azure_query(node, f"INSERT INTO TABLE FUNCTION azure_blob_storage('http://azurite1:10000/devstoreaccount1', 'cont', '{filename}', 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'CSV', 'auto', '{table_format}') PARTITION BY {partition_by} VALUES {values}") assert "1,2,3\n" == get_azure_file_content("test_tf_3.csv") assert "3,2,1\n" == get_azure_file_content("test_tf_1.csv") From 934df5e5bb3714cf031104deff914cde9987eb31 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 6 Jun 2023 19:44:41 +0200 Subject: [PATCH 0572/1072] Rename to AzureBlobStorage --- src/Storages/StorageAzure.cpp | 2 +- .../test_storage_azure_blob_storage/test.py | 46 +++++++++---------- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/src/Storages/StorageAzure.cpp b/src/Storages/StorageAzure.cpp index c6001f59b6f..46446940f75 100644 --- a/src/Storages/StorageAzure.cpp +++ b/src/Storages/StorageAzure.cpp @@ -251,7 +251,7 @@ AzureObjectStorage::SettingsPtr StorageAzure::createSettings(ContextPtr local_co void registerStorageAzure(StorageFactory & factory) { - factory.registerStorage("Azure", [](const StorageFactory::Arguments & args) + factory.registerStorage("AzureBlobStorage", [](const StorageFactory::Arguments & args) { auto & engine_args = args.engine_args; if (engine_args.empty()) diff --git a/tests/integration/test_storage_azure_blob_storage/test.py b/tests/integration/test_storage_azure_blob_storage/test.py index e4e459428d0..9dea5d24686 100644 --- a/tests/integration/test_storage_azure_blob_storage/test.py +++ b/tests/integration/test_storage_azure_blob_storage/test.py @@ -81,7 +81,7 @@ def test_create_table_connection_string(cluster): node = cluster.instances["node"] azure_query( node, - "CREATE TABLE test_create_table_conn_string (key UInt64, data String) Engine = Azure('DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://azurite1:10000/devstoreaccount1/;', 'cont', 'test_create_connection_string', 'CSV')", + "CREATE TABLE test_create_table_conn_string (key UInt64, data String) Engine = AzureBlobStorage('DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://azurite1:10000/devstoreaccount1/;', 'cont', 'test_create_connection_string', 'CSV')", ) @@ -89,7 +89,7 @@ def test_create_table_account_string(cluster): node = cluster.instances["node"] azure_query( node, - "CREATE TABLE test_create_table_account_url (key UInt64, data String) Engine = Azure('http://azurite1:10000/devstoreaccount1', 'cont', 'test_create_connection_string', 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'CSV')", + "CREATE TABLE test_create_table_account_url (key UInt64, data String) Engine = AzureBlobStorage('http://azurite1:10000/devstoreaccount1', 'cont', 'test_create_connection_string', 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'CSV')", ) @@ -97,7 +97,7 @@ def test_simple_write_account_string(cluster): node = cluster.instances["node"] azure_query( node, - "CREATE TABLE test_simple_write (key UInt64, data String) Engine = Azure('http://azurite1:10000/devstoreaccount1', 'cont', 'test_simple_write.csv', 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'CSV')", + "CREATE TABLE test_simple_write (key UInt64, data String) Engine = AzureBlobStorage('http://azurite1:10000/devstoreaccount1', 'cont', 'test_simple_write.csv', 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'CSV')", ) azure_query(node, "INSERT INTO test_simple_write VALUES (1, 'a')") print(get_azure_file_content("test_simple_write.csv")) @@ -108,7 +108,7 @@ def test_simple_write_connection_string(cluster): node = cluster.instances["node"] azure_query( node, - "CREATE TABLE test_simple_write_connection_string (key UInt64, data String) Engine = Azure('DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://azurite1:10000/devstoreaccount1;', 'cont', 'test_simple_write_c.csv', 'CSV')", + "CREATE TABLE test_simple_write_connection_string (key UInt64, data String) Engine = AzureBlobStorage('DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://azurite1:10000/devstoreaccount1;', 'cont', 'test_simple_write_c.csv', 'CSV')", ) azure_query(node, "INSERT INTO test_simple_write_connection_string VALUES (1, 'a')") print(get_azure_file_content("test_simple_write_c.csv")) @@ -119,7 +119,7 @@ def test_simple_write_named_collection_1(cluster): node = cluster.instances["node"] azure_query( node, - "CREATE TABLE test_simple_write_named_collection_1 (key UInt64, data String) Engine = Azure(azure_conf1)", + "CREATE TABLE test_simple_write_named_collection_1 (key UInt64, data String) Engine = AzureBlobStorage(azure_conf1)", ) azure_query( node, "INSERT INTO test_simple_write_named_collection_1 VALUES (1, 'a')" @@ -132,7 +132,7 @@ def test_simple_write_named_collection_2(cluster): node = cluster.instances["node"] azure_query( node, - "CREATE TABLE test_simple_write_named_collection_2 (key UInt64, data String) Engine = Azure(azure_conf2, container='cont', blob_path='test_simple_write_named_2.csv', format='CSV')", + "CREATE TABLE test_simple_write_named_collection_2 (key UInt64, data String) Engine = AzureBlobStorage(azure_conf2, container='cont', blob_path='test_simple_write_named_2.csv', format='CSV')", ) azure_query( node, "INSERT INTO test_simple_write_named_collection_2 VALUES (1, 'a')" @@ -150,7 +150,7 @@ def test_partition_by(cluster): azure_query( node, - f"CREATE TABLE test_partitioned_write ({table_format}) Engine = Azure('http://azurite1:10000/devstoreaccount1', 'cont', '{filename}', 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'CSV') PARTITION BY {partition_by}", + f"CREATE TABLE test_partitioned_write ({table_format}) Engine = AzureBlobStorage('http://azurite1:10000/devstoreaccount1', 'cont', '{filename}', 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'CSV') PARTITION BY {partition_by}", ) azure_query(node, f"INSERT INTO test_partitioned_write VALUES {values}") @@ -167,7 +167,7 @@ def test_partition_by_string_column(cluster): filename = "test_{_partition_id}.csv" azure_query( node, - f"CREATE TABLE test_partitioned_string_write ({table_format}) Engine = Azure('http://azurite1:10000/devstoreaccount1', 'cont', '{filename}', 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'CSV') PARTITION BY {partition_by}", + f"CREATE TABLE test_partitioned_string_write ({table_format}) Engine = AzureBlobStorage('http://azurite1:10000/devstoreaccount1', 'cont', '{filename}', 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'CSV') PARTITION BY {partition_by}", ) azure_query(node, f"INSERT INTO test_partitioned_string_write VALUES {values}") @@ -185,7 +185,7 @@ def test_partition_by_const_column(cluster): filename = "test_{_partition_id}.csv" azure_query( node, - f"CREATE TABLE test_partitioned_const_write ({table_format}) Engine = Azure('http://azurite1:10000/devstoreaccount1', 'cont', '{filename}', 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'CSV') PARTITION BY {partition_by}", + f"CREATE TABLE test_partitioned_const_write ({table_format}) Engine = AzureBlobStorage('http://azurite1:10000/devstoreaccount1', 'cont', '{filename}', 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'CSV') PARTITION BY {partition_by}", ) azure_query(node, f"INSERT INTO test_partitioned_const_write VALUES {values}") assert values_csv == get_azure_file_content("test_88.csv") @@ -195,7 +195,7 @@ def test_truncate(cluster): node = cluster.instances["node"] azure_query( node, - "CREATE TABLE test_truncate (key UInt64, data String) Engine = Azure(azure_conf2, container='cont', blob_path='test_truncate.csv', format='CSV')", + "CREATE TABLE test_truncate (key UInt64, data String) Engine = AzureBlobStorage(azure_conf2, container='cont', blob_path='test_truncate.csv', format='CSV')", ) azure_query(node, "INSERT INTO test_truncate VALUES (1, 'a')") assert get_azure_file_content("test_truncate.csv") == '1,"a"\n' @@ -208,7 +208,7 @@ def test_simple_read_write(cluster): node = cluster.instances["node"] azure_query( node, - "CREATE TABLE test_simple_read_write (key UInt64, data String) Engine = Azure(azure_conf2, container='cont', blob_path='test_simple_read_write.csv', format='CSV')", + "CREATE TABLE test_simple_read_write (key UInt64, data String) Engine = AzureBlobStorage(azure_conf2, container='cont', blob_path='test_simple_read_write.csv', format='CSV')", ) azure_query(node, "INSERT INTO test_simple_read_write VALUES (1, 'a')") @@ -222,7 +222,7 @@ def test_create_new_files_on_insert(cluster): azure_query( node, - f"create table test_multiple_inserts(a Int32, b String) ENGINE = Azure(azure_conf2, container='cont', blob_path='test_parquet', format='Parquet')", + f"create table test_multiple_inserts(a Int32, b String) ENGINE = AzureBlobStorage(azure_conf2, container='cont', blob_path='test_parquet', format='Parquet')", ) azure_query(node, "truncate table test_multiple_inserts") azure_query( @@ -249,7 +249,7 @@ def test_overwrite(cluster): azure_query( node, - f"create table test_overwrite(a Int32, b String) ENGINE = Azure(azure_conf2, container='cont', blob_path='test_parquet_overwrite', format='Parquet')", + f"create table test_overwrite(a Int32, b String) ENGINE = AzureBlobStorage(azure_conf2, container='cont', blob_path='test_parquet_overwrite', format='Parquet')", ) azure_query(node, "truncate table test_overwrite") @@ -273,7 +273,7 @@ def test_insert_with_path_with_globs(cluster): node = cluster.instances["node"] azure_query( node, - f"create table test_insert_globs(a Int32, b String) ENGINE = Azure(azure_conf2, container='cont', blob_path='test_insert_with_globs*', format='Parquet')", + f"create table test_insert_globs(a Int32, b String) ENGINE = AzureBlobStorage(azure_conf2, container='cont', blob_path='test_insert_with_globs*', format='Parquet')", ) node.query_and_get_error( f"insert into table function test_insert_globs SELECT number, randomString(100) FROM numbers(500)" @@ -296,7 +296,7 @@ def test_put_get_with_globs(cluster): azure_query( node, - f"CREATE TABLE test_{i}_{j} ({table_format}) Engine = Azure(azure_conf2, container='cont', blob_path='{path}', format='CSV')", + f"CREATE TABLE test_{i}_{j} ({table_format}) Engine = AzureBlobStorage(azure_conf2, container='cont', blob_path='{path}', format='CSV')", ) query = f"insert into test_{i}_{j} VALUES {values}" @@ -304,7 +304,7 @@ def test_put_get_with_globs(cluster): azure_query( node, - f"CREATE TABLE test_glob_select ({table_format}) Engine = Azure(azure_conf2, container='cont', blob_path='{unique_prefix}/*_{{a,b,c,d}}/?.csv', format='CSV')", + f"CREATE TABLE test_glob_select ({table_format}) Engine = AzureBlobStorage(azure_conf2, container='cont', blob_path='{unique_prefix}/*_{{a,b,c,d}}/?.csv', format='CSV')", ) query = "select sum(column1), sum(column2), sum(column3), min(_file), max(_path) from test_glob_select" assert azure_query(node, query).splitlines() == [ @@ -328,7 +328,7 @@ def test_azure_glob_scheherazade(cluster): unique_num = random.randint(1, 10000) azure_query( node, - f"CREATE TABLE test_{i}_{unique_num} ({table_format}) Engine = Azure(azure_conf2, container='cont', blob_path='{path}', format='CSV')", + f"CREATE TABLE test_{i}_{unique_num} ({table_format}) Engine = AzureBlobStorage(azure_conf2, container='cont', blob_path='{path}', format='CSV')", ) query = f"insert into test_{i}_{unique_num} VALUES {values}" azure_query(node, query) @@ -345,7 +345,7 @@ def test_azure_glob_scheherazade(cluster): azure_query( node, - f"CREATE TABLE test_glob_select_scheherazade ({table_format}) Engine = Azure(azure_conf2, container='cont', blob_path='night_*/tale.csv', format='CSV')", + f"CREATE TABLE test_glob_select_scheherazade ({table_format}) Engine = AzureBlobStorage(azure_conf2, container='cont', blob_path='night_*/tale.csv', format='CSV')", ) query = "select count(), sum(column1), sum(column2), sum(column3) from test_glob_select_scheherazade" assert azure_query(node, query).splitlines() == ["1001\t1001\t1001\t1001"] @@ -387,7 +387,7 @@ def test_storage_azure_get_gzip(cluster, extension, method): azure_query( node, - f"""CREATE TABLE {name} (name String, id UInt32) ENGINE = Azure( + f"""CREATE TABLE {name} (name String, id UInt32) ENGINE = AzureBlobStorage( azure_conf2, container='cont', blob_path ='{filename}', format='CSV', compression='{method}')""", @@ -402,7 +402,7 @@ def test_schema_inference_no_globs(cluster): table_format = "column1 UInt32, column2 String, column3 UInt32" azure_query( node, - f"CREATE TABLE test_schema_inference_src ({table_format}) Engine = Azure(azure_conf2, container='cont', blob_path='test_schema_inference_no_globs.csv', format='CSVWithNames')", + f"CREATE TABLE test_schema_inference_src ({table_format}) Engine = AzureBlobStorage(azure_conf2, container='cont', blob_path='test_schema_inference_no_globs.csv', format='CSVWithNames')", ) query = f"insert into test_schema_inference_src SELECT number, toString(number), number * number FROM numbers(1000)" @@ -410,7 +410,7 @@ def test_schema_inference_no_globs(cluster): azure_query( node, - f"CREATE TABLE test_select_inference Engine = Azure(azure_conf2, container='cont', blob_path='test_schema_inference_no_globs.csv')", + f"CREATE TABLE test_select_inference Engine = AzureBlobStorage(azure_conf2, container='cont', blob_path='test_schema_inference_no_globs.csv')", ) print(node.query("SHOW CREATE TABLE test_select_inference")) @@ -437,7 +437,7 @@ def test_schema_inference_from_globs(cluster): azure_query( node, - f"CREATE TABLE test_schema_{i}_{j} ({table_format}) Engine = Azure(azure_conf2, container='cont', blob_path='{path}', format='CSVWithNames')", + f"CREATE TABLE test_schema_{i}_{j} ({table_format}) Engine = AzureBlobStorage(azure_conf2, container='cont', blob_path='{path}', format='CSVWithNames')", ) query = f"insert into test_schema_{i}_{j} VALUES {values}" @@ -445,7 +445,7 @@ def test_schema_inference_from_globs(cluster): azure_query( node, - f"CREATE TABLE test_glob_select_inference Engine = Azure(azure_conf2, container='cont', blob_path='{unique_prefix}/*_{{a,b,c,d}}/?.csv')", + f"CREATE TABLE test_glob_select_inference Engine = AzureBlobStorage(azure_conf2, container='cont', blob_path='{unique_prefix}/*_{{a,b,c,d}}/?.csv')", ) print(node.query("SHOW CREATE TABLE test_glob_select_inference")) From e9c267ed696f30d440b35ed3bb215d550eb8aec7 Mon Sep 17 00:00:00 2001 From: avogar Date: Tue, 6 Jun 2023 17:51:53 +0000 Subject: [PATCH 0573/1072] Fix converting Null to LowCardinality(Nullable) in values table function --- src/Interpreters/convertFieldToType.cpp | 2 +- .../0_stateless/02782_values_null_to_lc_nullable.reference | 1 + tests/queries/0_stateless/02782_values_null_to_lc_nullable.sql | 2 ++ 3 files changed, 4 insertions(+), 1 deletion(-) create mode 100644 tests/queries/0_stateless/02782_values_null_to_lc_nullable.reference create mode 100644 tests/queries/0_stateless/02782_values_null_to_lc_nullable.sql diff --git a/src/Interpreters/convertFieldToType.cpp b/src/Interpreters/convertFieldToType.cpp index dc61e748db6..3e8fab80aaf 100644 --- a/src/Interpreters/convertFieldToType.cpp +++ b/src/Interpreters/convertFieldToType.cpp @@ -534,7 +534,7 @@ Field convertFieldToType(const Field & from_value, const IDataType & to_type, co Field convertFieldToTypeOrThrow(const Field & from_value, const IDataType & to_type, const IDataType * from_type_hint) { bool is_null = from_value.isNull(); - if (is_null && !to_type.isNullable()) + if (is_null && !to_type.isNullable() && !to_type.isLowCardinalityNullable()) throw Exception(ErrorCodes::TYPE_MISMATCH, "Cannot convert NULL to {}", to_type.getName()); Field converted = convertFieldToType(from_value, to_type, from_type_hint); diff --git a/tests/queries/0_stateless/02782_values_null_to_lc_nullable.reference b/tests/queries/0_stateless/02782_values_null_to_lc_nullable.reference new file mode 100644 index 00000000000..dec7d2fabd2 --- /dev/null +++ b/tests/queries/0_stateless/02782_values_null_to_lc_nullable.reference @@ -0,0 +1 @@ +\N diff --git a/tests/queries/0_stateless/02782_values_null_to_lc_nullable.sql b/tests/queries/0_stateless/02782_values_null_to_lc_nullable.sql new file mode 100644 index 00000000000..250fe6b7551 --- /dev/null +++ b/tests/queries/0_stateless/02782_values_null_to_lc_nullable.sql @@ -0,0 +1,2 @@ +select * from values('s LowCardinality(Nullable(String))', (NULL)); + From 95b054b42523f3e8271a095bbb570c1d02181ccd Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Tue, 6 Jun 2023 18:01:22 +0000 Subject: [PATCH 0574/1072] Automatic style fix --- .../test_storage_azure_blob_storage/test.py | 29 +++++++++++++++---- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/tests/integration/test_storage_azure_blob_storage/test.py b/tests/integration/test_storage_azure_blob_storage/test.py index 9dea5d24686..8a0a68f5200 100644 --- a/tests/integration/test_storage_azure_blob_storage/test.py +++ b/tests/integration/test_storage_azure_blob_storage/test.py @@ -460,21 +460,30 @@ def test_schema_inference_from_globs(cluster): def test_simple_write_account_string_table_function(cluster): node = cluster.instances["node"] - azure_query(node, "INSERT INTO TABLE FUNCTION azure_blob_storage('http://azurite1:10000/devstoreaccount1', 'cont', 'test_simple_write_tf.csv', 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'CSV', 'auto', 'key UInt64, data String') VALUES (1, 'a')") + azure_query( + node, + "INSERT INTO TABLE FUNCTION azure_blob_storage('http://azurite1:10000/devstoreaccount1', 'cont', 'test_simple_write_tf.csv', 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'CSV', 'auto', 'key UInt64, data String') VALUES (1, 'a')", + ) print(get_azure_file_content("test_simple_write_tf.csv")) assert get_azure_file_content("test_simple_write_tf.csv") == '1,"a"\n' def test_simple_write_connection_string_table_function(cluster): node = cluster.instances["node"] - azure_query(node, "INSERT INTO TABLE FUNCTION azure_blob_storage('DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://azurite1:10000/devstoreaccount1;', 'cont', 'test_simple_write_connection_tf.csv', 'CSV', 'auto', 'key UInt64, data String') VALUES (1, 'a')") + azure_query( + node, + "INSERT INTO TABLE FUNCTION azure_blob_storage('DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://azurite1:10000/devstoreaccount1;', 'cont', 'test_simple_write_connection_tf.csv', 'CSV', 'auto', 'key UInt64, data String') VALUES (1, 'a')", + ) print(get_azure_file_content("test_simple_write_connection_tf.csv")) assert get_azure_file_content("test_simple_write_connection_tf.csv") == '1,"a"\n' def test_simple_write_named_collection_1_table_function(cluster): node = cluster.instances["node"] - azure_query(node, "INSERT INTO TABLE FUNCTION azure_blob_storage(azure_conf1) VALUES (1, 'a')") + azure_query( + node, + "INSERT INTO TABLE FUNCTION azure_blob_storage(azure_conf1) VALUES (1, 'a')", + ) print(get_azure_file_content("test_simple_write_named.csv")) assert get_azure_file_content("test_simple_write_named.csv") == '1,"a"\n' @@ -482,10 +491,14 @@ def test_simple_write_named_collection_1_table_function(cluster): def test_simple_write_named_collection_2_table_function(cluster): node = cluster.instances["node"] - azure_query(node, "INSERT INTO TABLE FUNCTION azure_blob_storage(azure_conf2, container='cont', blob_path='test_simple_write_named_2_tf.csv', format='CSV', structure='key UInt64, data String') VALUES (1, 'a')") + azure_query( + node, + "INSERT INTO TABLE FUNCTION azure_blob_storage(azure_conf2, container='cont', blob_path='test_simple_write_named_2_tf.csv', format='CSV', structure='key UInt64, data String') VALUES (1, 'a')", + ) print(get_azure_file_content("test_simple_write_named_2_tf.csv")) assert get_azure_file_content("test_simple_write_named_2_tf.csv") == '1,"a"\n' + def test_put_get_with_globs_tf(cluster): # type: (ClickHouseCluster) -> None unique_prefix = random.randint(1, 10000) @@ -511,6 +524,7 @@ def test_put_get_with_globs_tf(cluster): ) ] + def test_schema_inference_no_globs_tf(cluster): node = cluster.instances["node"] # type: ClickHouseInstance table_format = "column1 UInt32, column2 String, column3 UInt32" @@ -523,6 +537,7 @@ def test_schema_inference_no_globs_tf(cluster): "499500\t2890\t332833500\ttest_schema_inference_no_globs_tf.csv\tcont/test_schema_inference_no_globs_tf.csv" ] + def test_schema_inference_from_globs_tf(cluster): node = cluster.instances["node"] unique_prefix = random.randint(1, 10000) @@ -547,6 +562,7 @@ def test_schema_inference_from_globs_tf(cluster): ) ] + def test_partition_by_tf(cluster): node = cluster.instances["node"] table_format = "column1 UInt32, column2 UInt32, column3 UInt32" @@ -554,7 +570,10 @@ def test_partition_by_tf(cluster): values = "(1, 2, 3), (3, 2, 1), (78, 43, 45)" filename = "test_tf_{_partition_id}.csv" - azure_query(node, f"INSERT INTO TABLE FUNCTION azure_blob_storage('http://azurite1:10000/devstoreaccount1', 'cont', '{filename}', 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'CSV', 'auto', '{table_format}') PARTITION BY {partition_by} VALUES {values}") + azure_query( + node, + f"INSERT INTO TABLE FUNCTION azure_blob_storage('http://azurite1:10000/devstoreaccount1', 'cont', '{filename}', 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'CSV', 'auto', '{table_format}') PARTITION BY {partition_by} VALUES {values}", + ) assert "1,2,3\n" == get_azure_file_content("test_tf_3.csv") assert "3,2,1\n" == get_azure_file_content("test_tf_1.csv") From 49b019b26dfb674d31fa752335711c135377df4c Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Tue, 6 Jun 2023 20:23:20 +0200 Subject: [PATCH 0575/1072] Refactored TableFunction name to TableFunctionAzureBlobStorage --- src/TableFunctions/ITableFunctionCluster.h | 10 ++++----- ....cpp => TableFunctionAzureBlobStorage.cpp} | 22 +++++++++---------- ...zure.h => TableFunctionAzureBlobStorage.h} | 2 +- src/TableFunctions/registerTableFunctions.cpp | 2 +- src/TableFunctions/registerTableFunctions.h | 2 +- 5 files changed, 19 insertions(+), 19 deletions(-) rename src/TableFunctions/{TableFunctionAzure.cpp => TableFunctionAzureBlobStorage.cpp} (88%) rename src/TableFunctions/{TableFunctionAzure.h => TableFunctionAzureBlobStorage.h} (97%) diff --git a/src/TableFunctions/ITableFunctionCluster.h b/src/TableFunctions/ITableFunctionCluster.h index f68558596ca..a8329684ee6 100644 --- a/src/TableFunctions/ITableFunctionCluster.h +++ b/src/TableFunctions/ITableFunctionCluster.h @@ -2,13 +2,13 @@ #include "config.h" -#include -#include -#include -#include -#include #include +#include +#include #include +#include +#include +#include namespace DB diff --git a/src/TableFunctions/TableFunctionAzure.cpp b/src/TableFunctions/TableFunctionAzureBlobStorage.cpp similarity index 88% rename from src/TableFunctions/TableFunctionAzure.cpp rename to src/TableFunctions/TableFunctionAzureBlobStorage.cpp index e2d88a85eec..07bdb0f8393 100644 --- a/src/TableFunctions/TableFunctionAzure.cpp +++ b/src/TableFunctions/TableFunctionAzureBlobStorage.cpp @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include #include #include @@ -46,13 +46,13 @@ bool isConnectionString(const std::string & candidate) } -StorageAzure::Configuration TableFunctionAzure::parseArgumentsImpl(ASTs & engine_args, const ContextPtr & local_context, bool get_format_from_file) +StorageAzure::Configuration TableFunctionAzureBlobStorage::parseArgumentsImpl(ASTs & engine_args, const ContextPtr & local_context, bool get_format_from_file) { StorageAzure::Configuration configuration; /// Supported signatures: /// - /// Azure(connection_string|storage_account_url, container_name, blobpath, [account_name, account_key, format, compression, structure]) + /// AzureBlobStorage(connection_string|storage_account_url, container_name, blobpath, [account_name, account_key, format, compression, structure]) /// if (auto named_collection = tryGetNamedCollectionWithOverrides(engine_args, local_context)) @@ -70,7 +70,7 @@ StorageAzure::Configuration TableFunctionAzure::parseArgumentsImpl(ASTs & engine if (engine_args.size() < 3 || engine_args.size() > 8) throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Storage Azure requires 3 to 7 arguments: " - "Azure(connection_string|storage_account_url, container_name, blobpath, [account_name, account_key, format, compression, structure])"); + "AzureBlobStorage(connection_string|storage_account_url, container_name, blobpath, [account_name, account_key, format, compression, structure])"); for (auto & engine_arg : engine_args) engine_arg = evaluateConstantExpressionOrIdentifierAsLiteral(engine_arg, local_context); @@ -181,7 +181,7 @@ StorageAzure::Configuration TableFunctionAzure::parseArgumentsImpl(ASTs & engine return configuration; } -void TableFunctionAzure::parseArguments(const ASTPtr & ast_function, ContextPtr context) +void TableFunctionAzureBlobStorage::parseArguments(const ASTPtr & ast_function, ContextPtr context) { /// Clone ast function, because we can modify its arguments like removing headers. auto ast_copy = ast_function->clone(); @@ -197,7 +197,7 @@ void TableFunctionAzure::parseArguments(const ASTPtr & ast_function, ContextPtr LOG_DEBUG(&Poco::Logger::get("DEBUG"), "CONFIGURATION {}", configuration.connection_url); } -ColumnsDescription TableFunctionAzure::getActualTableStructure(ContextPtr context) const +ColumnsDescription TableFunctionAzureBlobStorage::getActualTableStructure(ContextPtr context) const { if (configuration.structure == "auto") { @@ -205,19 +205,19 @@ ColumnsDescription TableFunctionAzure::getActualTableStructure(ContextPtr contex auto client = StorageAzure::createClient(configuration); auto settings = StorageAzure::createSettings(context); - auto object_storage = std::make_unique("AzureTableFunction", std::move(client), std::move(settings)); + auto object_storage = std::make_unique("AzureBlobStorageTableFunction", std::move(client), std::move(settings)); return StorageAzure::getTableStructureFromData(object_storage.get(), configuration, std::nullopt, context); } return parseColumnsListFromString(configuration.structure, context); } -bool TableFunctionAzure::supportsReadingSubsetOfColumns() +bool TableFunctionAzureBlobStorage::supportsReadingSubsetOfColumns() { return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(configuration.format); } -StoragePtr TableFunctionAzure::executeImpl(const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/) const +StoragePtr TableFunctionAzureBlobStorage::executeImpl(const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/) const { auto client = StorageAzure::createClient(configuration); auto settings = StorageAzure::createSettings(context); @@ -245,9 +245,9 @@ StoragePtr TableFunctionAzure::executeImpl(const ASTPtr & /*ast_function*/, Cont return storage; } -void registerTableFunctionAzure(TableFunctionFactory & factory) +void registerTableFunctionAzureBlobStorage(TableFunctionFactory & factory) { - factory.registerFunction( + factory.registerFunction( {.documentation = {.description=R"(The table function can be used to read the data stored on Azure Blob Storage.)", .examples{{"azure_blob_storage", "SELECT * FROM azure_blob_storage(connection, container, blob_path, format, structure)", ""}}}, diff --git a/src/TableFunctions/TableFunctionAzure.h b/src/TableFunctions/TableFunctionAzureBlobStorage.h similarity index 97% rename from src/TableFunctions/TableFunctionAzure.h rename to src/TableFunctions/TableFunctionAzureBlobStorage.h index e2815973010..6f3a3422958 100644 --- a/src/TableFunctions/TableFunctionAzure.h +++ b/src/TableFunctions/TableFunctionAzureBlobStorage.h @@ -15,7 +15,7 @@ class Context; /* AzureBlob(source, [access_key_id, secret_access_key,] [format, structure, compression]) - creates a temporary storage for a file in AzureBlob. */ -class TableFunctionAzure : public ITableFunction +class TableFunctionAzureBlobStorage : public ITableFunction { public: static constexpr auto name = "azure_blob_storage"; diff --git a/src/TableFunctions/registerTableFunctions.cpp b/src/TableFunctions/registerTableFunctions.cpp index e0114368e44..0499524a912 100644 --- a/src/TableFunctions/registerTableFunctions.cpp +++ b/src/TableFunctions/registerTableFunctions.cpp @@ -73,7 +73,7 @@ void registerTableFunctions() registerTableFunctionExplain(factory); #if USE_AZURE_BLOB_STORAGE - registerTableFunctionAzure(factory); + registerTableFunctionAzureBlobStorage(factory); #endif diff --git a/src/TableFunctions/registerTableFunctions.h b/src/TableFunctions/registerTableFunctions.h index fa4fec2b03a..393bc080a3d 100644 --- a/src/TableFunctions/registerTableFunctions.h +++ b/src/TableFunctions/registerTableFunctions.h @@ -70,7 +70,7 @@ void registerTableFunctionFormat(TableFunctionFactory & factory); void registerTableFunctionExplain(TableFunctionFactory & factory); #if USE_AZURE_BLOB_STORAGE -void registerTableFunctionAzure(TableFunctionFactory & factory); +void registerTableFunctionAzureBlobStorage(TableFunctionFactory & factory); #endif void registerTableFunctions(); From ceab5117a923be20e8caf46d6330cdb76e0f9a6b Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 6 Jun 2023 20:39:54 +0200 Subject: [PATCH 0576/1072] Fxi style --- src/Storages/StorageAzure.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/Storages/StorageAzure.cpp b/src/Storages/StorageAzure.cpp index 46446940f75..f683a62e8e1 100644 --- a/src/Storages/StorageAzure.cpp +++ b/src/Storages/StorageAzure.cpp @@ -12,7 +12,6 @@ #include #include -#include #include #include #include @@ -29,7 +28,6 @@ #include #include #include -#include #include #include #include @@ -806,7 +804,7 @@ StorageAzureSource::Iterator::Iterator( , outer_blobs(outer_blobs_) { if (keys.has_value() && blob_path_with_globs.has_value()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot specify keys and glob simulatenously it's a bug"); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot specify keys and glob simultaneously it's a bug"); if (!keys.has_value() && !blob_path_with_globs.has_value()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Both keys and glob mask are not specified"); From 5637858182e98070a9435e858c83ccaca316dd0a Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 6 Jun 2023 21:06:45 +0200 Subject: [PATCH 0577/1072] Fix the most important check in the world --- src/Storages/StorageAzure.cpp | 3 +++ src/Storages/StorageAzure.h | 5 ----- src/Storages/registerStorages.cpp | 4 ---- src/TableFunctions/TableFunctionAzureBlobStorage.cpp | 2 +- 4 files changed, 4 insertions(+), 10 deletions(-) diff --git a/src/Storages/StorageAzure.cpp b/src/Storages/StorageAzure.cpp index f683a62e8e1..ac7ae8a4b36 100644 --- a/src/Storages/StorageAzure.cpp +++ b/src/Storages/StorageAzure.cpp @@ -60,6 +60,9 @@ namespace ErrorCodes extern const int DATABASE_ACCESS_DENIED; extern const int CANNOT_COMPILE_REGEXP; extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; + extern const int LOGICAL_ERROR; + extern const int NOT_IMPLEMENTED; + } namespace diff --git a/src/Storages/StorageAzure.h b/src/Storages/StorageAzure.h index 8341026b624..826582ccaa5 100644 --- a/src/Storages/StorageAzure.h +++ b/src/Storages/StorageAzure.h @@ -20,11 +20,6 @@ struct AzureSimpleAccountConfiguration std::string storage_account_url; }; -namespace ErrorCodes -{ - extern const int NOT_IMPLEMENTED; -} - using AzureConnectionString = std::string; using AzureCredentials = std::variant; diff --git a/src/Storages/registerStorages.cpp b/src/Storages/registerStorages.cpp index 03bd4dbb310..c66cf85a29b 100644 --- a/src/Storages/registerStorages.cpp +++ b/src/Storages/registerStorages.cpp @@ -94,14 +94,10 @@ void registerStorageFileLog(StorageFactory & factory); void registerStorageSQLite(StorageFactory & factory); #endif - - void registerStorageKeeperMap(StorageFactory & factory); #if USE_AZURE_BLOB_STORAGE - void registerStorageAzure(StorageFactory & factory); - #endif void registerStorages() diff --git a/src/TableFunctions/TableFunctionAzureBlobStorage.cpp b/src/TableFunctions/TableFunctionAzureBlobStorage.cpp index 07bdb0f8393..96131bd884a 100644 --- a/src/TableFunctions/TableFunctionAzureBlobStorage.cpp +++ b/src/TableFunctions/TableFunctionAzureBlobStorage.cpp @@ -33,7 +33,7 @@ namespace DB namespace ErrorCodes { extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; - extern const int LOGICAL_ERROR; + extern const int BAD_ARGUMENTS; } namespace From 42c054789561920adf7ce4770968ba303a70f244 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Tue, 6 Jun 2023 19:25:43 +0000 Subject: [PATCH 0578/1072] Remove clang-tidy exclude --- src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp index 1a28f28f746..ffed9e01df0 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp @@ -123,7 +123,6 @@ MergeTreeIndexAggregatorAnnoy::MergeTreeIndexAggregatorAnnoy( template MergeTreeIndexGranulePtr MergeTreeIndexAggregatorAnnoy::getGranuleAndReset() { - // NOLINTNEXTLINE(*) index->build(static_cast(trees), /*number_of_threads=*/1); auto granule = std::make_shared>(index_name, index_sample_block, index); index = nullptr; From 99f0be8ef507b197f12f285548d94bbf1f4dc3c2 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Tue, 6 Jun 2023 21:58:54 +0200 Subject: [PATCH 0579/1072] Refactored to StorageAzureBlob --- ...{StorageAzure.cpp => StorageAzureBlob.cpp} | 124 +++++++++--------- .../{StorageAzure.h => StorageAzureBlob.h} | 18 +-- .../TableFunctionAzureBlobStorage.cpp | 20 +-- .../TableFunctionAzureBlobStorage.h | 6 +- 4 files changed, 84 insertions(+), 84 deletions(-) rename src/Storages/{StorageAzure.cpp => StorageAzureBlob.cpp} (90%) rename src/Storages/{StorageAzure.h => StorageAzureBlob.h} (93%) diff --git a/src/Storages/StorageAzure.cpp b/src/Storages/StorageAzureBlob.cpp similarity index 90% rename from src/Storages/StorageAzure.cpp rename to src/Storages/StorageAzureBlob.cpp index ac7ae8a4b36..17374ba2d92 100644 --- a/src/Storages/StorageAzure.cpp +++ b/src/Storages/StorageAzureBlob.cpp @@ -1,4 +1,4 @@ -#include +#include #if USE_AZURE_BLOB_STORAGE @@ -91,7 +91,7 @@ bool isConnectionString(const std::string & candidate) } -void StorageAzure::processNamedCollectionResult(StorageAzure::Configuration & configuration, const NamedCollection & collection) +void StorageAzureBlob::processNamedCollectionResult(StorageAzureBlob::Configuration & configuration, const NamedCollection & collection) { validateNamedCollection(collection, required_configuration_keys, optional_configuration_keys); @@ -122,15 +122,15 @@ void StorageAzure::processNamedCollectionResult(StorageAzure::Configuration & co } -StorageAzure::Configuration StorageAzure::getConfiguration(ASTs & engine_args, ContextPtr local_context, bool get_format_from_file) +StorageAzureBlob::Configuration StorageAzureBlob::getConfiguration(ASTs & engine_args, ContextPtr local_context, bool get_format_from_file) { - LOG_INFO(&Poco::Logger::get("StorageAzure"), "get_format_from_file = {}", get_format_from_file); + LOG_INFO(&Poco::Logger::get("StorageAzureBlob"), "get_format_from_file = {}", get_format_from_file); - StorageAzure::Configuration configuration; + StorageAzureBlob::Configuration configuration; /// Supported signatures: /// - /// Azure(connection_string|storage_account_url, container_name, blobpath, [account_name, account_key, format, compression]) + /// AzureBlobStorage(connection_string|storage_account_url, container_name, blobpath, [account_name, account_key, format, compression]) /// if (auto named_collection = tryGetNamedCollectionWithOverrides(engine_args, local_context)) @@ -147,8 +147,8 @@ StorageAzure::Configuration StorageAzure::getConfiguration(ASTs & engine_args, C if (engine_args.size() < 3 || engine_args.size() > 7) throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, - "Storage Azure requires 3 to 7 arguments: " - "Azure(connection_string|storage_account_url, container_name, blobpath, [account_name, account_key, format, compression])"); + "Storage AzureBlobStorage requires 3 to 7 arguments: " + "AzureBlobStorage(connection_string|storage_account_url, container_name, blobpath, [account_name, account_key, format, compression])"); for (auto & engine_arg : engine_args) engine_arg = evaluateConstantExpressionOrIdentifierAsLiteral(engine_arg, local_context); @@ -239,7 +239,7 @@ StorageAzure::Configuration StorageAzure::getConfiguration(ASTs & engine_args, C } -AzureObjectStorage::SettingsPtr StorageAzure::createSettings(ContextPtr local_context) +AzureObjectStorage::SettingsPtr StorageAzureBlob::createSettings(ContextPtr local_context) { const auto & context_settings = local_context->getSettingsRef(); auto settings_ptr = std::make_unique(); @@ -250,7 +250,7 @@ AzureObjectStorage::SettingsPtr StorageAzure::createSettings(ContextPtr local_co return settings_ptr; } -void registerStorageAzure(StorageFactory & factory) +void registerStorageAzureBlob(StorageFactory & factory) { factory.registerStorage("AzureBlobStorage", [](const StorageFactory::Arguments & args) { @@ -258,8 +258,8 @@ void registerStorageAzure(StorageFactory & factory) if (engine_args.empty()) throw Exception(ErrorCodes::BAD_ARGUMENTS, "External data source must have arguments"); - auto configuration = StorageAzure::getConfiguration(engine_args, args.getLocalContext()); - auto client = StorageAzure::createClient(configuration); + auto configuration = StorageAzureBlob::getConfiguration(engine_args, args.getLocalContext()); + auto client = StorageAzureBlob::createClient(configuration); // Use format settings from global server context + settings from // the SETTINGS clause of the create query. Settings from current // session and user are ignored. @@ -290,11 +290,11 @@ void registerStorageAzure(StorageFactory & factory) if (args.storage_def->partition_by) partition_by = args.storage_def->partition_by->clone(); - auto settings = StorageAzure::createSettings(args.getContext()); + auto settings = StorageAzureBlob::createSettings(args.getContext()); - return std::make_shared( + return std::make_shared( std::move(configuration), - std::make_unique("AzureStorage", std::move(client), std::move(settings)), + std::make_unique("AzureBlobStorage", std::move(client), std::move(settings)), args.getContext(), args.table_id, args.columns, @@ -311,7 +311,7 @@ void registerStorageAzure(StorageFactory & factory) }); } -AzureClientPtr StorageAzure::createClient(StorageAzure::Configuration configuration) +AzureClientPtr StorageAzureBlob::createClient(StorageAzureBlob::Configuration configuration) { AzureClientPtr result; @@ -375,7 +375,7 @@ AzureClientPtr StorageAzure::createClient(StorageAzure::Configuration configurat return result; } -Poco::URI StorageAzure::Configuration::getConnectionURL() const +Poco::URI StorageAzureBlob::Configuration::getConnectionURL() const { if (!is_connection_string) return Poco::URI(connection_url); @@ -385,7 +385,7 @@ Poco::URI StorageAzure::Configuration::getConnectionURL() const } -StorageAzure::StorageAzure( +StorageAzureBlob::StorageAzureBlob( const Configuration & configuration_, std::unique_ptr && object_storage_, ContextPtr context, @@ -434,7 +434,7 @@ StorageAzure::StorageAzure( virtual_block.insert({column.type->createColumn(), column.type, column.name}); } -void StorageAzure::truncate(const ASTPtr &, const StorageMetadataPtr &, ContextPtr, TableExclusiveLockHolder &) +void StorageAzureBlob::truncate(const ASTPtr &, const StorageMetadataPtr &, ContextPtr, TableExclusiveLockHolder &) { if (configuration.withGlobs()) { @@ -454,10 +454,10 @@ void StorageAzure::truncate(const ASTPtr &, const StorageMetadataPtr &, ContextP namespace { -class StorageAzureSink : public SinkToStorage +class StorageAzureBlobSink : public SinkToStorage { public: - StorageAzureSink( + StorageAzureBlobSink( const String & format, const Block & sample_block_, ContextPtr context, @@ -474,7 +474,7 @@ public: writer = FormatFactory::instance().getOutputFormatParallelIfPossible(format, *write_buf, sample_block, context, format_settings); } - String getName() const override { return "StorageS3Sink"; } + String getName() const override { return "StorageAzureBlobSink"; } void consume(Chunk chunk) override { @@ -532,10 +532,10 @@ private: std::mutex cancel_mutex; }; -class PartitionedStorageAzureSink : public PartitionedSink +class PartitionedStorageAzureBlobSink : public PartitionedSink { public: - PartitionedStorageAzureSink( + PartitionedStorageAzureBlobSink( const ASTPtr & partition_by, const String & format_, const Block & sample_block_, @@ -560,7 +560,7 @@ public: auto partition_key = replaceWildcards(blob, partition_id); validateKey(partition_key); - return std::make_shared( + return std::make_shared( format, sample_block, context, @@ -590,7 +590,7 @@ private: } -Pipe StorageAzure::read( +Pipe StorageAzureBlob::read( const Names & column_names, const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, @@ -613,17 +613,17 @@ Pipe StorageAzure::read( requested_virtual_columns.push_back(virtual_column); } - std::shared_ptr iterator_wrapper; + std::shared_ptr iterator_wrapper; if (configuration.withGlobs()) { /// Iterate through disclosed globs and make a source for each file - iterator_wrapper = std::make_shared( + iterator_wrapper = std::make_shared( object_storage.get(), configuration.container, std::nullopt, configuration.blob_path, query_info.query, virtual_block, local_context, nullptr); } else { - iterator_wrapper = std::make_shared( + iterator_wrapper = std::make_shared( object_storage.get(), configuration.container, configuration.blobs_paths, std::nullopt, query_info.query, virtual_block, local_context, nullptr); } @@ -653,7 +653,7 @@ Pipe StorageAzure::read( for (size_t i = 0; i < num_streams; ++i) { - pipes.emplace_back(std::make_shared( + pipes.emplace_back(std::make_shared( requested_virtual_columns, configuration.format, getName(), @@ -671,7 +671,7 @@ Pipe StorageAzure::read( return Pipe::unitePipes(std::move(pipes)); } -SinkToStoragePtr StorageAzure::write(const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, ContextPtr local_context) +SinkToStoragePtr StorageAzureBlob::write(const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, ContextPtr local_context) { auto sample_block = metadata_snapshot->getSampleBlock(); auto chosen_compression_method = chooseCompressionMethod(configuration.blobs_paths.back(), configuration.compression_method); @@ -682,7 +682,7 @@ SinkToStoragePtr StorageAzure::write(const ASTPtr & query, const StorageMetadata if (is_partitioned_implementation) { - return std::make_shared( + return std::make_shared( partition_by_ast, configuration.format, sample_block, @@ -696,7 +696,7 @@ SinkToStoragePtr StorageAzure::write(const ASTPtr & query, const StorageMetadata { if (configuration.withGlobs()) throw Exception(ErrorCodes::DATABASE_ACCESS_DENIED, - "Azure key '{}' contains globs, so the table is in readonly mode", configuration.blob_path); + "AzureBlobStorage key '{}' contains globs, so the table is in readonly mode", configuration.blob_path); bool truncate_in_insert = local_context->getSettingsRef().azure_truncate_on_insert; @@ -730,7 +730,7 @@ SinkToStoragePtr StorageAzure::write(const ASTPtr & query, const StorageMetadata } } - return std::make_shared( + return std::make_shared( configuration.format, sample_block, local_context, @@ -741,32 +741,32 @@ SinkToStoragePtr StorageAzure::write(const ASTPtr & query, const StorageMetadata } } -NamesAndTypesList StorageAzure::getVirtuals() const +NamesAndTypesList StorageAzureBlob::getVirtuals() const { return virtual_columns; } -bool StorageAzure::supportsPartitionBy() const +bool StorageAzureBlob::supportsPartitionBy() const { return true; } -bool StorageAzure::supportsSubcolumns() const +bool StorageAzureBlob::supportsSubcolumns() const { return FormatFactory::instance().checkIfFormatSupportsSubcolumns(configuration.format); } -bool StorageAzure::supportsSubsetOfColumns() const +bool StorageAzureBlob::supportsSubsetOfColumns() const { return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(configuration.format); } -bool StorageAzure::prefersLargeBlocks() const +bool StorageAzureBlob::prefersLargeBlocks() const { return FormatFactory::instance().checkIfOutputFormatPrefersLargeBlocks(configuration.format); } -bool StorageAzure::parallelizeOutputAfterReading(ContextPtr context) const +bool StorageAzureBlob::parallelizeOutputAfterReading(ContextPtr context) const { return FormatFactory::instance().checkParallelizeOutputAfterReading(configuration.format, context); } @@ -788,7 +788,7 @@ static void addPathToVirtualColumns(Block & block, const String & path, size_t i block.getByName("_idx").column->assumeMutableRef().insert(idx); } -StorageAzureSource::Iterator::Iterator( +StorageAzureBlobSource::Iterator::Iterator( AzureObjectStorage * object_storage_, const std::string & container_, std::optional keys_, @@ -886,7 +886,7 @@ StorageAzureSource::Iterator::Iterator( } -RelativePathWithMetadata StorageAzureSource::Iterator::next() +RelativePathWithMetadata StorageAzureBlobSource::Iterator::next() { if (is_finished) return {}; @@ -971,13 +971,13 @@ RelativePathWithMetadata StorageAzureSource::Iterator::next() } } -size_t StorageAzureSource::Iterator::getTotalSize() const +size_t StorageAzureBlobSource::Iterator::getTotalSize() const { return total_size.load(std::memory_order_relaxed); } -void StorageAzureSource::Iterator::createFilterAST(const String & any_key) +void StorageAzureBlobSource::Iterator::createFilterAST(const String & any_key) { if (!query || !virtual_header) return; @@ -992,7 +992,7 @@ void StorageAzureSource::Iterator::createFilterAST(const String & any_key) } -Chunk StorageAzureSource::generate() +Chunk StorageAzureBlobSource::generate() { while (true) { @@ -1049,7 +1049,7 @@ Chunk StorageAzureSource::generate() return {}; } -Block StorageAzureSource::getHeader(Block sample_block, const std::vector & requested_virtual_columns) +Block StorageAzureBlobSource::getHeader(Block sample_block, const std::vector & requested_virtual_columns) { for (const auto & virtual_column : requested_virtual_columns) sample_block.insert({virtual_column.type->createColumn(), virtual_column.type, virtual_column.name}); @@ -1057,7 +1057,7 @@ Block StorageAzureSource::getHeader(Block sample_block, const std::vector & requested_virtual_columns_, const String & format_, String name_, @@ -1092,17 +1092,17 @@ StorageAzureSource::StorageAzureSource( } -StorageAzureSource::~StorageAzureSource() +StorageAzureBlobSource::~StorageAzureBlobSource() { create_reader_pool.wait(); } -String StorageAzureSource::getName() const +String StorageAzureBlobSource::getName() const { return name; } -StorageAzureSource::ReaderHolder StorageAzureSource::createReader() +StorageAzureBlobSource::ReaderHolder StorageAzureBlobSource::createReader() { auto [current_key, info] = file_iterator->next(); LOG_DEBUG(log, "KEY {} SIZE {}", current_key, info.size_bytes); @@ -1135,12 +1135,12 @@ StorageAzureSource::ReaderHolder StorageAzureSource::createReader() return ReaderHolder{fs::path(container) / current_key, std::move(read_buf), std::move(pipeline), std::move(current_reader)}; } -std::future StorageAzureSource::createReaderAsync() +std::future StorageAzureBlobSource::createReaderAsync() { return create_reader_scheduler([this] { return createReader(); }, Priority{}); } -std::unique_ptr StorageAzureSource::createAzureReadBuffer(const String & key, size_t object_size) +std::unique_ptr StorageAzureBlobSource::createAzureReadBuffer(const String & key, size_t object_size) { auto read_settings = getContext()->getReadSettings().adjustBufferSize(object_size); read_settings.enable_filesystem_cache = false; @@ -1159,23 +1159,23 @@ std::unique_ptr StorageAzureSource::createAzureReadBuffer(const Stri return object_storage->readObject(StoredObject(key), read_settings, {}, object_size); } -ColumnsDescription StorageAzure::getTableStructureFromData( +ColumnsDescription StorageAzureBlob::getTableStructureFromData( AzureObjectStorage * object_storage, const Configuration & configuration, const std::optional & format_settings, ContextPtr ctx) { RelativePathsWithMetadata read_keys; - std::shared_ptr file_iterator; + std::shared_ptr file_iterator; if (configuration.withGlobs()) { - file_iterator = std::make_shared( + file_iterator = std::make_shared( object_storage, configuration.container, std::nullopt, configuration.blob_path, nullptr, Block{}, ctx, &read_keys); } else { - file_iterator = std::make_shared( + file_iterator = std::make_shared( object_storage, configuration.container, configuration.blobs_paths, std::nullopt, nullptr, Block{}, ctx, &read_keys); } @@ -1233,10 +1233,10 @@ ColumnsDescription StorageAzure::getTableStructureFromData( } -std::optional StorageAzure::tryGetColumnsFromCache( +std::optional StorageAzureBlob::tryGetColumnsFromCache( const RelativePathsWithMetadata::const_iterator & begin, const RelativePathsWithMetadata::const_iterator & end, - const StorageAzure::Configuration & configuration, + const StorageAzureBlob::Configuration & configuration, const std::optional & format_settings, const ContextPtr & ctx) { @@ -1260,10 +1260,10 @@ std::optional StorageAzure::tryGetColumnsFromCache( } -void StorageAzure::addColumnsToCache( +void StorageAzureBlob::addColumnsToCache( const RelativePathsWithMetadata & keys, const ColumnsDescription & columns, - const StorageAzure::Configuration & configuration, + const StorageAzureBlob::Configuration & configuration, const std::optional & format_settings, const String & format_name, const ContextPtr & ctx) @@ -1277,14 +1277,14 @@ void StorageAzure::addColumnsToCache( schema_cache.addMany(cache_keys, columns); } -SchemaCache & StorageAzure::getSchemaCache(const ContextPtr & ctx) +SchemaCache & StorageAzureBlob::getSchemaCache(const ContextPtr & ctx) { static SchemaCache schema_cache(ctx->getConfigRef().getUInt("schema_inference_cache_max_elements_for_azure", DEFAULT_SCHEMA_CACHE_ELEMENTS)); return schema_cache; } -std::unique_ptr StorageAzureSource::createAsyncAzureReadBuffer( +std::unique_ptr StorageAzureBlobSource::createAsyncAzureReadBuffer( const String & key, const ReadSettings & read_settings, size_t object_size) { auto modified_settings{read_settings}; diff --git a/src/Storages/StorageAzure.h b/src/Storages/StorageAzureBlob.h similarity index 93% rename from src/Storages/StorageAzure.h rename to src/Storages/StorageAzureBlob.h index 826582ccaa5..6e4dfaf19eb 100644 --- a/src/Storages/StorageAzure.h +++ b/src/Storages/StorageAzureBlob.h @@ -24,7 +24,7 @@ using AzureConnectionString = std::string; using AzureCredentials = std::variant; -class StorageAzure : public IStorage +class StorageAzureBlob : public IStorage { public: @@ -62,7 +62,7 @@ public: std::vector blobs_paths; }; - StorageAzure( + StorageAzureBlob( const Configuration & configuration_, std::unique_ptr && object_storage_, ContextPtr context_, @@ -73,12 +73,12 @@ public: std::optional format_settings_, ASTPtr partition_by_); - static StorageAzure::Configuration getConfiguration(ASTs & engine_args, ContextPtr local_context, bool get_format_from_file = true); - static AzureClientPtr createClient(StorageAzure::Configuration configuration); + static StorageAzureBlob::Configuration getConfiguration(ASTs & engine_args, ContextPtr local_context, bool get_format_from_file = true); + static AzureClientPtr createClient(StorageAzureBlob::Configuration configuration); static AzureObjectStorage::SettingsPtr createSettings(ContextPtr local_context); - static void processNamedCollectionResult(StorageAzure::Configuration & configuration, const NamedCollection & collection); + static void processNamedCollectionResult(StorageAzureBlob::Configuration & configuration, const NamedCollection & collection); String getName() const override { @@ -133,7 +133,7 @@ private: static std::optional tryGetColumnsFromCache( const RelativePathsWithMetadata::const_iterator & begin, const RelativePathsWithMetadata::const_iterator & end, - const StorageAzure::Configuration & configuration, + const StorageAzureBlob::Configuration & configuration, const std::optional & format_settings, const ContextPtr & ctx); @@ -148,7 +148,7 @@ private: }; -class StorageAzureSource : public ISource, WithContext +class StorageAzureBlobSource : public ISource, WithContext { public: class Iterator : WithContext @@ -192,7 +192,7 @@ public: bool is_initialized = false; }; - StorageAzureSource( + StorageAzureBlobSource( const std::vector & requested_virtual_columns_, const String & format_, String name_, @@ -206,7 +206,7 @@ public: const String & container_, std::shared_ptr file_iterator_); - ~StorageAzureSource() override; + ~StorageAzureBlobSource() override; Chunk generate() override; diff --git a/src/TableFunctions/TableFunctionAzureBlobStorage.cpp b/src/TableFunctions/TableFunctionAzureBlobStorage.cpp index 96131bd884a..986ad07fdde 100644 --- a/src/TableFunctions/TableFunctionAzureBlobStorage.cpp +++ b/src/TableFunctions/TableFunctionAzureBlobStorage.cpp @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include #include @@ -46,9 +46,9 @@ bool isConnectionString(const std::string & candidate) } -StorageAzure::Configuration TableFunctionAzureBlobStorage::parseArgumentsImpl(ASTs & engine_args, const ContextPtr & local_context, bool get_format_from_file) +StorageAzureBlob::Configuration TableFunctionAzureBlobStorage::parseArgumentsImpl(ASTs & engine_args, const ContextPtr & local_context, bool get_format_from_file) { - StorageAzure::Configuration configuration; + StorageAzureBlob::Configuration configuration; /// Supported signatures: /// @@ -57,7 +57,7 @@ StorageAzure::Configuration TableFunctionAzureBlobStorage::parseArgumentsImpl(AS if (auto named_collection = tryGetNamedCollectionWithOverrides(engine_args, local_context)) { - StorageAzure::processNamedCollectionResult(configuration, *named_collection); + StorageAzureBlob::processNamedCollectionResult(configuration, *named_collection); configuration.blobs_paths = {configuration.blob_path}; @@ -202,11 +202,11 @@ ColumnsDescription TableFunctionAzureBlobStorage::getActualTableStructure(Contex if (configuration.structure == "auto") { context->checkAccess(getSourceAccessType()); - auto client = StorageAzure::createClient(configuration); - auto settings = StorageAzure::createSettings(context); + auto client = StorageAzureBlob::createClient(configuration); + auto settings = StorageAzureBlob::createSettings(context); auto object_storage = std::make_unique("AzureBlobStorageTableFunction", std::move(client), std::move(settings)); - return StorageAzure::getTableStructureFromData(object_storage.get(), configuration, std::nullopt, context); + return StorageAzureBlob::getTableStructureFromData(object_storage.get(), configuration, std::nullopt, context); } return parseColumnsListFromString(configuration.structure, context); @@ -219,8 +219,8 @@ bool TableFunctionAzureBlobStorage::supportsReadingSubsetOfColumns() StoragePtr TableFunctionAzureBlobStorage::executeImpl(const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/) const { - auto client = StorageAzure::createClient(configuration); - auto settings = StorageAzure::createSettings(context); + auto client = StorageAzureBlob::createClient(configuration); + auto settings = StorageAzureBlob::createSettings(context); ColumnsDescription columns; if (configuration.structure != "auto") @@ -228,7 +228,7 @@ StoragePtr TableFunctionAzureBlobStorage::executeImpl(const ASTPtr & /*ast_funct else if (!structure_hint.empty()) columns = structure_hint; - StoragePtr storage = std::make_shared( + StoragePtr storage = std::make_shared( configuration, std::make_unique(table_name, std::move(client), std::move(settings)), context, diff --git a/src/TableFunctions/TableFunctionAzureBlobStorage.h b/src/TableFunctions/TableFunctionAzureBlobStorage.h index 6f3a3422958..0bb872de3f3 100644 --- a/src/TableFunctions/TableFunctionAzureBlobStorage.h +++ b/src/TableFunctions/TableFunctionAzureBlobStorage.h @@ -5,7 +5,7 @@ #if USE_AZURE_BLOB_STORAGE #include -#include +#include namespace DB @@ -46,7 +46,7 @@ public: return {"_path", "_file"}; } - static StorageAzure::Configuration parseArgumentsImpl(ASTs & args, const ContextPtr & context, bool get_format_from_file = true); + static StorageAzureBlob::Configuration parseArgumentsImpl(ASTs & args, const ContextPtr & context, bool get_format_from_file = true); protected: @@ -61,7 +61,7 @@ protected: ColumnsDescription getActualTableStructure(ContextPtr context) const override; void parseArguments(const ASTPtr & ast_function, ContextPtr context) override; - mutable StorageAzure::Configuration configuration; + mutable StorageAzureBlob::Configuration configuration; ColumnsDescription structure_hint; }; From 6ab2a50c393696cf0444fda405ffce6453ecbd09 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 6 Jun 2023 22:48:53 +0200 Subject: [PATCH 0580/1072] Fix two tests and build --- src/Storages/registerStorages.cpp | 4 ++-- tests/queries/0_stateless/01271_show_privileges.reference | 1 + .../0_stateless/02117_show_create_table_system.reference | 6 +++--- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/Storages/registerStorages.cpp b/src/Storages/registerStorages.cpp index c66cf85a29b..5606e6728d4 100644 --- a/src/Storages/registerStorages.cpp +++ b/src/Storages/registerStorages.cpp @@ -97,7 +97,7 @@ void registerStorageSQLite(StorageFactory & factory); void registerStorageKeeperMap(StorageFactory & factory); #if USE_AZURE_BLOB_STORAGE -void registerStorageAzure(StorageFactory & factory); +void registerStorageAzureBlob(StorageFactory & factory); #endif void registerStorages() @@ -197,7 +197,7 @@ void registerStorages() registerStorageKeeperMap(factory); #if USE_AZURE_BLOB_STORAGE - registerStorageAzure(factory); + registerStorageAzureBlob(factory); #endif } diff --git a/tests/queries/0_stateless/01271_show_privileges.reference b/tests/queries/0_stateless/01271_show_privileges.reference index ec245d8b9e0..5d30da5d2ea 100644 --- a/tests/queries/0_stateless/01271_show_privileges.reference +++ b/tests/queries/0_stateless/01271_show_privileges.reference @@ -158,6 +158,7 @@ JDBC [] GLOBAL SOURCES HDFS [] GLOBAL SOURCES S3 [] GLOBAL SOURCES HIVE [] GLOBAL SOURCES +AZURE [] GLOBAL SOURCES SOURCES [] \N ALL CLUSTER [] GLOBAL ALL ALL ['ALL PRIVILEGES'] \N \N diff --git a/tests/queries/0_stateless/02117_show_create_table_system.reference b/tests/queries/0_stateless/02117_show_create_table_system.reference index 09cc62dac00..e864ba85018 100644 --- a/tests/queries/0_stateless/02117_show_create_table_system.reference +++ b/tests/queries/0_stateless/02117_show_create_table_system.reference @@ -297,7 +297,7 @@ CREATE TABLE system.grants ( `user_name` Nullable(String), `role_name` Nullable(String), - `access_type` Enum16('SHOW DATABASES' = 0, 'SHOW TABLES' = 1, 'SHOW COLUMNS' = 2, 'SHOW DICTIONARIES' = 3, 'SHOW' = 4, 'SHOW FILESYSTEM CACHES' = 5, 'SELECT' = 6, 'INSERT' = 7, 'ALTER UPDATE' = 8, 'ALTER DELETE' = 9, 'ALTER ADD COLUMN' = 10, 'ALTER MODIFY COLUMN' = 11, 'ALTER DROP COLUMN' = 12, 'ALTER COMMENT COLUMN' = 13, 'ALTER CLEAR COLUMN' = 14, 'ALTER RENAME COLUMN' = 15, 'ALTER MATERIALIZE COLUMN' = 16, 'ALTER COLUMN' = 17, 'ALTER MODIFY COMMENT' = 18, 'ALTER ORDER BY' = 19, 'ALTER SAMPLE BY' = 20, 'ALTER ADD INDEX' = 21, 'ALTER DROP INDEX' = 22, 'ALTER MATERIALIZE INDEX' = 23, 'ALTER CLEAR INDEX' = 24, 'ALTER INDEX' = 25, 'ALTER ADD PROJECTION' = 26, 'ALTER DROP PROJECTION' = 27, 'ALTER MATERIALIZE PROJECTION' = 28, 'ALTER CLEAR PROJECTION' = 29, 'ALTER PROJECTION' = 30, 'ALTER ADD CONSTRAINT' = 31, 'ALTER DROP CONSTRAINT' = 32, 'ALTER CONSTRAINT' = 33, 'ALTER TTL' = 34, 'ALTER MATERIALIZE TTL' = 35, 'ALTER SETTINGS' = 36, 'ALTER MOVE PARTITION' = 37, 'ALTER FETCH PARTITION' = 38, 'ALTER FREEZE PARTITION' = 39, 'ALTER DATABASE SETTINGS' = 40, 'ALTER NAMED COLLECTION' = 41, 'ALTER TABLE' = 42, 'ALTER DATABASE' = 43, 'ALTER VIEW REFRESH' = 44, 'ALTER VIEW MODIFY QUERY' = 45, 'ALTER VIEW' = 46, 'ALTER' = 47, 'CREATE DATABASE' = 48, 'CREATE TABLE' = 49, 'CREATE VIEW' = 50, 'CREATE DICTIONARY' = 51, 'CREATE TEMPORARY TABLE' = 52, 'CREATE ARBITRARY TEMPORARY TABLE' = 53, 'CREATE FUNCTION' = 54, 'CREATE NAMED COLLECTION' = 55, 'CREATE' = 56, 'DROP DATABASE' = 57, 'DROP TABLE' = 58, 'DROP VIEW' = 59, 'DROP DICTIONARY' = 60, 'DROP FUNCTION' = 61, 'DROP NAMED COLLECTION' = 62, 'DROP' = 63, 'UNDROP TABLE' = 64, 'TRUNCATE' = 65, 'OPTIMIZE' = 66, 'BACKUP' = 67, 'KILL QUERY' = 68, 'KILL TRANSACTION' = 69, 'MOVE PARTITION BETWEEN SHARDS' = 70, 'CREATE USER' = 71, 'ALTER USER' = 72, 'DROP USER' = 73, 'CREATE ROLE' = 74, 'ALTER ROLE' = 75, 'DROP ROLE' = 76, 'ROLE ADMIN' = 77, 'CREATE ROW POLICY' = 78, 'ALTER ROW POLICY' = 79, 'DROP ROW POLICY' = 80, 'CREATE QUOTA' = 81, 'ALTER QUOTA' = 82, 'DROP QUOTA' = 83, 'CREATE SETTINGS PROFILE' = 84, 'ALTER SETTINGS PROFILE' = 85, 'DROP SETTINGS PROFILE' = 86, 'SHOW USERS' = 87, 'SHOW ROLES' = 88, 'SHOW ROW POLICIES' = 89, 'SHOW QUOTAS' = 90, 'SHOW SETTINGS PROFILES' = 91, 'SHOW ACCESS' = 92, 'ACCESS MANAGEMENT' = 93, 'SHOW NAMED COLLECTIONS' = 94, 'SHOW NAMED COLLECTIONS SECRETS' = 95, 'NAMED COLLECTION CONTROL' = 96, 'SYSTEM SHUTDOWN' = 97, 'SYSTEM DROP DNS CACHE' = 98, 'SYSTEM DROP MARK CACHE' = 99, 'SYSTEM DROP UNCOMPRESSED CACHE' = 100, 'SYSTEM DROP MMAP CACHE' = 101, 'SYSTEM DROP QUERY CACHE' = 102, 'SYSTEM DROP COMPILED EXPRESSION CACHE' = 103, 'SYSTEM DROP FILESYSTEM CACHE' = 104, 'SYSTEM DROP SCHEMA CACHE' = 105, 'SYSTEM DROP S3 CLIENT CACHE' = 106, 'SYSTEM DROP CACHE' = 107, 'SYSTEM RELOAD CONFIG' = 108, 'SYSTEM RELOAD USERS' = 109, 'SYSTEM RELOAD SYMBOLS' = 110, 'SYSTEM RELOAD DICTIONARY' = 111, 'SYSTEM RELOAD MODEL' = 112, 'SYSTEM RELOAD FUNCTION' = 113, 'SYSTEM RELOAD EMBEDDED DICTIONARIES' = 114, 'SYSTEM RELOAD' = 115, 'SYSTEM RESTART DISK' = 116, 'SYSTEM MERGES' = 117, 'SYSTEM TTL MERGES' = 118, 'SYSTEM FETCHES' = 119, 'SYSTEM MOVES' = 120, 'SYSTEM DISTRIBUTED SENDS' = 121, 'SYSTEM REPLICATED SENDS' = 122, 'SYSTEM SENDS' = 123, 'SYSTEM REPLICATION QUEUES' = 124, 'SYSTEM DROP REPLICA' = 125, 'SYSTEM SYNC REPLICA' = 126, 'SYSTEM RESTART REPLICA' = 127, 'SYSTEM RESTORE REPLICA' = 128, 'SYSTEM WAIT LOADING PARTS' = 129, 'SYSTEM SYNC DATABASE REPLICA' = 130, 'SYSTEM SYNC TRANSACTION LOG' = 131, 'SYSTEM SYNC FILE CACHE' = 132, 'SYSTEM FLUSH DISTRIBUTED' = 133, 'SYSTEM FLUSH LOGS' = 134, 'SYSTEM FLUSH' = 135, 'SYSTEM THREAD FUZZER' = 136, 'SYSTEM UNFREEZE' = 137, 'SYSTEM FAILPOINT' = 138, 'SYSTEM' = 139, 'dictGet' = 140, 'displaySecretsInShowAndSelect' = 141, 'addressToLine' = 142, 'addressToLineWithInlines' = 143, 'addressToSymbol' = 144, 'demangle' = 145, 'INTROSPECTION' = 146, 'FILE' = 147, 'URL' = 148, 'REMOTE' = 149, 'MONGO' = 150, 'MEILISEARCH' = 151, 'MYSQL' = 152, 'POSTGRES' = 153, 'SQLITE' = 154, 'ODBC' = 155, 'JDBC' = 156, 'HDFS' = 157, 'S3' = 158, 'HIVE' = 159, 'SOURCES' = 160, 'CLUSTER' = 161, 'ALL' = 162, 'NONE' = 163), + `access_type` Enum16('SHOW DATABASES' = 0, 'SHOW TABLES' = 1, 'SHOW COLUMNS' = 2, 'SHOW DICTIONARIES' = 3, 'SHOW' = 4, 'SHOW FILESYSTEM CACHES' = 5, 'SELECT' = 6, 'INSERT' = 7, 'ALTER UPDATE' = 8, 'ALTER DELETE' = 9, 'ALTER ADD COLUMN' = 10, 'ALTER MODIFY COLUMN' = 11, 'ALTER DROP COLUMN' = 12, 'ALTER COMMENT COLUMN' = 13, 'ALTER CLEAR COLUMN' = 14, 'ALTER RENAME COLUMN' = 15, 'ALTER MATERIALIZE COLUMN' = 16, 'ALTER COLUMN' = 17, 'ALTER MODIFY COMMENT' = 18, 'ALTER ORDER BY' = 19, 'ALTER SAMPLE BY' = 20, 'ALTER ADD INDEX' = 21, 'ALTER DROP INDEX' = 22, 'ALTER MATERIALIZE INDEX' = 23, 'ALTER CLEAR INDEX' = 24, 'ALTER INDEX' = 25, 'ALTER ADD PROJECTION' = 26, 'ALTER DROP PROJECTION' = 27, 'ALTER MATERIALIZE PROJECTION' = 28, 'ALTER CLEAR PROJECTION' = 29, 'ALTER PROJECTION' = 30, 'ALTER ADD CONSTRAINT' = 31, 'ALTER DROP CONSTRAINT' = 32, 'ALTER CONSTRAINT' = 33, 'ALTER TTL' = 34, 'ALTER MATERIALIZE TTL' = 35, 'ALTER SETTINGS' = 36, 'ALTER MOVE PARTITION' = 37, 'ALTER FETCH PARTITION' = 38, 'ALTER FREEZE PARTITION' = 39, 'ALTER DATABASE SETTINGS' = 40, 'ALTER NAMED COLLECTION' = 41, 'ALTER TABLE' = 42, 'ALTER DATABASE' = 43, 'ALTER VIEW REFRESH' = 44, 'ALTER VIEW MODIFY QUERY' = 45, 'ALTER VIEW' = 46, 'ALTER' = 47, 'CREATE DATABASE' = 48, 'CREATE TABLE' = 49, 'CREATE VIEW' = 50, 'CREATE DICTIONARY' = 51, 'CREATE TEMPORARY TABLE' = 52, 'CREATE ARBITRARY TEMPORARY TABLE' = 53, 'CREATE FUNCTION' = 54, 'CREATE NAMED COLLECTION' = 55, 'CREATE' = 56, 'DROP DATABASE' = 57, 'DROP TABLE' = 58, 'DROP VIEW' = 59, 'DROP DICTIONARY' = 60, 'DROP FUNCTION' = 61, 'DROP NAMED COLLECTION' = 62, 'DROP' = 63, 'UNDROP TABLE' = 64, 'TRUNCATE' = 65, 'OPTIMIZE' = 66, 'BACKUP' = 67, 'KILL QUERY' = 68, 'KILL TRANSACTION' = 69, 'MOVE PARTITION BETWEEN SHARDS' = 70, 'CREATE USER' = 71, 'ALTER USER' = 72, 'DROP USER' = 73, 'CREATE ROLE' = 74, 'ALTER ROLE' = 75, 'DROP ROLE' = 76, 'ROLE ADMIN' = 77, 'CREATE ROW POLICY' = 78, 'ALTER ROW POLICY' = 79, 'DROP ROW POLICY' = 80, 'CREATE QUOTA' = 81, 'ALTER QUOTA' = 82, 'DROP QUOTA' = 83, 'CREATE SETTINGS PROFILE' = 84, 'ALTER SETTINGS PROFILE' = 85, 'DROP SETTINGS PROFILE' = 86, 'SHOW USERS' = 87, 'SHOW ROLES' = 88, 'SHOW ROW POLICIES' = 89, 'SHOW QUOTAS' = 90, 'SHOW SETTINGS PROFILES' = 91, 'SHOW ACCESS' = 92, 'ACCESS MANAGEMENT' = 93, 'SHOW NAMED COLLECTIONS' = 94, 'SHOW NAMED COLLECTIONS SECRETS' = 95, 'NAMED COLLECTION CONTROL' = 96, 'SYSTEM SHUTDOWN' = 97, 'SYSTEM DROP DNS CACHE' = 98, 'SYSTEM DROP MARK CACHE' = 99, 'SYSTEM DROP UNCOMPRESSED CACHE' = 100, 'SYSTEM DROP MMAP CACHE' = 101, 'SYSTEM DROP QUERY CACHE' = 102, 'SYSTEM DROP COMPILED EXPRESSION CACHE' = 103, 'SYSTEM DROP FILESYSTEM CACHE' = 104, 'SYSTEM DROP SCHEMA CACHE' = 105, 'SYSTEM DROP S3 CLIENT CACHE' = 106, 'SYSTEM DROP CACHE' = 107, 'SYSTEM RELOAD CONFIG' = 108, 'SYSTEM RELOAD USERS' = 109, 'SYSTEM RELOAD SYMBOLS' = 110, 'SYSTEM RELOAD DICTIONARY' = 111, 'SYSTEM RELOAD MODEL' = 112, 'SYSTEM RELOAD FUNCTION' = 113, 'SYSTEM RELOAD EMBEDDED DICTIONARIES' = 114, 'SYSTEM RELOAD' = 115, 'SYSTEM RESTART DISK' = 116, 'SYSTEM MERGES' = 117, 'SYSTEM TTL MERGES' = 118, 'SYSTEM FETCHES' = 119, 'SYSTEM MOVES' = 120, 'SYSTEM DISTRIBUTED SENDS' = 121, 'SYSTEM REPLICATED SENDS' = 122, 'SYSTEM SENDS' = 123, 'SYSTEM REPLICATION QUEUES' = 124, 'SYSTEM DROP REPLICA' = 125, 'SYSTEM SYNC REPLICA' = 126, 'SYSTEM RESTART REPLICA' = 127, 'SYSTEM RESTORE REPLICA' = 128, 'SYSTEM WAIT LOADING PARTS' = 129, 'SYSTEM SYNC DATABASE REPLICA' = 130, 'SYSTEM SYNC TRANSACTION LOG' = 131, 'SYSTEM SYNC FILE CACHE' = 132, 'SYSTEM FLUSH DISTRIBUTED' = 133, 'SYSTEM FLUSH LOGS' = 134, 'SYSTEM FLUSH' = 135, 'SYSTEM THREAD FUZZER' = 136, 'SYSTEM UNFREEZE' = 137, 'SYSTEM FAILPOINT' = 138, 'SYSTEM' = 139, 'dictGet' = 140, 'displaySecretsInShowAndSelect' = 141, 'addressToLine' = 142, 'addressToLineWithInlines' = 143, 'addressToSymbol' = 144, 'demangle' = 145, 'INTROSPECTION' = 146, 'FILE' = 147, 'URL' = 148, 'REMOTE' = 149, 'MONGO' = 150, 'MEILISEARCH' = 151, 'MYSQL' = 152, 'POSTGRES' = 153, 'SQLITE' = 154, 'ODBC' = 155, 'JDBC' = 156, 'HDFS' = 157, 'S3' = 158, 'HIVE' = 159, 'AZURE' = 160, 'SOURCES' = 161, 'CLUSTER' = 162, 'ALL' = 163, 'NONE' = 164), `database` Nullable(String), `table` Nullable(String), `column` Nullable(String), @@ -581,10 +581,10 @@ ENGINE = SystemPartsColumns COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.privileges ( - `privilege` Enum16('SHOW DATABASES' = 0, 'SHOW TABLES' = 1, 'SHOW COLUMNS' = 2, 'SHOW DICTIONARIES' = 3, 'SHOW' = 4, 'SHOW FILESYSTEM CACHES' = 5, 'SELECT' = 6, 'INSERT' = 7, 'ALTER UPDATE' = 8, 'ALTER DELETE' = 9, 'ALTER ADD COLUMN' = 10, 'ALTER MODIFY COLUMN' = 11, 'ALTER DROP COLUMN' = 12, 'ALTER COMMENT COLUMN' = 13, 'ALTER CLEAR COLUMN' = 14, 'ALTER RENAME COLUMN' = 15, 'ALTER MATERIALIZE COLUMN' = 16, 'ALTER COLUMN' = 17, 'ALTER MODIFY COMMENT' = 18, 'ALTER ORDER BY' = 19, 'ALTER SAMPLE BY' = 20, 'ALTER ADD INDEX' = 21, 'ALTER DROP INDEX' = 22, 'ALTER MATERIALIZE INDEX' = 23, 'ALTER CLEAR INDEX' = 24, 'ALTER INDEX' = 25, 'ALTER ADD PROJECTION' = 26, 'ALTER DROP PROJECTION' = 27, 'ALTER MATERIALIZE PROJECTION' = 28, 'ALTER CLEAR PROJECTION' = 29, 'ALTER PROJECTION' = 30, 'ALTER ADD CONSTRAINT' = 31, 'ALTER DROP CONSTRAINT' = 32, 'ALTER CONSTRAINT' = 33, 'ALTER TTL' = 34, 'ALTER MATERIALIZE TTL' = 35, 'ALTER SETTINGS' = 36, 'ALTER MOVE PARTITION' = 37, 'ALTER FETCH PARTITION' = 38, 'ALTER FREEZE PARTITION' = 39, 'ALTER DATABASE SETTINGS' = 40, 'ALTER NAMED COLLECTION' = 41, 'ALTER TABLE' = 42, 'ALTER DATABASE' = 43, 'ALTER VIEW REFRESH' = 44, 'ALTER VIEW MODIFY QUERY' = 45, 'ALTER VIEW' = 46, 'ALTER' = 47, 'CREATE DATABASE' = 48, 'CREATE TABLE' = 49, 'CREATE VIEW' = 50, 'CREATE DICTIONARY' = 51, 'CREATE TEMPORARY TABLE' = 52, 'CREATE ARBITRARY TEMPORARY TABLE' = 53, 'CREATE FUNCTION' = 54, 'CREATE NAMED COLLECTION' = 55, 'CREATE' = 56, 'DROP DATABASE' = 57, 'DROP TABLE' = 58, 'DROP VIEW' = 59, 'DROP DICTIONARY' = 60, 'DROP FUNCTION' = 61, 'DROP NAMED COLLECTION' = 62, 'DROP' = 63, 'UNDROP TABLE' = 64, 'TRUNCATE' = 65, 'OPTIMIZE' = 66, 'BACKUP' = 67, 'KILL QUERY' = 68, 'KILL TRANSACTION' = 69, 'MOVE PARTITION BETWEEN SHARDS' = 70, 'CREATE USER' = 71, 'ALTER USER' = 72, 'DROP USER' = 73, 'CREATE ROLE' = 74, 'ALTER ROLE' = 75, 'DROP ROLE' = 76, 'ROLE ADMIN' = 77, 'CREATE ROW POLICY' = 78, 'ALTER ROW POLICY' = 79, 'DROP ROW POLICY' = 80, 'CREATE QUOTA' = 81, 'ALTER QUOTA' = 82, 'DROP QUOTA' = 83, 'CREATE SETTINGS PROFILE' = 84, 'ALTER SETTINGS PROFILE' = 85, 'DROP SETTINGS PROFILE' = 86, 'SHOW USERS' = 87, 'SHOW ROLES' = 88, 'SHOW ROW POLICIES' = 89, 'SHOW QUOTAS' = 90, 'SHOW SETTINGS PROFILES' = 91, 'SHOW ACCESS' = 92, 'ACCESS MANAGEMENT' = 93, 'SHOW NAMED COLLECTIONS' = 94, 'SHOW NAMED COLLECTIONS SECRETS' = 95, 'NAMED COLLECTION CONTROL' = 96, 'SYSTEM SHUTDOWN' = 97, 'SYSTEM DROP DNS CACHE' = 98, 'SYSTEM DROP MARK CACHE' = 99, 'SYSTEM DROP UNCOMPRESSED CACHE' = 100, 'SYSTEM DROP MMAP CACHE' = 101, 'SYSTEM DROP QUERY CACHE' = 102, 'SYSTEM DROP COMPILED EXPRESSION CACHE' = 103, 'SYSTEM DROP FILESYSTEM CACHE' = 104, 'SYSTEM DROP SCHEMA CACHE' = 105, 'SYSTEM DROP S3 CLIENT CACHE' = 106, 'SYSTEM DROP CACHE' = 107, 'SYSTEM RELOAD CONFIG' = 108, 'SYSTEM RELOAD USERS' = 109, 'SYSTEM RELOAD SYMBOLS' = 110, 'SYSTEM RELOAD DICTIONARY' = 111, 'SYSTEM RELOAD MODEL' = 112, 'SYSTEM RELOAD FUNCTION' = 113, 'SYSTEM RELOAD EMBEDDED DICTIONARIES' = 114, 'SYSTEM RELOAD' = 115, 'SYSTEM RESTART DISK' = 116, 'SYSTEM MERGES' = 117, 'SYSTEM TTL MERGES' = 118, 'SYSTEM FETCHES' = 119, 'SYSTEM MOVES' = 120, 'SYSTEM DISTRIBUTED SENDS' = 121, 'SYSTEM REPLICATED SENDS' = 122, 'SYSTEM SENDS' = 123, 'SYSTEM REPLICATION QUEUES' = 124, 'SYSTEM DROP REPLICA' = 125, 'SYSTEM SYNC REPLICA' = 126, 'SYSTEM RESTART REPLICA' = 127, 'SYSTEM RESTORE REPLICA' = 128, 'SYSTEM WAIT LOADING PARTS' = 129, 'SYSTEM SYNC DATABASE REPLICA' = 130, 'SYSTEM SYNC TRANSACTION LOG' = 131, 'SYSTEM SYNC FILE CACHE' = 132, 'SYSTEM FLUSH DISTRIBUTED' = 133, 'SYSTEM FLUSH LOGS' = 134, 'SYSTEM FLUSH' = 135, 'SYSTEM THREAD FUZZER' = 136, 'SYSTEM UNFREEZE' = 137, 'SYSTEM FAILPOINT' = 138, 'SYSTEM' = 139, 'dictGet' = 140, 'displaySecretsInShowAndSelect' = 141, 'addressToLine' = 142, 'addressToLineWithInlines' = 143, 'addressToSymbol' = 144, 'demangle' = 145, 'INTROSPECTION' = 146, 'FILE' = 147, 'URL' = 148, 'REMOTE' = 149, 'MONGO' = 150, 'MEILISEARCH' = 151, 'MYSQL' = 152, 'POSTGRES' = 153, 'SQLITE' = 154, 'ODBC' = 155, 'JDBC' = 156, 'HDFS' = 157, 'S3' = 158, 'HIVE' = 159, 'SOURCES' = 160, 'CLUSTER' = 161, 'ALL' = 162, 'NONE' = 163), + `privilege` Enum16('SHOW DATABASES' = 0, 'SHOW TABLES' = 1, 'SHOW COLUMNS' = 2, 'SHOW DICTIONARIES' = 3, 'SHOW' = 4, 'SHOW FILESYSTEM CACHES' = 5, 'SELECT' = 6, 'INSERT' = 7, 'ALTER UPDATE' = 8, 'ALTER DELETE' = 9, 'ALTER ADD COLUMN' = 10, 'ALTER MODIFY COLUMN' = 11, 'ALTER DROP COLUMN' = 12, 'ALTER COMMENT COLUMN' = 13, 'ALTER CLEAR COLUMN' = 14, 'ALTER RENAME COLUMN' = 15, 'ALTER MATERIALIZE COLUMN' = 16, 'ALTER COLUMN' = 17, 'ALTER MODIFY COMMENT' = 18, 'ALTER ORDER BY' = 19, 'ALTER SAMPLE BY' = 20, 'ALTER ADD INDEX' = 21, 'ALTER DROP INDEX' = 22, 'ALTER MATERIALIZE INDEX' = 23, 'ALTER CLEAR INDEX' = 24, 'ALTER INDEX' = 25, 'ALTER ADD PROJECTION' = 26, 'ALTER DROP PROJECTION' = 27, 'ALTER MATERIALIZE PROJECTION' = 28, 'ALTER CLEAR PROJECTION' = 29, 'ALTER PROJECTION' = 30, 'ALTER ADD CONSTRAINT' = 31, 'ALTER DROP CONSTRAINT' = 32, 'ALTER CONSTRAINT' = 33, 'ALTER TTL' = 34, 'ALTER MATERIALIZE TTL' = 35, 'ALTER SETTINGS' = 36, 'ALTER MOVE PARTITION' = 37, 'ALTER FETCH PARTITION' = 38, 'ALTER FREEZE PARTITION' = 39, 'ALTER DATABASE SETTINGS' = 40, 'ALTER NAMED COLLECTION' = 41, 'ALTER TABLE' = 42, 'ALTER DATABASE' = 43, 'ALTER VIEW REFRESH' = 44, 'ALTER VIEW MODIFY QUERY' = 45, 'ALTER VIEW' = 46, 'ALTER' = 47, 'CREATE DATABASE' = 48, 'CREATE TABLE' = 49, 'CREATE VIEW' = 50, 'CREATE DICTIONARY' = 51, 'CREATE TEMPORARY TABLE' = 52, 'CREATE ARBITRARY TEMPORARY TABLE' = 53, 'CREATE FUNCTION' = 54, 'CREATE NAMED COLLECTION' = 55, 'CREATE' = 56, 'DROP DATABASE' = 57, 'DROP TABLE' = 58, 'DROP VIEW' = 59, 'DROP DICTIONARY' = 60, 'DROP FUNCTION' = 61, 'DROP NAMED COLLECTION' = 62, 'DROP' = 63, 'UNDROP TABLE' = 64, 'TRUNCATE' = 65, 'OPTIMIZE' = 66, 'BACKUP' = 67, 'KILL QUERY' = 68, 'KILL TRANSACTION' = 69, 'MOVE PARTITION BETWEEN SHARDS' = 70, 'CREATE USER' = 71, 'ALTER USER' = 72, 'DROP USER' = 73, 'CREATE ROLE' = 74, 'ALTER ROLE' = 75, 'DROP ROLE' = 76, 'ROLE ADMIN' = 77, 'CREATE ROW POLICY' = 78, 'ALTER ROW POLICY' = 79, 'DROP ROW POLICY' = 80, 'CREATE QUOTA' = 81, 'ALTER QUOTA' = 82, 'DROP QUOTA' = 83, 'CREATE SETTINGS PROFILE' = 84, 'ALTER SETTINGS PROFILE' = 85, 'DROP SETTINGS PROFILE' = 86, 'SHOW USERS' = 87, 'SHOW ROLES' = 88, 'SHOW ROW POLICIES' = 89, 'SHOW QUOTAS' = 90, 'SHOW SETTINGS PROFILES' = 91, 'SHOW ACCESS' = 92, 'ACCESS MANAGEMENT' = 93, 'SHOW NAMED COLLECTIONS' = 94, 'SHOW NAMED COLLECTIONS SECRETS' = 95, 'NAMED COLLECTION CONTROL' = 96, 'SYSTEM SHUTDOWN' = 97, 'SYSTEM DROP DNS CACHE' = 98, 'SYSTEM DROP MARK CACHE' = 99, 'SYSTEM DROP UNCOMPRESSED CACHE' = 100, 'SYSTEM DROP MMAP CACHE' = 101, 'SYSTEM DROP QUERY CACHE' = 102, 'SYSTEM DROP COMPILED EXPRESSION CACHE' = 103, 'SYSTEM DROP FILESYSTEM CACHE' = 104, 'SYSTEM DROP SCHEMA CACHE' = 105, 'SYSTEM DROP S3 CLIENT CACHE' = 106, 'SYSTEM DROP CACHE' = 107, 'SYSTEM RELOAD CONFIG' = 108, 'SYSTEM RELOAD USERS' = 109, 'SYSTEM RELOAD SYMBOLS' = 110, 'SYSTEM RELOAD DICTIONARY' = 111, 'SYSTEM RELOAD MODEL' = 112, 'SYSTEM RELOAD FUNCTION' = 113, 'SYSTEM RELOAD EMBEDDED DICTIONARIES' = 114, 'SYSTEM RELOAD' = 115, 'SYSTEM RESTART DISK' = 116, 'SYSTEM MERGES' = 117, 'SYSTEM TTL MERGES' = 118, 'SYSTEM FETCHES' = 119, 'SYSTEM MOVES' = 120, 'SYSTEM DISTRIBUTED SENDS' = 121, 'SYSTEM REPLICATED SENDS' = 122, 'SYSTEM SENDS' = 123, 'SYSTEM REPLICATION QUEUES' = 124, 'SYSTEM DROP REPLICA' = 125, 'SYSTEM SYNC REPLICA' = 126, 'SYSTEM RESTART REPLICA' = 127, 'SYSTEM RESTORE REPLICA' = 128, 'SYSTEM WAIT LOADING PARTS' = 129, 'SYSTEM SYNC DATABASE REPLICA' = 130, 'SYSTEM SYNC TRANSACTION LOG' = 131, 'SYSTEM SYNC FILE CACHE' = 132, 'SYSTEM FLUSH DISTRIBUTED' = 133, 'SYSTEM FLUSH LOGS' = 134, 'SYSTEM FLUSH' = 135, 'SYSTEM THREAD FUZZER' = 136, 'SYSTEM UNFREEZE' = 137, 'SYSTEM FAILPOINT' = 138, 'SYSTEM' = 139, 'dictGet' = 140, 'displaySecretsInShowAndSelect' = 141, 'addressToLine' = 142, 'addressToLineWithInlines' = 143, 'addressToSymbol' = 144, 'demangle' = 145, 'INTROSPECTION' = 146, 'FILE' = 147, 'URL' = 148, 'REMOTE' = 149, 'MONGO' = 150, 'MEILISEARCH' = 151, 'MYSQL' = 152, 'POSTGRES' = 153, 'SQLITE' = 154, 'ODBC' = 155, 'JDBC' = 156, 'HDFS' = 157, 'S3' = 158, 'HIVE' = 159, 'AZURE' = 160, 'SOURCES' = 161, 'CLUSTER' = 162, 'ALL' = 163, 'NONE' = 164), `aliases` Array(String), `level` Nullable(Enum8('GLOBAL' = 0, 'DATABASE' = 1, 'TABLE' = 2, 'DICTIONARY' = 3, 'VIEW' = 4, 'COLUMN' = 5, 'NAMED_COLLECTION' = 6)), - `parent_group` Nullable(Enum16('SHOW DATABASES' = 0, 'SHOW TABLES' = 1, 'SHOW COLUMNS' = 2, 'SHOW DICTIONARIES' = 3, 'SHOW' = 4, 'SHOW FILESYSTEM CACHES' = 5, 'SELECT' = 6, 'INSERT' = 7, 'ALTER UPDATE' = 8, 'ALTER DELETE' = 9, 'ALTER ADD COLUMN' = 10, 'ALTER MODIFY COLUMN' = 11, 'ALTER DROP COLUMN' = 12, 'ALTER COMMENT COLUMN' = 13, 'ALTER CLEAR COLUMN' = 14, 'ALTER RENAME COLUMN' = 15, 'ALTER MATERIALIZE COLUMN' = 16, 'ALTER COLUMN' = 17, 'ALTER MODIFY COMMENT' = 18, 'ALTER ORDER BY' = 19, 'ALTER SAMPLE BY' = 20, 'ALTER ADD INDEX' = 21, 'ALTER DROP INDEX' = 22, 'ALTER MATERIALIZE INDEX' = 23, 'ALTER CLEAR INDEX' = 24, 'ALTER INDEX' = 25, 'ALTER ADD PROJECTION' = 26, 'ALTER DROP PROJECTION' = 27, 'ALTER MATERIALIZE PROJECTION' = 28, 'ALTER CLEAR PROJECTION' = 29, 'ALTER PROJECTION' = 30, 'ALTER ADD CONSTRAINT' = 31, 'ALTER DROP CONSTRAINT' = 32, 'ALTER CONSTRAINT' = 33, 'ALTER TTL' = 34, 'ALTER MATERIALIZE TTL' = 35, 'ALTER SETTINGS' = 36, 'ALTER MOVE PARTITION' = 37, 'ALTER FETCH PARTITION' = 38, 'ALTER FREEZE PARTITION' = 39, 'ALTER DATABASE SETTINGS' = 40, 'ALTER NAMED COLLECTION' = 41, 'ALTER TABLE' = 42, 'ALTER DATABASE' = 43, 'ALTER VIEW REFRESH' = 44, 'ALTER VIEW MODIFY QUERY' = 45, 'ALTER VIEW' = 46, 'ALTER' = 47, 'CREATE DATABASE' = 48, 'CREATE TABLE' = 49, 'CREATE VIEW' = 50, 'CREATE DICTIONARY' = 51, 'CREATE TEMPORARY TABLE' = 52, 'CREATE ARBITRARY TEMPORARY TABLE' = 53, 'CREATE FUNCTION' = 54, 'CREATE NAMED COLLECTION' = 55, 'CREATE' = 56, 'DROP DATABASE' = 57, 'DROP TABLE' = 58, 'DROP VIEW' = 59, 'DROP DICTIONARY' = 60, 'DROP FUNCTION' = 61, 'DROP NAMED COLLECTION' = 62, 'DROP' = 63, 'UNDROP TABLE' = 64, 'TRUNCATE' = 65, 'OPTIMIZE' = 66, 'BACKUP' = 67, 'KILL QUERY' = 68, 'KILL TRANSACTION' = 69, 'MOVE PARTITION BETWEEN SHARDS' = 70, 'CREATE USER' = 71, 'ALTER USER' = 72, 'DROP USER' = 73, 'CREATE ROLE' = 74, 'ALTER ROLE' = 75, 'DROP ROLE' = 76, 'ROLE ADMIN' = 77, 'CREATE ROW POLICY' = 78, 'ALTER ROW POLICY' = 79, 'DROP ROW POLICY' = 80, 'CREATE QUOTA' = 81, 'ALTER QUOTA' = 82, 'DROP QUOTA' = 83, 'CREATE SETTINGS PROFILE' = 84, 'ALTER SETTINGS PROFILE' = 85, 'DROP SETTINGS PROFILE' = 86, 'SHOW USERS' = 87, 'SHOW ROLES' = 88, 'SHOW ROW POLICIES' = 89, 'SHOW QUOTAS' = 90, 'SHOW SETTINGS PROFILES' = 91, 'SHOW ACCESS' = 92, 'ACCESS MANAGEMENT' = 93, 'SHOW NAMED COLLECTIONS' = 94, 'SHOW NAMED COLLECTIONS SECRETS' = 95, 'NAMED COLLECTION CONTROL' = 96, 'SYSTEM SHUTDOWN' = 97, 'SYSTEM DROP DNS CACHE' = 98, 'SYSTEM DROP MARK CACHE' = 99, 'SYSTEM DROP UNCOMPRESSED CACHE' = 100, 'SYSTEM DROP MMAP CACHE' = 101, 'SYSTEM DROP QUERY CACHE' = 102, 'SYSTEM DROP COMPILED EXPRESSION CACHE' = 103, 'SYSTEM DROP FILESYSTEM CACHE' = 104, 'SYSTEM DROP SCHEMA CACHE' = 105, 'SYSTEM DROP S3 CLIENT CACHE' = 106, 'SYSTEM DROP CACHE' = 107, 'SYSTEM RELOAD CONFIG' = 108, 'SYSTEM RELOAD USERS' = 109, 'SYSTEM RELOAD SYMBOLS' = 110, 'SYSTEM RELOAD DICTIONARY' = 111, 'SYSTEM RELOAD MODEL' = 112, 'SYSTEM RELOAD FUNCTION' = 113, 'SYSTEM RELOAD EMBEDDED DICTIONARIES' = 114, 'SYSTEM RELOAD' = 115, 'SYSTEM RESTART DISK' = 116, 'SYSTEM MERGES' = 117, 'SYSTEM TTL MERGES' = 118, 'SYSTEM FETCHES' = 119, 'SYSTEM MOVES' = 120, 'SYSTEM DISTRIBUTED SENDS' = 121, 'SYSTEM REPLICATED SENDS' = 122, 'SYSTEM SENDS' = 123, 'SYSTEM REPLICATION QUEUES' = 124, 'SYSTEM DROP REPLICA' = 125, 'SYSTEM SYNC REPLICA' = 126, 'SYSTEM RESTART REPLICA' = 127, 'SYSTEM RESTORE REPLICA' = 128, 'SYSTEM WAIT LOADING PARTS' = 129, 'SYSTEM SYNC DATABASE REPLICA' = 130, 'SYSTEM SYNC TRANSACTION LOG' = 131, 'SYSTEM SYNC FILE CACHE' = 132, 'SYSTEM FLUSH DISTRIBUTED' = 133, 'SYSTEM FLUSH LOGS' = 134, 'SYSTEM FLUSH' = 135, 'SYSTEM THREAD FUZZER' = 136, 'SYSTEM UNFREEZE' = 137, 'SYSTEM FAILPOINT' = 138, 'SYSTEM' = 139, 'dictGet' = 140, 'displaySecretsInShowAndSelect' = 141, 'addressToLine' = 142, 'addressToLineWithInlines' = 143, 'addressToSymbol' = 144, 'demangle' = 145, 'INTROSPECTION' = 146, 'FILE' = 147, 'URL' = 148, 'REMOTE' = 149, 'MONGO' = 150, 'MEILISEARCH' = 151, 'MYSQL' = 152, 'POSTGRES' = 153, 'SQLITE' = 154, 'ODBC' = 155, 'JDBC' = 156, 'HDFS' = 157, 'S3' = 158, 'HIVE' = 159, 'SOURCES' = 160, 'CLUSTER' = 161, 'ALL' = 162, 'NONE' = 163)) + `parent_group` Nullable(Enum16('SHOW DATABASES' = 0, 'SHOW TABLES' = 1, 'SHOW COLUMNS' = 2, 'SHOW DICTIONARIES' = 3, 'SHOW' = 4, 'SHOW FILESYSTEM CACHES' = 5, 'SELECT' = 6, 'INSERT' = 7, 'ALTER UPDATE' = 8, 'ALTER DELETE' = 9, 'ALTER ADD COLUMN' = 10, 'ALTER MODIFY COLUMN' = 11, 'ALTER DROP COLUMN' = 12, 'ALTER COMMENT COLUMN' = 13, 'ALTER CLEAR COLUMN' = 14, 'ALTER RENAME COLUMN' = 15, 'ALTER MATERIALIZE COLUMN' = 16, 'ALTER COLUMN' = 17, 'ALTER MODIFY COMMENT' = 18, 'ALTER ORDER BY' = 19, 'ALTER SAMPLE BY' = 20, 'ALTER ADD INDEX' = 21, 'ALTER DROP INDEX' = 22, 'ALTER MATERIALIZE INDEX' = 23, 'ALTER CLEAR INDEX' = 24, 'ALTER INDEX' = 25, 'ALTER ADD PROJECTION' = 26, 'ALTER DROP PROJECTION' = 27, 'ALTER MATERIALIZE PROJECTION' = 28, 'ALTER CLEAR PROJECTION' = 29, 'ALTER PROJECTION' = 30, 'ALTER ADD CONSTRAINT' = 31, 'ALTER DROP CONSTRAINT' = 32, 'ALTER CONSTRAINT' = 33, 'ALTER TTL' = 34, 'ALTER MATERIALIZE TTL' = 35, 'ALTER SETTINGS' = 36, 'ALTER MOVE PARTITION' = 37, 'ALTER FETCH PARTITION' = 38, 'ALTER FREEZE PARTITION' = 39, 'ALTER DATABASE SETTINGS' = 40, 'ALTER NAMED COLLECTION' = 41, 'ALTER TABLE' = 42, 'ALTER DATABASE' = 43, 'ALTER VIEW REFRESH' = 44, 'ALTER VIEW MODIFY QUERY' = 45, 'ALTER VIEW' = 46, 'ALTER' = 47, 'CREATE DATABASE' = 48, 'CREATE TABLE' = 49, 'CREATE VIEW' = 50, 'CREATE DICTIONARY' = 51, 'CREATE TEMPORARY TABLE' = 52, 'CREATE ARBITRARY TEMPORARY TABLE' = 53, 'CREATE FUNCTION' = 54, 'CREATE NAMED COLLECTION' = 55, 'CREATE' = 56, 'DROP DATABASE' = 57, 'DROP TABLE' = 58, 'DROP VIEW' = 59, 'DROP DICTIONARY' = 60, 'DROP FUNCTION' = 61, 'DROP NAMED COLLECTION' = 62, 'DROP' = 63, 'UNDROP TABLE' = 64, 'TRUNCATE' = 65, 'OPTIMIZE' = 66, 'BACKUP' = 67, 'KILL QUERY' = 68, 'KILL TRANSACTION' = 69, 'MOVE PARTITION BETWEEN SHARDS' = 70, 'CREATE USER' = 71, 'ALTER USER' = 72, 'DROP USER' = 73, 'CREATE ROLE' = 74, 'ALTER ROLE' = 75, 'DROP ROLE' = 76, 'ROLE ADMIN' = 77, 'CREATE ROW POLICY' = 78, 'ALTER ROW POLICY' = 79, 'DROP ROW POLICY' = 80, 'CREATE QUOTA' = 81, 'ALTER QUOTA' = 82, 'DROP QUOTA' = 83, 'CREATE SETTINGS PROFILE' = 84, 'ALTER SETTINGS PROFILE' = 85, 'DROP SETTINGS PROFILE' = 86, 'SHOW USERS' = 87, 'SHOW ROLES' = 88, 'SHOW ROW POLICIES' = 89, 'SHOW QUOTAS' = 90, 'SHOW SETTINGS PROFILES' = 91, 'SHOW ACCESS' = 92, 'ACCESS MANAGEMENT' = 93, 'SHOW NAMED COLLECTIONS' = 94, 'SHOW NAMED COLLECTIONS SECRETS' = 95, 'NAMED COLLECTION CONTROL' = 96, 'SYSTEM SHUTDOWN' = 97, 'SYSTEM DROP DNS CACHE' = 98, 'SYSTEM DROP MARK CACHE' = 99, 'SYSTEM DROP UNCOMPRESSED CACHE' = 100, 'SYSTEM DROP MMAP CACHE' = 101, 'SYSTEM DROP QUERY CACHE' = 102, 'SYSTEM DROP COMPILED EXPRESSION CACHE' = 103, 'SYSTEM DROP FILESYSTEM CACHE' = 104, 'SYSTEM DROP SCHEMA CACHE' = 105, 'SYSTEM DROP S3 CLIENT CACHE' = 106, 'SYSTEM DROP CACHE' = 107, 'SYSTEM RELOAD CONFIG' = 108, 'SYSTEM RELOAD USERS' = 109, 'SYSTEM RELOAD SYMBOLS' = 110, 'SYSTEM RELOAD DICTIONARY' = 111, 'SYSTEM RELOAD MODEL' = 112, 'SYSTEM RELOAD FUNCTION' = 113, 'SYSTEM RELOAD EMBEDDED DICTIONARIES' = 114, 'SYSTEM RELOAD' = 115, 'SYSTEM RESTART DISK' = 116, 'SYSTEM MERGES' = 117, 'SYSTEM TTL MERGES' = 118, 'SYSTEM FETCHES' = 119, 'SYSTEM MOVES' = 120, 'SYSTEM DISTRIBUTED SENDS' = 121, 'SYSTEM REPLICATED SENDS' = 122, 'SYSTEM SENDS' = 123, 'SYSTEM REPLICATION QUEUES' = 124, 'SYSTEM DROP REPLICA' = 125, 'SYSTEM SYNC REPLICA' = 126, 'SYSTEM RESTART REPLICA' = 127, 'SYSTEM RESTORE REPLICA' = 128, 'SYSTEM WAIT LOADING PARTS' = 129, 'SYSTEM SYNC DATABASE REPLICA' = 130, 'SYSTEM SYNC TRANSACTION LOG' = 131, 'SYSTEM SYNC FILE CACHE' = 132, 'SYSTEM FLUSH DISTRIBUTED' = 133, 'SYSTEM FLUSH LOGS' = 134, 'SYSTEM FLUSH' = 135, 'SYSTEM THREAD FUZZER' = 136, 'SYSTEM UNFREEZE' = 137, 'SYSTEM FAILPOINT' = 138, 'SYSTEM' = 139, 'dictGet' = 140, 'displaySecretsInShowAndSelect' = 141, 'addressToLine' = 142, 'addressToLineWithInlines' = 143, 'addressToSymbol' = 144, 'demangle' = 145, 'INTROSPECTION' = 146, 'FILE' = 147, 'URL' = 148, 'REMOTE' = 149, 'MONGO' = 150, 'MEILISEARCH' = 151, 'MYSQL' = 152, 'POSTGRES' = 153, 'SQLITE' = 154, 'ODBC' = 155, 'JDBC' = 156, 'HDFS' = 157, 'S3' = 158, 'HIVE' = 159, 'AZURE' = 160, 'SOURCES' = 161, 'CLUSTER' = 162, 'ALL' = 163, 'NONE' = 164)) ) ENGINE = SystemPrivileges COMMENT 'SYSTEM TABLE is built on the fly.' From be8e0487994d70c3c6350cb675475b12dffda8e5 Mon Sep 17 00:00:00 2001 From: johanngan Date: Tue, 6 Jun 2023 16:28:44 -0500 Subject: [PATCH 0581/1072] Revert invalid RegExpTreeDictionary optimization This reverts the following commits: - e77dd810369ad5fcf957393e4fc71a8a6220b04e - e8527e720b2ab12b3327f1e3886aace402a292c6 Additionally, functional tests are added. When scanning complex regexp nodes sequentially with RE2, the old code has an optimization to break out of the loop early upon finding a leaf node that matches. This is an invalid optimization because there's no guarantee that it's actually a VALID match, because its parents might NOT have matched. Semantically, a user would expect this match to be discarded and for the search to continue. Instead, since we skipped matching after the first false positive, subsequent nodes that would have matched are missing from the output value. This affects both dictGet and dictGetAll. It's difficult to distinguish a true positive from a false positive while looping through complex_regexp_nodes because we would have to scan all the parents of a matching node to confirm a true positive. Trying to do this might actually end up being slower than just scanning every complex regexp node, because complex_regexp_nodes is only a subset of all the tree nodes; we may end up duplicating work with scanning that Vectorscan has already done, depending on whether the parent nodes are "simple" or "complex". So instead of trying to fix this optimization, just remove it entirely. --- src/Dictionaries/RegExpTreeDictionary.cpp | 14 ---- ...04_regexp_dictionary_yaml_source.reference | 16 +++++ .../02504_regexp_dictionary_yaml_source.sh | 64 +++++++++++++++++++ 3 files changed, 80 insertions(+), 14 deletions(-) diff --git a/src/Dictionaries/RegExpTreeDictionary.cpp b/src/Dictionaries/RegExpTreeDictionary.cpp index 8d0af9b0abf..3852cca6928 100644 --- a/src/Dictionaries/RegExpTreeDictionary.cpp +++ b/src/Dictionaries/RegExpTreeDictionary.cpp @@ -129,17 +129,6 @@ struct RegExpTreeDictionary::RegexTreeNode return searcher.Match(haystack, 0, size, re2_st::RE2::Anchor::UNANCHORED, nullptr, 0); } - /// check if this node can cover all the attributes from the query. - bool containsAll(const std::unordered_map & matching_attributes) const - { - for (const auto & [key, value] : matching_attributes) - { - if (!attributes.contains(key)) - return false; - } - return true; - } - struct AttributeValue { Field field; @@ -691,9 +680,6 @@ std::unordered_map RegExpTreeDictionary::match( if (node_ptr->match(reinterpret_cast(keys_data.data()) + offset, length)) { match_result.insertNodeID(node_ptr->id); - /// When this node is leaf and contains all the required attributes, it means a match. - if (node_ptr->containsAll(attributes) && node_ptr->children.empty()) - break; } } diff --git a/tests/queries/0_stateless/02504_regexp_dictionary_yaml_source.reference b/tests/queries/0_stateless/02504_regexp_dictionary_yaml_source.reference index 437012dd516..79871e3716c 100644 --- a/tests/queries/0_stateless/02504_regexp_dictionary_yaml_source.reference +++ b/tests/queries/0_stateless/02504_regexp_dictionary_yaml_source.reference @@ -11,3 +11,19 @@ (['ClickHouse Documentation','ClickHouse'],[0,1],['/en'],['ClickHouse']) (['Documentation','GitHub'],[2,3],[NULL],[]) (['Documentation','GitHub'],[2,3],[NULL],[]) +ClickHouse +['ClickHouse'] +ClickHouse Documentation +['ClickHouse Documentation','ClickHouse','Documentation'] +GitHub Documentation +['GitHub Documentation','GitHub'] +Documentation +['Documentation'] +ClickHouse +['ClickHouse'] +ClickHouse Documentation +['ClickHouse Documentation','ClickHouse','Documentation'] +GitHub Documentation +['GitHub Documentation','GitHub'] +Documentation +['Documentation'] diff --git a/tests/queries/0_stateless/02504_regexp_dictionary_yaml_source.sh b/tests/queries/0_stateless/02504_regexp_dictionary_yaml_source.sh index ac0793460a9..5e8985406ae 100755 --- a/tests/queries/0_stateless/02504_regexp_dictionary_yaml_source.sh +++ b/tests/queries/0_stateless/02504_regexp_dictionary_yaml_source.sh @@ -175,6 +175,70 @@ select dictGetAll('regexp_dict3', ('tag', 'topological_index', 'captured', 'pare select dictGetAll('regexp_dict3', ('tag', 'topological_index', 'captured', 'parent'), 'github.com/clickhouse/tree/master/docs', 2); " +# Test that things work the same for "simple" regexps that go through Hyperscan and "complex" regexps that go through RE2. +# An easy way to force the use of RE2 is to disable Hyperscan. +# This tree is constructed purposely so that text might (falsely) match leaf nodes without matching their corresponding parent nodes +cat > "$yaml" < #include #include +#include #include #include #include @@ -948,25 +949,52 @@ RangesInDataParts MergeTreeDataSelectExecutor::filterPartsByPrimaryKeyAndSkipInd std::list useful_indices; std::map, MergedDataSkippingIndexAndCondition> merged_indices; + std::unordered_set ignored_index_names; + + if (use_skip_indexes && settings.ignore_data_skipping_indices.changed) + { + const auto & indices = settings.ignore_data_skipping_indices.toString(); + Tokens tokens(indices.data(), &indices[indices.size()], settings.max_query_size); + IParser::Pos pos(tokens, static_cast(settings.max_parser_depth)); + Expected expected; + + /// Use an unordered list rather than string vector + auto parse_single_id_or_literal = [&] + { + String str; + if (!parseIdentifierOrStringLiteral(pos, expected, str)) + return false; + + ignored_index_names.insert(std::move(str)); + return true; + }; + + if (!ParserList::parseUtil(pos, expected, parse_single_id_or_literal, false)) + throw Exception(ErrorCodes::CANNOT_PARSE_TEXT, "Cannot parse ignore_data_skipping_indices ('{}')", indices); + } if (use_skip_indexes) { for (const auto & index : metadata_snapshot->getSecondaryIndices()) { - auto index_helper = MergeTreeIndexFactory::instance().get(index); - if (index_helper->isMergeable()) - { - auto [it, inserted] = merged_indices.try_emplace({index_helper->index.type, index_helper->getGranularity()}); - if (inserted) - it->second.condition = index_helper->createIndexMergedCondition(query_info, metadata_snapshot); - it->second.addIndex(index_helper); - } - else + auto index_helper = MergeTreeIndexFactory::instance().get(index); + if(!ignored_index_names.contains(index.name)) { - auto condition = index_helper->createIndexCondition(query_info, context); - if (!condition->alwaysUnknownOrTrue()) - useful_indices.emplace_back(index_helper, condition); + if (index_helper->isMergeable()) + { + auto [it, inserted] = merged_indices.try_emplace({index_helper->index.type, index_helper->getGranularity()}); + if (inserted) + it->second.condition = index_helper->createIndexMergedCondition(query_info, metadata_snapshot); + + it->second.addIndex(index_helper); + } + else + { + auto condition = index_helper->createIndexCondition(query_info, context); + if (!condition->alwaysUnknownOrTrue()) + useful_indices.emplace_back(index_helper, condition); + } } } } diff --git a/tests/queries/0_stateless/02771_ignore_data_skipping_indices.reference b/tests/queries/0_stateless/02771_ignore_data_skipping_indices.reference new file mode 100644 index 00000000000..af1fce5ba13 --- /dev/null +++ b/tests/queries/0_stateless/02771_ignore_data_skipping_indices.reference @@ -0,0 +1,44 @@ +1 2 3 +1 2 3 +1 2 3 +Expression ((Projection + Before ORDER BY)) + Filter (WHERE) + ReadFromMergeTree (default.data_02771) + Indexes: + PrimaryKey + Condition: true + Parts: 1/1 + Granules: 1/1 + Skip + Name: x_idx + Description: minmax GRANULARITY 1 + Parts: 0/1 + Granules: 0/1 + Skip + Name: y_idx + Description: minmax GRANULARITY 1 + Parts: 0/0 + Granules: 0/0 + Skip + Name: xy_idx + Description: minmax GRANULARITY 1 + Parts: 0/0 + Granules: 0/0 +Expression ((Projection + Before ORDER BY)) + Filter (WHERE) + ReadFromMergeTree (default.data_02771) + Indexes: + PrimaryKey + Condition: true + Parts: 1/1 + Granules: 1/1 + Skip + Name: x_idx + Description: minmax GRANULARITY 1 + Parts: 0/1 + Granules: 0/1 + Skip + Name: y_idx + Description: minmax GRANULARITY 1 + Parts: 0/0 + Granules: 0/0 diff --git a/tests/queries/0_stateless/02771_ignore_data_skipping_indices.sql b/tests/queries/0_stateless/02771_ignore_data_skipping_indices.sql new file mode 100644 index 00000000000..ab314ae5ea2 --- /dev/null +++ b/tests/queries/0_stateless/02771_ignore_data_skipping_indices.sql @@ -0,0 +1,26 @@ +DROP TABLE IF EXISTS data_02771; +CREATE TABLE data_02771 +( + key Int, + x Int, + y Int, + INDEX x_idx x TYPE minmax GRANULARITY 1, + INDEX y_idx y TYPE minmax GRANULARITY 1, + INDEX xy_idx (x,y) TYPE minmax GRANULARITY 1 +) +Engine=MergeTree() +ORDER BY key; + +INSERT INTO data_02771 VALUES (1, 2, 3); + +SELECT * FROM data_02771; +SELECT * FROM data_02771 SETTINGS ignore_data_skipping_indices=''; -- { serverError 6 } +SELECT * FROM data_02771 SETTINGS ignore_data_skipping_indices='x_idx'; +SELECT * FROM data_02771 SETTINGS ignore_data_skipping_indices='na_idx'; + +SELECT * FROM data_02771 WHERE x = 1 AND y = 1 SETTINGS ignore_data_skipping_indices='xy_idx',force_data_skipping_indices='xy_idx' ; -- { serverError 277 } +SELECT * FROM data_02771 WHERE x = 1 AND y = 2 SETTINGS ignore_data_skipping_indices='xy_idx'; +EXPLAIN indexes = 1 SELECT * FROM data_02771 WHERE x = 1 AND y = 2; +EXPLAIN indexes = 1 SELECT * FROM data_02771 WHERE x = 1 AND y = 2 SETTINGS ignore_data_skipping_indices='xy_idx'; + +DROP TABLE data_02771; From f552b96451bd4c826a9e7d1bff669301c3c4bccc Mon Sep 17 00:00:00 2001 From: Boris Kuschel Date: Mon, 29 May 2023 14:00:00 -0700 Subject: [PATCH 0586/1072] Add docs for ignore index --- docs/en/operations/settings/settings.md | 83 +++++++++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 5730503a670..5b0c6b3c8c2 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -227,6 +227,89 @@ SELECT * FROM data_01515 WHERE d1 = 0 SETTINGS force_data_skipping_indices='`d1_ SELECT * FROM data_01515 WHERE d1 = 0 AND assumeNotNull(d1_null) = 0 SETTINGS force_data_skipping_indices='`d1_idx`, d1_null_idx'; -- Ok. ``` +## ignore_data_skipping_indices {#settings-ignore_data_skipping_indices} + +Ignores the skipping indexes specified if used by the query. + +Consider the following example: + +```sql +CREATE TABLE data +( + key Int, + x Int, + y Int, + INDEX x_idx x TYPE minmax GRANULARITY 1, + INDEX y_idx y TYPE minmax GRANULARITY 1, + INDEX xy_idx (x,y) TYPE minmax GRANULARITY 1 +) +Engine=MergeTree() +ORDER BY key; + +INSERT INTO data VALUES (1, 2, 3); + +SELECT * FROM data; +SELECT * FROM data SETTINGS ignore_data_skipping_indices=''; -- query will produce CANNOT_PARSE_TEXT error. +SELECT * FROM data SETTINGS ignore_data_skipping_indices='x_idx'; -- Ok. +SELECT * FROM data SETTINGS ignore_data_skipping_indices='na_idx'; -- Ok. + +SELECT * FROM data WHERE x = 1 AND y = 1 SETTINGS ignore_data_skipping_indices='xy_idx',force_data_skipping_indices='xy_idx' ; -- query will produce INDEX_NOT_USED error, since xy_idx is explictly ignored. +SELECT * FROM data WHERE x = 1 AND y = 2 SETTINGS ignore_data_skipping_indices='xy_idx'; +``` + +The query without ignoring any indexes: +```sql +EXPLAIN indexes = 1 SELECT * FROM data WHERE x = 1 AND y = 2; + +Expression ((Projection + Before ORDER BY)) + Filter (WHERE) + ReadFromMergeTree (default.data) + Indexes: + PrimaryKey + Condition: true + Parts: 1/1 + Granules: 1/1 + Skip + Name: x_idx + Description: minmax GRANULARITY 1 + Parts: 0/1 + Granules: 0/1 + Skip + Name: y_idx + Description: minmax GRANULARITY 1 + Parts: 0/0 + Granules: 0/0 + Skip + Name: xy_idx + Description: minmax GRANULARITY 1 + Parts: 0/0 + Granules: 0/0 +``` + +Ignoring the `xy_idx` index: +```sql +EXPLAIN indexes = 1 SELECT * FROM data WHERE x = 1 AND y = 2 SETTINGS ignore_data_skipping_indices='xy_idx'; + +Expression ((Projection + Before ORDER BY)) + Filter (WHERE) + ReadFromMergeTree (default.data) + Indexes: + PrimaryKey + Condition: true + Parts: 1/1 + Granules: 1/1 + Skip + Name: x_idx + Description: minmax GRANULARITY 1 + Parts: 0/1 + Granules: 0/1 + Skip + Name: y_idx + Description: minmax GRANULARITY 1 + Parts: 0/0 + Granules: 0/0 +``` + Works with tables in the MergeTree family. ## convert_query_to_cnf {#convert_query_to_cnf} From 689e0cabe0ca3cfc4b710a7426dbb1d363437984 Mon Sep 17 00:00:00 2001 From: Boris Kuschel Date: Tue, 30 May 2023 05:31:54 -0700 Subject: [PATCH 0587/1072] Add space to if --- src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp index ca6ab931499..c07d887588b 100644 --- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp @@ -979,7 +979,7 @@ RangesInDataParts MergeTreeDataSelectExecutor::filterPartsByPrimaryKeyAndSkipInd { auto index_helper = MergeTreeIndexFactory::instance().get(index); - if(!ignored_index_names.contains(index.name)) + if (!ignored_index_names.contains(index.name)) { if (index_helper->isMergeable()) { From 7c2b88a00eb1972fbd27b534ad578c2e69486605 Mon Sep 17 00:00:00 2001 From: Boris Kuschel Date: Thu, 1 Jun 2023 07:14:39 -0700 Subject: [PATCH 0588/1072] Make test invariant --- .../02771_ignore_data_skipping_indices.reference | 4 ---- .../0_stateless/02771_ignore_data_skipping_indices.sql | 6 +++--- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/tests/queries/0_stateless/02771_ignore_data_skipping_indices.reference b/tests/queries/0_stateless/02771_ignore_data_skipping_indices.reference index af1fce5ba13..786360783fd 100644 --- a/tests/queries/0_stateless/02771_ignore_data_skipping_indices.reference +++ b/tests/queries/0_stateless/02771_ignore_data_skipping_indices.reference @@ -1,8 +1,6 @@ 1 2 3 1 2 3 1 2 3 -Expression ((Projection + Before ORDER BY)) - Filter (WHERE) ReadFromMergeTree (default.data_02771) Indexes: PrimaryKey @@ -24,8 +22,6 @@ Expression ((Projection + Before ORDER BY)) Description: minmax GRANULARITY 1 Parts: 0/0 Granules: 0/0 -Expression ((Projection + Before ORDER BY)) - Filter (WHERE) ReadFromMergeTree (default.data_02771) Indexes: PrimaryKey diff --git a/tests/queries/0_stateless/02771_ignore_data_skipping_indices.sql b/tests/queries/0_stateless/02771_ignore_data_skipping_indices.sql index ab314ae5ea2..baa2d497863 100644 --- a/tests/queries/0_stateless/02771_ignore_data_skipping_indices.sql +++ b/tests/queries/0_stateless/02771_ignore_data_skipping_indices.sql @@ -20,7 +20,7 @@ SELECT * FROM data_02771 SETTINGS ignore_data_skipping_indices='na_idx'; SELECT * FROM data_02771 WHERE x = 1 AND y = 1 SETTINGS ignore_data_skipping_indices='xy_idx',force_data_skipping_indices='xy_idx' ; -- { serverError 277 } SELECT * FROM data_02771 WHERE x = 1 AND y = 2 SETTINGS ignore_data_skipping_indices='xy_idx'; -EXPLAIN indexes = 1 SELECT * FROM data_02771 WHERE x = 1 AND y = 2; -EXPLAIN indexes = 1 SELECT * FROM data_02771 WHERE x = 1 AND y = 2 SETTINGS ignore_data_skipping_indices='xy_idx'; +SELECT * from ( EXPLAIN indexes = 1 SELECT * FROM data_02771 WHERE x = 1 AND y = 2 ) WHERE explain NOT LIKE '%Expression%' AND explain NOT LIKE '%Filter%'; +SELECT * from ( EXPLAIN indexes = 1 SELECT * FROM data_02771 WHERE x = 1 AND y = 2 SETTINGS ignore_data_skipping_indices='xy_idx' ) WHERE explain NOT LIKE '%Expression%' AND explain NOT LIKE '%Filter%'; -DROP TABLE data_02771; +DROP TABLE data_02771; \ No newline at end of file From 1fa1215d1549e5887695cfd0f6bf4aaa61101fec Mon Sep 17 00:00:00 2001 From: Boris Kuschel Date: Fri, 2 Jun 2023 07:21:22 -0700 Subject: [PATCH 0589/1072] Avoid UB --- src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp | 2 +- .../queries/0_stateless/02771_ignore_data_skipping_indices.sql | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp index c07d887588b..4967de8424b 100644 --- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp @@ -954,7 +954,7 @@ RangesInDataParts MergeTreeDataSelectExecutor::filterPartsByPrimaryKeyAndSkipInd if (use_skip_indexes && settings.ignore_data_skipping_indices.changed) { const auto & indices = settings.ignore_data_skipping_indices.toString(); - Tokens tokens(indices.data(), &indices[indices.size()], settings.max_query_size); + Tokens tokens(indices.data(), indices.data() + indices.size(), settings.max_query_size); IParser::Pos pos(tokens, static_cast(settings.max_parser_depth)); Expected expected; diff --git a/tests/queries/0_stateless/02771_ignore_data_skipping_indices.sql b/tests/queries/0_stateless/02771_ignore_data_skipping_indices.sql index baa2d497863..289d5240b57 100644 --- a/tests/queries/0_stateless/02771_ignore_data_skipping_indices.sql +++ b/tests/queries/0_stateless/02771_ignore_data_skipping_indices.sql @@ -23,4 +23,4 @@ SELECT * FROM data_02771 WHERE x = 1 AND y = 2 SETTINGS ignore_data_skipping_ind SELECT * from ( EXPLAIN indexes = 1 SELECT * FROM data_02771 WHERE x = 1 AND y = 2 ) WHERE explain NOT LIKE '%Expression%' AND explain NOT LIKE '%Filter%'; SELECT * from ( EXPLAIN indexes = 1 SELECT * FROM data_02771 WHERE x = 1 AND y = 2 SETTINGS ignore_data_skipping_indices='xy_idx' ) WHERE explain NOT LIKE '%Expression%' AND explain NOT LIKE '%Filter%'; -DROP TABLE data_02771; \ No newline at end of file +DROP TABLE data_02771; From 45d000b71780cb7a022c6c0694d978596ac8fb96 Mon Sep 17 00:00:00 2001 From: Boris Kuschel Date: Mon, 5 Jun 2023 05:27:44 -0700 Subject: [PATCH 0590/1072] Turn off analyzer for test --- .../0_stateless/02771_ignore_data_skipping_indices.sql | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/queries/0_stateless/02771_ignore_data_skipping_indices.sql b/tests/queries/0_stateless/02771_ignore_data_skipping_indices.sql index 289d5240b57..a49239e9de2 100644 --- a/tests/queries/0_stateless/02771_ignore_data_skipping_indices.sql +++ b/tests/queries/0_stateless/02771_ignore_data_skipping_indices.sql @@ -1,4 +1,8 @@ +SET allow_experimental_analyzer = 0; + DROP TABLE IF EXISTS data_02771; + + CREATE TABLE data_02771 ( key Int, From f3959aa9e16fd50ad5e7081c12a9a9948113e898 Mon Sep 17 00:00:00 2001 From: Derek Chia Date: Wed, 7 Jun 2023 11:07:16 +0800 Subject: [PATCH 0591/1072] Update settings.md `max_final_threads` is now set to the number of cores by default. See https://github.com/ClickHouse/ClickHouse/pull/47915 --- docs/en/operations/settings/settings.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 5730503a670..8e2cd8d6027 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -3155,7 +3155,7 @@ Possible values: - Positive integer. - 0 or 1 — Disabled. `SELECT` queries are executed in a single thread. -Default value: `16`. +Default value: the number of physical CPU cores. ## opentelemetry_start_trace_probability {#opentelemetry-start-trace-probability} From 23a30268369c3166965d34815fd963db33740a64 Mon Sep 17 00:00:00 2001 From: Alexey Gerasimchuck Date: Wed, 7 Jun 2023 03:16:29 +0000 Subject: [PATCH 0592/1072] Implemented connection string --- docs/en/interfaces/cli.md | 110 +++++++++ docs/ru/interfaces/cli.md | 110 +++++++++ programs/client/Client.cpp | 13 +- src/Client/ConnectionString.cpp | 219 ++++++++++++++++++ src/Client/ConnectionString.h | 22 ++ .../02784_connection_string.reference | 125 ++++++++++ .../0_stateless/02784_connection_string.sh | 156 +++++++++++++ 7 files changed, 753 insertions(+), 2 deletions(-) create mode 100644 src/Client/ConnectionString.cpp create mode 100644 src/Client/ConnectionString.h create mode 100644 tests/queries/0_stateless/02784_connection_string.reference create mode 100755 tests/queries/0_stateless/02784_connection_string.sh diff --git a/docs/en/interfaces/cli.md b/docs/en/interfaces/cli.md index f670d464006..5255657ddfd 100644 --- a/docs/en/interfaces/cli.md +++ b/docs/en/interfaces/cli.md @@ -158,6 +158,116 @@ $ clickhouse-client --param_tuple_in_tuple="(10, ('dt', 10))" -q "SELECT * FROM $ clickhouse-client --param_tbl="numbers" --param_db="system" --param_col="number" --query "SELECT {col:Identifier} FROM {db:Identifier}.{tbl:Identifier} LIMIT 10" ``` +## Connection string {#connection_string} + +The connection string for clickhouse-client is presented in URI format: + +```text +clickhouse://[user_info@][hosts_and_ports][/dbname][?query_parameters] +``` + +where user_info is: ```user[:password]``` +and hosts_and_ports is a list of values: ```[host][:port],[host][:port]``` Port is not mandatory. +and query_parameters is a list of parameter[=value]: ```param_name[=value]¶m_name[=value]...``` value may not be required for some of parameters. + +Allowed query_parameters keys: + +- **secure** or shorthanded **s** - no value. If specified, client will connect to the server over a secure connection (TLS). See **secure** in [command-line-options](#command-line-options) + +These examples illustrate valid connection strings for clickhouse-client: + +```text +clickhouse: +clickhouse://localhost +clickhouse://localhost:9000 +clickhouse://localhost/default +clickhouse://default@localhost +clickhouse://user:password@localhost +clickhouse://user_name@localhost/some_database?secure +clickhouse://host1:9000,host2:5000/some_database +``` + +The host component can either be an IP address or a host name. Put an IPv6 address in square brackets to specify it: + +```text +clickhouse://[2001:db8::1234] +``` + +If user or/and password are not specified, default values will be used. +If host is not specified, the default host will be used (localhost). +If port is not specified, the default port will be used (9000). +If database is not specified, the default database will be used. + +User, password, and database can be specified in the connection string either in --user command line option. + +The connection string must be specified in the first argument of clickhouse-client. The connection string can be combined with other [command-line-options](#command-line-options) except **--host(h)** and **--port**. + +### Multiple hosts {#connection_string_multiple_hosts} + +URI allows multiple hosts to be connected to, and the client will try to connect to those hosts using the order from URI and command line options. The hosts and ports in the URI accept comma-separated lists of values. + +If more than one host is supplied, or if a single host name is translated to more than one address, each host and address will be attempted one at a time until one is successful. The remaining hosts after successful connection in the list are not tried. + +### Percent encoding {#connection_string_uri_percent_encoding} + +Hosts, user name, password, database and query parameters should be [Percent-Encoded](https://en.wikipedia.org/wiki/URL_encoding) if values contain URI invalid characters. + +### Examples {#connection_string_examples} + +Connect to localhost using port 9000 and executes the query "SELECT 1". + +``` bash +clickhouse-client "clickhouse://localhost:9000" --query "SELECT 1" +``` + +Connect to localhost using port 9000 in interactive, multiline mode. + +``` bash +clickhouse-client "clickhouse://localhost:9000" -m +``` + +Connect to localhost using port 9000 in interactive mode with the user specified in --user option. + +``` bash +clickhouse-client "clickhouse://localhost:9000" --user default +``` + +Connect to localhost using port 9000 in interactive mode with database 'my_database' specified in command line option + +``` bash +clickhouse-client "clickhouse://localhost:9000" --database my_database +``` + +Connect to localhost using port 9000 in interactive mode with the database specified in the connection string. + +``` bash +clickhouse-client "clickhouse://localhost:9000/my_database" +``` + +Connect to localhost using port 9000 in interactive mode with a database specified in the connection string and a secure connection using shorthanded 's' URI parameter. + +```bash +clickhouse-client "clickhouse://localhost/my_database?s" +``` + +Connect to default host using the default port, default user, and default database. + +``` bash +clickhouse-client "clickhouse:" +``` + +Connect to the default host using the default port, using user user_name and no password. + +``` bash +clickhouse-client "clickhouse://user_name@" +``` + +Connect to localhost using email user name. Symbol '@' is percent encoded to '%40'. + +``` bash +clickhouse-client "clickhouse://some_user%40some_mail.com@localhost:9000" +``` + ## Configuring {#interfaces_cli_configuration} You can pass parameters to `clickhouse-client` (all parameters have a default value) using: diff --git a/docs/ru/interfaces/cli.md b/docs/ru/interfaces/cli.md index 4c22eae0207..06642800cc6 100644 --- a/docs/ru/interfaces/cli.md +++ b/docs/ru/interfaces/cli.md @@ -110,6 +110,116 @@ $ clickhouse-client --param_tuple_in_tuple="(10, ('dt', 10))" -q "SELECT * FROM $ clickhouse-client --param_tbl="numbers" --param_db="system" --param_col="number" --query "SELECT {col:Identifier} FROM {db:Identifier}.{tbl:Identifier} LIMIT 10" ``` +## Строка подключения {#connection_string} + +Строка подключения для clickhouse-client представлена в формате URI: + +```text +clickhouse://[user_info@][hosts_and_ports][/dbname][?query_parameters] +``` + +где user_info - это: ```user[:password]``` +hosts_and_ports - это список значений: ```[host][:port],[host][:port]```. Port может быть не задан. +query_parameters - это список пар ключ[=значение]: ```param_name[=value]¶m_name[=value]...```. Значение может быть пустым + +Допустимые ключи query_parameters: + +- **secure** или сокращенно **s** - без значение. Если параметр указан, то соединение с сервером будет осуществляться по защищенному каналу (TLS). См. **secure** в [command-line-options](#command-line-options). + +Эти примеры иллюстрируют допустимые строки подключения для clickhouse-client: + +```text +clickhouse: +clickhouse://localhost +clickhouse://localhost:9000 +clickhouse://localhost/default +clickhouse://default@localhost +clickhouse://user:password@localhost +clickhouse://имя_пользователя@localhost/some_database?secure +clickhouse://host1:9000,host2:5000/some_database +``` + +Параметр host может быть либо IP-адресом, либо именем хоста. Для указания IPv6-адреса поместите его в квадратные скобки: + +```text +clickhouse://[2001:db8::1234] +``` + +Если пользователь или/и пароль не указаны, будут использоваться значения по умолчанию. +Если host не указан, будет использован хост по умолчанию (localhost). +Если port не указан, будет использоваться порт по умолчанию (9000). +Если база данных не указана, будет использоваться база данных по умолчанию (default). + +Пользователь, пароль и база данных могут быть указаны в строке подключения либо в опциях командной строки --user, --password, --database. + +Строка подключения должна быть указана в первом аргументе clickhouse-client. Строка подключения может комбинироваться с другими [параметрами командной строки] (#command-line-options) кроме **--host(h)** и **--port**. + +### Несколько хостов {#connection_string_multiple_hosts} + +URI позволяет подключаться к нескольким хостам, и клиент будет пытаться подключиться к этим хостам, используя порядок из URI и опций командной строки. Хосты и порты в URI принимают списки значений, разделенные запятыми. + +Если указано более одного хоста или если одно имя хоста транслируется в несколько адресов, Клиент будет будет пытаться подключится к каждому хосту и адресу в порядке в котором они встречаются в URI И опциях клиента, пока не будет установлено соединение. Соединение разрывается, если соединение установлено и аутентификация прошла успешно, остальные хосты в списке игнорируются. + +### Кодирование URI {#connection_string_uri_percent_encoding} + +Хосты, имя пользователя, пароль, имя базы данных, и параметры запроса должны быть [закодированы](https://ru.wikipedia.org/wiki/URL#%D0%9A%D0%BE%D0%B4%D0%B8%D1%80%D0%BE%D0%B2%D0%B0%D0%BD%D0%B8%D0%B5_URL), если значения содержат невалидные символы URI. + +### Примеры {#connection_string_examples} + +Подключиться к localhost через порт 9000 и выполнить запрос "SELECT 1" + +``` bash +clickhouse-client "clickhouse://localhost:9000" --query "SELECT 1" +``` + +Подключиться к localhost через порт 9000 в интерактивном, многострочном режиме. + +``` bash +clickhouse-client "clickhouse://localhost:9000" -m +``` + +Подключиться к localhost через порт 9000 в интерактивном режиме с пользователем default, указанным в опции --user. + +``` bash +clickhouse-client "clickhouse://localhost:9000" --user default +``` + +Подключиться к localhost, используя порт 9000 в интерактивном режиме с базой данных 'my_database', указанной в опции командной строки. + +``` bash +clickhouse-client "clickhouse://localhost:9000" --database my_database +``` + +Подключиться к localhost через порт 9000 в интерактивном режиме с базой данных my_database, указанной в строке подключения. + +``` bash +clickhouse-client "clickhouse://localhost:9000/my_database" +``` + +Подключиться к localhost через порт 9000 в интерактивном режиме с базой данных, указанной в строке подключения, и безопасным соединением с использованием сокращенного параметра URI 's'. + +``` bash +clickhouse-client "clickhouse://localhost/my_database?s" +``` + +Подключиться к хосту по умолчанию с использованием порта по умолчанию, пользователя по умолчанию, и базы данных по умолчанию. + +``` bash +clickhouse-client "clickhouse:" +``` + +Подключиться к хосту по умолчанию через порт по умолчанию, используя имя пользователя user_name без пароля. + +``` bash +clickhouse-client "clickhouse://user_name@" +``` + +Подключиться к localhost, используя электронную почту, как имя пользователя. Символ '@' закодирован как '%40'. + +``` bash +clickhouse-client "clickhouse://some_user%40some_mail.com@localhost:9000" +``` + ## Конфигурирование {#interfaces_cli_configuration} В `clickhouse-client` можно передавать различные параметры (все параметры имеют значения по умолчанию) с помощью: diff --git a/programs/client/Client.cpp b/programs/client/Client.cpp index 32a07284d26..e513314387f 100644 --- a/programs/client/Client.cpp +++ b/programs/client/Client.cpp @@ -5,13 +5,13 @@ #include #include #include -#include #include #include #include #include #include #include "Client.h" +#include "Client/ConnectionString.h" #include "Core/Protocol.h" #include "Parsers/formatAST.h" @@ -1248,6 +1248,9 @@ void Client::readArguments( std::vector & external_tables_arguments, std::vector & hosts_and_ports_arguments) { + bool has_connection_string = argc >= 2 && tryParseConnectionString(std::string_view(argv[1]), common_arguments, hosts_and_ports_arguments); + int start_argument_index = has_connection_string ? 2 : 1; + /** We allow different groups of arguments: * - common arguments; * - arguments for any number of external tables each in form "--external args...", @@ -1260,7 +1263,7 @@ void Client::readArguments( std::string prev_host_arg; std::string prev_port_arg; - for (int arg_num = 1; arg_num < argc; ++arg_num) + for (int arg_num = start_argument_index; arg_num < argc; ++arg_num) { std::string_view arg = argv[arg_num]; @@ -1322,6 +1325,9 @@ void Client::readArguments( } else if (arg.starts_with("--host") || arg.starts_with("-h")) { + if (has_connection_string) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Mixing connection string and --host/--port client arguments is prohibited"); + std::string host_arg; /// --host host if (arg == "--host" || arg == "-h") @@ -1353,6 +1359,9 @@ void Client::readArguments( } else if (arg.starts_with("--port")) { + if (has_connection_string) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Mixing connection string and --host/--port client arguments is prohibited"); + auto port_arg = String{arg}; /// --port port if (arg == "--port") diff --git a/src/Client/ConnectionString.cpp b/src/Client/ConnectionString.cpp new file mode 100644 index 00000000000..a8b87726a65 --- /dev/null +++ b/src/Client/ConnectionString.cpp @@ -0,0 +1,219 @@ +#include "ConnectionString.h" + +#include +#include +#include + +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; +} + +} + +namespace +{ + +using namespace std::string_literals; +using namespace std::literals::string_view_literals; + +constexpr auto CONNECTION_URI_SCHEME = "clickhouse:"sv; + +void uriDecode(std::string & uri_encoded_string, bool plus_as_space) +{ + std::string temp; + Poco::URI::decode(uri_encoded_string, temp, plus_as_space); + std::swap(temp, uri_encoded_string); +} + +void getHostAndPort(const Poco::URI & uri, std::vector> & hosts_and_ports_arguments) +{ + auto host = uri.getHost(); + std::vector host_and_port; + if (!host.empty()) + { + uriDecode(host, false); + host_and_port.push_back("--host="s + host); + } + + // Port can be written without host (":9000"). Empty host name equals to default host. + auto port = uri.getPort(); + if (port != 0) + host_and_port.push_back("--port="s + std::to_string(port)); + + if (!host_and_port.empty()) + hosts_and_ports_arguments.push_back(std::move(host_and_port)); +} + +void getHostAndPort( + Poco::URI & uri, + std::vector> & hosts_and_ports_arguments, + const char * host_begin, + const char * host_end, + const char * right_part_start, + const char * connection_string_end) +{ + // User info does not matter in sub URI + std::string uri_string = {CONNECTION_URI_SCHEME.begin(), CONNECTION_URI_SCHEME.end()}; + if (host_begin != nullptr && host_begin != host_end) + { + uri_string.append("//"); + uri_string.append(host_begin, host_end); + } + + // Right part from string includes '/database?[params]' + uri_string.append(right_part_start, connection_string_end); + try + { + uri = Poco::URI(uri_string); + } + catch (const Poco::URISyntaxException & invalid_uri_exception) + { + throw DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, + "Invalid connection string syntax {}: {}", uri_string, invalid_uri_exception.what()); + } + + getHostAndPort(uri, hosts_and_ports_arguments); +} + +std::string makeArgument(const std::string & connection_string_parameter_name) +{ + return (connection_string_parameter_name.size() == 1 ? "-"s : "--"s) + connection_string_parameter_name; +} + +} + +namespace DB +{ + +bool tryParseConnectionString( + std::string_view connection_string, + std::vector & common_arguments, + std::vector> & hosts_and_ports_arguments) +{ + if (!connection_string.starts_with(CONNECTION_URI_SCHEME)) + return false; + + if (connection_string.size() == CONNECTION_URI_SCHEME.size()) + return true; + + auto offset = CONNECTION_URI_SCHEME.size(); + if ((connection_string.substr(offset).starts_with("//"))) + offset += 2; + + auto hosts_end_pos = std::string_view::npos; + auto hosts_or_user_info_end_pos = connection_string.find_first_of("?/@", offset); + + auto has_user_info = hosts_or_user_info_end_pos != std::string_view::npos && connection_string[hosts_or_user_info_end_pos] == '@'; + if (has_user_info) + { + // Move offset right after user info + offset = hosts_or_user_info_end_pos + 1; + hosts_end_pos = connection_string.find_first_of("?/@", offset); + // Several '@' symbols in connection string is prohibited. + // If user name contains '@' then it should be percent-encoded. + // several users: 'usr1@host1,@usr2@host2' is invalid. + if (hosts_end_pos != std::string_view::npos && connection_string[hosts_end_pos] == '@') + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Symbols '@' in URI in password or user name should be percent-encoded. Individual user names for different hosts also prohibited. {}", + connection_string); + } + } + else + hosts_end_pos = hosts_or_user_info_end_pos; + + auto hosts_end = hosts_end_pos != std::string_view::npos ? connection_string.begin() + hosts_end_pos + : connection_string.end(); + + try + { + // Poco::URI doesn't support several hosts in URI. + // Split string clickhouse:[user_info]host1:port1, ... , hostN:portN[database]?[query_parameters] + // into multiple string for each host: + // clickhouse:[user_info]host1:port1[database]?[query_parameters] + // ... + // clickhouse:[user_info]hostN:portN[database]?[query_parameters] + Poco::URI uri; + auto last_host_begin = connection_string.begin() + offset; + for (auto it = last_host_begin; it != hosts_end; ++it) + { + if (*it == ',') + { + getHostAndPort(uri, hosts_and_ports_arguments, last_host_begin, it, hosts_end, connection_string.end()); + last_host_begin = it + 1; + } + } + + if (uri.empty()) + { + // URI has no host specified + uri = std::string{connection_string.begin(), connection_string.end()}; + getHostAndPort(uri, hosts_and_ports_arguments); + } + else + getHostAndPort(uri, hosts_and_ports_arguments, last_host_begin, hosts_end, hosts_end, connection_string.end()); + + Poco::URI::QueryParameters params = uri.getQueryParameters(); + for (const auto & param : params) + { + if (param.first == "secure" || param.first == "s") + { + if (!param.second.empty()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "secure URI argument does not require value"); + + common_arguments.push_back(makeArgument(param.first)); + } + else + throw Exception(ErrorCodes::BAD_ARGUMENTS, "URI argument {} is unknown", param.first); + } + + auto user_info = uri.getUserInfo(); + if (!user_info.empty()) + { + // Poco::URI doesn't decode user name/password by default. + // But ClickHouse allows to have users with email user name like: 'john@some_mail.com' + // john@some_mail.com should be percent-encoded: 'john%40some_mail.com' + uriDecode(user_info, true); + std::string::size_type pos = user_info.find(':'); + if (pos != std::string::npos) + { + common_arguments.push_back("--user"); + common_arguments.push_back(user_info.substr(0, pos)); + + ++pos; // Skip ':' + common_arguments.push_back("--password"); + common_arguments.push_back(user_info.substr(pos)); + } + else + { + common_arguments.push_back("--user"); + common_arguments.push_back(user_info); + } + } + + const auto & database_name = uri.getPath(); + size_t start_symbol = database_name.size() > 0u && database_name[0] == '/' ? 1u : 0u; + if (database_name.size() > start_symbol) + { + common_arguments.push_back("--database"); + common_arguments.push_back(start_symbol == 0u ? database_name : database_name.substr(start_symbol)); + } + } + catch (const Poco::URISyntaxException & invalid_uri_exception) + { + throw DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, + "Invalid connection string {}: {}", connection_string, invalid_uri_exception.what()); + } + + return true; +} + +} diff --git a/src/Client/ConnectionString.h b/src/Client/ConnectionString.h new file mode 100644 index 00000000000..aafb1139b00 --- /dev/null +++ b/src/Client/ConnectionString.h @@ -0,0 +1,22 @@ +#pragma once + +#include +#include +#include + +namespace DB +{ +/** Tries to parse ClickHouse connection string. + * if @connection_string starts with 'clickhouse:' then connection string will be parsed + * and converted into a set of arguments for the client. + * Connection string format is similar to URI "clickhouse:[//[user_info@][hosts_and_ports]][/dbname][?query_parameters]" + * with the difference that hosts_and_ports can contain multiple hosts separated by ','. + * example: clickhouse://user@host1:port1,host2:port2 + * @return returns true if there is a URI, false otherwise. + * @exception throws DB::Exception if URI has valid scheme (clickhouse:), but invalid internals. +*/ +bool tryParseConnectionString( + std::string_view connection_string, + std::vector & common_arguments, + std::vector> & hosts_and_ports_arguments); +} diff --git a/tests/queries/0_stateless/02784_connection_string.reference b/tests/queries/0_stateless/02784_connection_string.reference new file mode 100644 index 00000000000..6a36abae8e0 --- /dev/null +++ b/tests/queries/0_stateless/02784_connection_string.reference @@ -0,0 +1,125 @@ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +500 +501 +502 +1000 +1001 +1002 +1003 +Bad arguments +Bad arguments +Bad arguments +Bad arguments +Bad arguments +Bad arguments +Bad argumentsuthentication failed +Authentication failed diff --git a/tests/queries/0_stateless/02784_connection_string.sh b/tests/queries/0_stateless/02784_connection_string.sh new file mode 100755 index 00000000000..fce93fdad74 --- /dev/null +++ b/tests/queries/0_stateless/02784_connection_string.sh @@ -0,0 +1,156 @@ +#!/usr/bin/env bash + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +USER_INFOS=('default' '') +HOSTS_PORTS=("$CLICKHOUSE_HOST:$CLICKHOUSE_PORT_TCP" "$CLICKHOUSE_HOST" "$CLICKHOUSE_HOST:" ":$CLICKHOUSE_PORT_TCP" "127.0.0.1" "127.0.0.1:$CLICKHOUSE_PORT_TCP" "$CLICKHOUSE_HOST:$CLICKHOUSE_PORT_TCP,invalid_host:9000" "[0000:0000:0000:0000:0000:0000:0000:0001]" "[::1]" "[::1]:$CLICKHOUSE_PORT_TCP" "" ) +DATABASES=("$CLICKHOUSE_DATABASE" "") + +TEST_INDEX=0 + +function runClient() +{ + $CLICKHOUSE_CLIENT_BINARY "$@" -q "SELECT $TEST_INDEX" --log_comment 02766_connection_string.sh --send_logs_level=warning + ((++TEST_INDEX)) +} + +function testConnectionString() +{ + if [ "$database" == "" ]; then + runClient "clickhouse:$1" + runClient "clickhouse:$1/" + else + runClient "clickhouse:$1/$database" + fi +} + +function testConnectionWithUserName() +{ +if [ "$user_info" == "" ] && [ "$host_port" == "" ]; then + testConnectionString "//" + testConnectionString "" + else + testConnectionString "//$user_info@$host_port" + fi +} + +for user_info in "${USER_INFOS[@]}" +do + for host_port in "${HOSTS_PORTS[@]}" + do + for database in "${DATABASES[@]}" + do + testConnectionWithUserName + done + done +done + +# Specific user and password +TEST_INDEX=500 +TEST_USER_NAME="test_user_02771_$$" +TEST_USER_EMAIL_NAME="test_user_02771_$$@some_mail.com" +TEST_USER_EMAIL_NAME_ENCODED="test_user_02771_$$%40some_mail.com" + +TEST_USER_PASSWORD="zyx%$&abc" +# %, $, & percent encoded +TEST_USER_PASSWORD_ENCODED="zyx%25%24%26abc" + +$CLICKHOUSE_CLIENT -q "CREATE USER '$TEST_USER_NAME'" +$CLICKHOUSE_CLIENT -q "CREATE USER '$TEST_USER_EMAIL_NAME' IDENTIFIED WITH plaintext_password BY '$TEST_USER_PASSWORD'" + +runClient "clickhouse://$TEST_USER_NAME@$CLICKHOUSE_HOST/$CLICKHOUSE_DATABASE" +runClient "clickhouse://$TEST_USER_EMAIL_NAME_ENCODED:$TEST_USER_PASSWORD_ENCODED@$CLICKHOUSE_HOST/$CLICKHOUSE_DATABASE" + +$CLICKHOUSE_CLIENT -q "DROP USER '$TEST_USER_NAME'" +$CLICKHOUSE_CLIENT -q "DROP USER '$TEST_USER_EMAIL_NAME'" + +# Percent-encoded database in non-ascii symbols +UTF8_DATABASE="БазаДанных_$$" +UTF8_DATABASE_PERCENT_ENCODED="%D0%91%D0%B0%D0%B7%D0%B0%D0%94%D0%B0%D0%BD%D0%BD%D1%8B%D1%85_$$" +$CLICKHOUSE_CLIENT -q "CREATE DATABASE IF NOT EXISTS \`$UTF8_DATABASE\`" +runClient "clickhouse://default@$CLICKHOUSE_HOST/$UTF8_DATABASE_PERCENT_ENCODED" +$CLICKHOUSE_CLIENT -q "DROP DATABASE IF EXISTS \`$UTF8_DATABASE\`" + +# clickhouse-client extra options cases +TEST_INDEX=1000 + +runClient "clickhouse://$CLICKHOUSE_HOST/" --user 'default' +runClient "clickhouse://$CLICKHOUSE_HOST/default" --user 'default' +runClient "clickhouse:" --database "$CLICKHOUSE_DATABASE" + +# User 'default' and default host +runClient "clickhouse://default@" + +# Invalid URI cases +TEST_INDEX=10000 +runClient "clickhouse://default:@$CLICKHOUSE_HOST/" --user 'default' 2>&1 | grep -o 'Bad arguments' +runClient "clickhouse://default:pswrd@$CLICKHOUSE_HOST/" --user 'default' 2>&1 | grep -o 'Bad arguments' +runClient "clickhouse://default:pswrd@$CLICKHOUSE_HOST/" --password 'pswrd' 2>&1 | grep -o 'Bad arguments' +runClient "clickhouse:///$CLICKHOUSE_DATABASE" --database "$CLICKHOUSE_DATABASE" 2>&1 | grep -o 'Bad arguments' +runClient "clickhouse://$CLICKHOUSE_HOST/$CLICKHOUSE_DATABASE" --database "$CLICKHOUSE_DATABASE" 2>&1 | grep -o 'Bad arguments' +runClient "clickhouse://$CLICKHOUSE_HOST/$CLICKHOUSE_DATABASE?s" --database "$CLICKHOUSE_DATABASE" 2>&1 | grep -o 'Bad arguments' +runClient "clickhouse:/$CLICKHOUSE_DATABASE?s" --database "$CLICKHOUSE_DATABASE" 2>&1 | grep -o 'Bad arguments' + +runClient "http://" 2>&1 | grep -o 'BAD_ARGUMENTS' +runClient "click_house:" 2>&1 | grep -o 'BAD_ARGUMENTS' + +TEST_INDEX=1000087 +# Using connection string prohibits to use --host and --port options +runClient "clickhouse://default:@$CLICKHOUSE_HOST/" --host "$CLICKHOUSE_HOST" 2>&1 | grep -o 'BAD_ARGUMENTS' +runClient "clickhouse://default:@$CLICKHOUSE_HOST/" --host "$CLICKHOUSE_HOST" 2>&1 | grep -o 'BAD_ARGUMENTS' +runClient "clickhouse://default:@$CLICKHOUSE_HOST:$CLICKHOUSE_PORT_TCP/" --host "$CLICKHOUSE_HOST" 2>&1 | grep -o 'BAD_ARGUMENTS' +runClient "clickhouse://default:@$CLICKHOUSE_HOST:$CLICKHOUSE_PORT_TCP/" --port "$CLICKHOUSE_PORT_TCP" 2>&1 | grep -o 'BAD_ARGUMENTS' +runClient "clickhouse://default:@$CLICKHOUSE_HOST:$CLICKHOUSE_PORT_TCP/" --host "$CLICKHOUSE_HOST" 2>&1 | grep -o 'BAD_ARGUMENTS' +runClient "clickhouse://default:@$CLICKHOUSE_HOST:$CLICKHOUSE_PORT_TCP/" --host "$CLICKHOUSE_HOST" --host "$CLICKHOUSE_HOST" 2>&1 | grep -o 'BAD_ARGUMENTS' +runClient "clickhouse://default:@$CLICKHOUSE_HOST:$CLICKHOUSE_PORT_TCP/" --port "$CLICKHOUSE_PORT_TCP" 2>&1 | grep -o 'BAD_ARGUMENTS' +runClient "clickhouse://default:@$CLICKHOUSE_HOST/" --port "$CLICKHOUSE_PORT_TCP" 2>&1 | grep -o 'BAD_ARGUMENTS' +runClient "clickhouse://$CLICKHOUSE_HOST/" --port "$CLICKHOUSE_PORT_TCP" 2>&1 | grep -o 'BAD_ARGUMENTS' +runClient "clickhouse://:@$CLICKHOUSE_HOST/" --port "$CLICKHOUSE_PORT_TCP" 2>&1 | grep -o 'BAD_ARGUMENTS' +runClient "clickhouse://$CLICKHOUSE_HOST/" --port "$CLICKHOUSE_PORT_TCP" 2>&1 | grep -o 'BAD_ARGUMENTS' +runClient "clickhouse:" --port "$CLICKHOUSE_PORT_TCP" --host "$CLICKHOUSE_HOST" 2>&1 | grep -o 'BAD_ARGUMENTS' +runClient "clickhouse://" --port "$CLICKHOUSE_PORT_TCP" --host "$CLICKHOUSE_HOST" 2>&1 | grep -o 'BAD_ARGUMENTS' +runClient "clickhouse:///" --port "$CLICKHOUSE_PORT_TCP" --host "$CLICKHOUSE_HOST" 2>&1 | grep -o 'BAD_ARGUMENTS' +runClient "clickhouse:///?" --port "$CLICKHOUSE_PORT_TCP" --host "$CLICKHOUSE_HOST" 2>&1 | grep -o 'BAD_ARGUMENTS' +runClient "clickhouse://:/?" --port "$CLICKHOUSE_PORT_TCP" --host "$CLICKHOUSE_HOST" 2>&1 | grep -o 'BAD_ARGUMENTS' +runClient "clickhouse:" --database "$CLICKHOUSE_DATABASE" --port "$CLICKHOUSE_PORT_TCP" --host "$CLICKHOUSE_HOST" 2>&1 | grep -o 'BAD_ARGUMENTS' + +# Space is used in connection string (This is prohibited). +runClient " clickhouse:" 2>&1 | grep -o 'BAD_ARGUMENTS' +runClient "clickhouse: " 2>&1 | grep -o 'BAD_ARGUMENTS' +runClient "clickhouse://host1 /" 2>&1 | grep -o 'BAD_ARGUMENTS' +runClient "clickhouse://host1, host2/" 2>&1 | grep -o 'BAD_ARGUMENTS' +runClient "clickhouse://host1 ,host2/" 2>&1 | grep -o 'BAD_ARGUMENTS' +runClient "clickhouse://host1 host2/" 2>&1 | grep -o 'BAD_ARGUMENTS' +runClient "clickhouse://host1/ database:" 2>&1 | grep -o 'BAD_ARGUMENTS' +runClient "clickhouse://user :password@host1" 2>&1 | grep -o 'BAD_ARGUMENTS' +runClient "clickhouse://user: password@host1" 2>&1 | grep -o 'BAD_ARGUMENTS' + +# Query is not first argument +runClient --multiline "clickhouse://default:@$CLICKHOUSE_HOST/" 2>&1 | grep -o 'BAD_ARGUMENTS' +# Query used as the first and the second argument of client +runClient "clickhouse://default:@$CLICKHOUSE_HOST/" "clickhouse://default:@$CLICKHOUSE_HOST/" 2>&1 | grep -o 'BAD_ARGUMENTS' + +# Invalid hosts +runClient "clickhouse://host1,,," 2>&1 | grep -o 'BAD_ARGUMENTS' +runClient "clickhouse://," 2>&1 | grep -o 'BAD_ARGUMENTS' + +# Invalid parameters +runClient "clickhouse:?invalid_parameter" 2>&1 | grep -o 'BAD_ARGUMENTS' +runClient "clickhouse:?invalid_parameter&secure" 2>&1 | grep -o 'BAD_ARGUMENTS' +runClient "clickhouse:?s&invalid_parameter" 2>&1 | grep -o 'BAD_ARGUMENTS' +runClient "clickhouse:?s&invalid_parameter=val" 2>&1 | grep -o 'BAD_ARGUMENTS' +runClient "clickhouse:?invalid_parameter=arg" 2>&1 | grep -o 'BAD_ARGUMENTS' +runClient "clickhouse:?invalid_parameter=arg&s" 2>&1 | grep -o 'BAD_ARGUMENTS' +# Several users prohibited +runClient "clickhouse://user1@localhost,default@localhost/" 2>&1 | grep -o 'BAD_ARGUMENTS' +# Using '@' in user name is prohibited. User name should be percent-encoded. +runClient "clickhouse://my_mail@email.com@host/" 2>&1 | grep -o 'BAD_ARGUMENTS' + +# Wrong input cases +TEST_INDEX=100000 +# Invalid user name +runClient "clickhouse://non_exist_user@$CLICKHOUSE_HOST:$CLICKHOUSE_PORT_TCP/" 2>&1 | grep -o 'Authentication failed' +# Invalid password +runClient "clickhouse://default:invalid_password@$CLICKHOUSE_HOST:$CLICKHOUSE_PORT_TCP/" 2>&1 | grep -o 'Authentication failed' From 17754bf6941aa0754db2bb2de5c7098f890c2898 Mon Sep 17 00:00:00 2001 From: Alexey Gerasimchuck Date: Wed, 7 Jun 2023 05:59:13 +0000 Subject: [PATCH 0593/1072] minor changes in documentation --- docs/en/interfaces/cli.md | 4 ++-- docs/ru/interfaces/cli.md | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/en/interfaces/cli.md b/docs/en/interfaces/cli.md index 5255657ddfd..94f1fbf9e41 100644 --- a/docs/en/interfaces/cli.md +++ b/docs/en/interfaces/cli.md @@ -168,7 +168,7 @@ clickhouse://[user_info@][hosts_and_ports][/dbname][?query_parameters] where user_info is: ```user[:password]``` and hosts_and_ports is a list of values: ```[host][:port],[host][:port]``` Port is not mandatory. -and query_parameters is a list of parameter[=value]: ```param_name[=value]¶m_name[=value]...``` value may not be required for some of parameters. +and query_parameters is a list of parameter[=value]: ```param_name[=value]¶m_name[=value]...``` value may not be required for some of parameters. Parameter names are case sensitive. Allowed query_parameters keys: @@ -198,7 +198,7 @@ If host is not specified, the default host will be used (localhost). If port is not specified, the default port will be used (9000). If database is not specified, the default database will be used. -User, password, and database can be specified in the connection string either in --user command line option. +User, password, and database can be specified in the connection string either in --user, --password, --database command line options. The connection string must be specified in the first argument of clickhouse-client. The connection string can be combined with other [command-line-options](#command-line-options) except **--host(h)** and **--port**. diff --git a/docs/ru/interfaces/cli.md b/docs/ru/interfaces/cli.md index 06642800cc6..30cd9757ebb 100644 --- a/docs/ru/interfaces/cli.md +++ b/docs/ru/interfaces/cli.md @@ -120,7 +120,8 @@ clickhouse://[user_info@][hosts_and_ports][/dbname][?query_parameters] где user_info - это: ```user[:password]``` hosts_and_ports - это список значений: ```[host][:port],[host][:port]```. Port может быть не задан. -query_parameters - это список пар ключ[=значение]: ```param_name[=value]¶m_name[=value]...```. Значение может быть пустым +query_parameters - это список пар ключ[=значение]: ```param_name[=value]¶m_name[=value]...```. Значение может быть пустым. +Имена параметров чувствительны к регистру. Допустимые ключи query_parameters: From 4a0ccc25d21e9f333057a421bd8009d648df17ae Mon Sep 17 00:00:00 2001 From: Alexey Gerasimchuck Date: Wed, 7 Jun 2023 06:15:10 +0000 Subject: [PATCH 0594/1072] Minor improvement --- src/Client/ConnectionString.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Client/ConnectionString.cpp b/src/Client/ConnectionString.cpp index a8b87726a65..7d76deb6238 100644 --- a/src/Client/ConnectionString.cpp +++ b/src/Client/ConnectionString.cpp @@ -61,7 +61,7 @@ void getHostAndPort( const char * connection_string_end) { // User info does not matter in sub URI - std::string uri_string = {CONNECTION_URI_SCHEME.begin(), CONNECTION_URI_SCHEME.end()}; + auto uri_string = std::string(CONNECTION_URI_SCHEME); if (host_begin != nullptr && host_begin != host_end) { uri_string.append("//"); From aaa4d0367e9d51cd5308a7d5a02fd8333e9e7bb1 Mon Sep 17 00:00:00 2001 From: Alexey Gerasimchuck Date: Wed, 7 Jun 2023 06:29:14 +0000 Subject: [PATCH 0595/1072] Minor improvement for connection string --- src/Client/ConnectionString.cpp | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/src/Client/ConnectionString.cpp b/src/Client/ConnectionString.cpp index 7d76deb6238..aeb1c1dca02 100644 --- a/src/Client/ConnectionString.cpp +++ b/src/Client/ConnectionString.cpp @@ -55,21 +55,19 @@ void getHostAndPort(const Poco::URI & uri, std::vector> void getHostAndPort( Poco::URI & uri, std::vector> & hosts_and_ports_arguments, - const char * host_begin, - const char * host_end, - const char * right_part_start, - const char * connection_string_end) + std::string_view host_and_port, + std::string_view right_part) { // User info does not matter in sub URI auto uri_string = std::string(CONNECTION_URI_SCHEME); - if (host_begin != nullptr && host_begin != host_end) + if (!host_and_port.empty()) { uri_string.append("//"); - uri_string.append(host_begin, host_end); + uri_string.append(host_and_port); } // Right part from string includes '/database?[params]' - uri_string.append(right_part_start, connection_string_end); + uri_string.append(right_part); try { uri = Poco::URI(uri_string); @@ -147,7 +145,7 @@ bool tryParseConnectionString( { if (*it == ',') { - getHostAndPort(uri, hosts_and_ports_arguments, last_host_begin, it, hosts_end, connection_string.end()); + getHostAndPort(uri, hosts_and_ports_arguments, {last_host_begin, it}, {hosts_end, connection_string.end()}); last_host_begin = it + 1; } } @@ -159,7 +157,7 @@ bool tryParseConnectionString( getHostAndPort(uri, hosts_and_ports_arguments); } else - getHostAndPort(uri, hosts_and_ports_arguments, last_host_begin, hosts_end, hosts_end, connection_string.end()); + getHostAndPort(uri, hosts_and_ports_arguments, {last_host_begin, hosts_end}, {hosts_end, connection_string.end()}); Poco::URI::QueryParameters params = uri.getQueryParameters(); for (const auto & param : params) From 4050b637f16554421423d92c501d9790deb42394 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Wed, 7 Jun 2023 09:01:20 +0000 Subject: [PATCH 0596/1072] ALTER TABLE ADD INDEX: Add default GRANULARITY argument for secondary indexes - Related to #45451, which provides a default GRANULARITY when the skipping index is created in CREATE TABLE. --- docs/en/sql-reference/statements/alter/skipping-index.md | 2 +- src/Parsers/ParserCreateIndexQuery.cpp | 9 ++++++--- src/Parsers/ParserCreateQuery.cpp | 2 +- .../0_stateless/02534_default_granularity.reference | 1 + tests/queries/0_stateless/02534_default_granularity.sql | 7 ++++++- 5 files changed, 15 insertions(+), 6 deletions(-) diff --git a/docs/en/sql-reference/statements/alter/skipping-index.md b/docs/en/sql-reference/statements/alter/skipping-index.md index 67af76986da..4194731d33a 100644 --- a/docs/en/sql-reference/statements/alter/skipping-index.md +++ b/docs/en/sql-reference/statements/alter/skipping-index.md @@ -10,7 +10,7 @@ sidebar_label: INDEX The following operations are available: -- `ALTER TABLE [db].table_name [ON CLUSTER cluster] ADD INDEX name expression TYPE type GRANULARITY value [FIRST|AFTER name]` - Adds index description to tables metadata. +- `ALTER TABLE [db].table_name [ON CLUSTER cluster] ADD INDEX name expression TYPE type [GRANULARITY value] [FIRST|AFTER name]` - Adds index description to tables metadata. - `ALTER TABLE [db].table_name [ON CLUSTER cluster] DROP INDEX name` - Removes index description from tables metadata and deletes index files from disk. Implemented as a [mutation](/docs/en/sql-reference/statements/alter/index.md#mutations). diff --git a/src/Parsers/ParserCreateIndexQuery.cpp b/src/Parsers/ParserCreateIndexQuery.cpp index ab31d3f9b7a..e878b347e62 100644 --- a/src/Parsers/ParserCreateIndexQuery.cpp +++ b/src/Parsers/ParserCreateIndexQuery.cpp @@ -36,17 +36,20 @@ bool ParserCreateIndexDeclaration::parseImpl(Pos & pos, ASTPtr & node, Expected if (!data_type_p.parse(pos, type, expected)) return false; - if (!s_granularity.ignore(pos, expected)) - return false; + if (s_granularity.ignore(pos, expected)) + { + if (!granularity_p.parse(pos, granularity, expected)) + return false; + } if (!granularity_p.parse(pos, granularity, expected)) return false; auto index = std::make_shared(); index->part_of_create_index_query = true; - index->granularity = granularity->as().value.safeGet(); index->set(index->expr, expr); index->set(index->type, type); + index->granularity = granularity ? granularity->as().value.safeGet() : 1; node = index; return true; diff --git a/src/Parsers/ParserCreateQuery.cpp b/src/Parsers/ParserCreateQuery.cpp index 8cbfac91465..f975e8ba3c8 100644 --- a/src/Parsers/ParserCreateQuery.cpp +++ b/src/Parsers/ParserCreateQuery.cpp @@ -139,9 +139,9 @@ bool ParserIndexDeclaration::parseImpl(Pos & pos, ASTPtr & node, Expected & expe auto index = std::make_shared(); index->name = name->as().name(); - index->granularity = granularity ? granularity->as().value.safeGet() : 1; index->set(index->expr, expr); index->set(index->type, type); + index->granularity = granularity ? granularity->as().value.safeGet() : 1; node = index; return true; diff --git a/tests/queries/0_stateless/02534_default_granularity.reference b/tests/queries/0_stateless/02534_default_granularity.reference index e60036653c9..0fe7fe0a1b3 100644 --- a/tests/queries/0_stateless/02534_default_granularity.reference +++ b/tests/queries/0_stateless/02534_default_granularity.reference @@ -1 +1,2 @@ CREATE TABLE default.users_02534\n(\n `id` Int16,\n `name` String,\n INDEX bf_idx name TYPE minmax GRANULARITY 1\n)\nENGINE = MergeTree\nORDER BY id\nSETTINGS index_granularity = 8192 +CREATE TABLE default.users_02534\n(\n `id` Int16,\n `name` String,\n INDEX bf_idx name TYPE minmax GRANULARITY 1\n)\nENGINE = MergeTree\nORDER BY id\nSETTINGS index_granularity = 8192 diff --git a/tests/queries/0_stateless/02534_default_granularity.sql b/tests/queries/0_stateless/02534_default_granularity.sql index 781df3ce934..e3de5fce7c8 100644 --- a/tests/queries/0_stateless/02534_default_granularity.sql +++ b/tests/queries/0_stateless/02534_default_granularity.sql @@ -1,4 +1,9 @@ DROP TABLE IF EXISTS users_02534; CREATE TABLE users_02534 (id Int16, name String, INDEX bf_idx(name) TYPE minmax) ENGINE=MergeTree ORDER BY id; SHOW CREATE TABLE users_02534; -DROP TABLE users_02534; \ No newline at end of file +DROP TABLE users_02534; + +CREATE TABLE users_02534 (id Int16, name String) ENGINE=MergeTree ORDER BY id; +ALTER TABLE users_02534 ADD INDEX bf_idx(name) TYPE minmax; +SHOW CREATE TABLE users_02534; +DROP TABLE users_02534; From c795eb03299f751dd4a0c69facb2d5a6bec101da Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Wed, 7 Jun 2023 09:46:10 +0000 Subject: [PATCH 0597/1072] Temporarily disable a test --- tests/queries/0_stateless/02354_annoy_index.sql | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/queries/0_stateless/02354_annoy_index.sql b/tests/queries/0_stateless/02354_annoy_index.sql index 170c048d420..0168fa04c6f 100644 --- a/tests/queries/0_stateless/02354_annoy_index.sql +++ b/tests/queries/0_stateless/02354_annoy_index.sql @@ -20,11 +20,12 @@ FROM tab ORDER BY L2Distance(embedding, [0.0, 0.0, 10.0]) LIMIT 3; -SELECT 'Reference ARRAYs with non-matching dimension are rejected'; -SELECT * -FROM tab -ORDER BY L2Distance(embedding, [0.0, 0.0]) -LIMIT 3; -- { serverError INCORRECT_QUERY } +-- Produces different error code with analyzer, TODO: check +-- SELECT 'Reference ARRAYs with non-matching dimension are rejected'; +-- SELECT * +-- FROM tab +-- ORDER BY L2Distance(embedding, [0.0, 0.0]) +-- LIMIT 3; -- { serverError INCORRECT_QUERY } SELECT 'WHERE type, L2Distance, check that index is used'; EXPLAIN indexes=1 From 7c800468349e4aea2c125459f44b581d14391a10 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Wed, 7 Jun 2023 09:47:54 +0000 Subject: [PATCH 0598/1072] Revert "Remove clang-tidy exclude" This reverts commit 42c054789561920adf7ce4770968ba303a70f244. --- src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp index ffed9e01df0..1a28f28f746 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp @@ -123,6 +123,7 @@ MergeTreeIndexAggregatorAnnoy::MergeTreeIndexAggregatorAnnoy( template MergeTreeIndexGranulePtr MergeTreeIndexAggregatorAnnoy::getGranuleAndReset() { + // NOLINTNEXTLINE(*) index->build(static_cast(trees), /*number_of_threads=*/1); auto granule = std::make_shared>(index_name, index_sample_block, index); index = nullptr; From e0bc695e2d95085e1927b44fc2ad5a9d3384c1d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A1nos=20Benjamin=20Antal?= Date: Wed, 7 Jun 2023 10:07:35 +0000 Subject: [PATCH 0599/1072] Use correct link format --- docs/en/sql-reference/aggregate-functions/index.md | 4 ++-- .../sql-reference/aggregate-functions/reference/argmax.md | 6 +++--- .../sql-reference/aggregate-functions/reference/argmin.md | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/en/sql-reference/aggregate-functions/index.md b/docs/en/sql-reference/aggregate-functions/index.md index ea270e83a3c..5d2229fbcce 100644 --- a/docs/en/sql-reference/aggregate-functions/index.md +++ b/docs/en/sql-reference/aggregate-functions/index.md @@ -4,7 +4,7 @@ sidebar_label: Aggregate Functions sidebar_position: 33 --- -# Aggregate Functions +# Aggregate Functions Aggregate functions work in the [normal](http://www.sql-tutorial.com/sql-aggregate-functions-sql-tutorial) way as expected by database experts. @@ -73,7 +73,7 @@ FROM t_null_big └────────────────────┴─────────────────────┘ ``` -Also you can use [Tuple](../data-types/tuple.md) to work around NULL skipping behavior. The a `Tuple` that contains only a `NULL` value is not `NULL`, so the aggregate functions won't skip that row because of that `NULL` value. +Also you can use [Tuple](/docs/en/sql-reference/data-types/tuple.md) to work around NULL skipping behavior. The a `Tuple` that contains only a `NULL` value is not `NULL`, so the aggregate functions won't skip that row because of that `NULL` value. ```sql SELECT diff --git a/docs/en/sql-reference/aggregate-functions/reference/argmax.md b/docs/en/sql-reference/aggregate-functions/reference/argmax.md index 93e1fac6d67..8f10318838b 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/argmax.md +++ b/docs/en/sql-reference/aggregate-functions/reference/argmax.md @@ -5,8 +5,8 @@ sidebar_position: 106 # argMax -Calculates the `arg` value for a maximum `val` value. If there are several different values of `arg` for maximum values of `val`, returns the first of these values encountered. -Both parts the `arg` and the `max` behave as [aggregate functions](../index.md), they both [skip `Null`](../index.md#null-processing) during processing and return not `Null` values if not `Null` values are available. +Calculates the `arg` value for a maximum `val` value. If there are several different values of `arg` for maximum values of `val`, returns the first of these values encountered. +Both parts the `arg` and the `max` behave as [aggregate functions](/docs/en/sql-reference/aggregate-functions/index.md), they both [skip `Null`](/docs/en/sql-reference/aggregate-functions/index.md#null-processing) during processing and return not `Null` values if not `Null` values are available. **Syntax** @@ -106,4 +106,4 @@ SELECT argMax(a, tuple(b)) FROM test; **See also** -- [Tuple](../../data-types/tuple.md) +- [Tuple](/docs/en/sql-reference/data-types/tuple.md) diff --git a/docs/en/sql-reference/aggregate-functions/reference/argmin.md b/docs/en/sql-reference/aggregate-functions/reference/argmin.md index 4e549e5b04c..47d4ab398de 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/argmin.md +++ b/docs/en/sql-reference/aggregate-functions/reference/argmin.md @@ -6,7 +6,7 @@ sidebar_position: 105 # argMin Calculates the `arg` value for a minimum `val` value. If there are several different values of `arg` for minimum values of `val`, returns the first of these values encountered. -Both parts the `arg` and the `min` behave as [aggregate functions](../index.md), they both [skip `Null`](../index.md#null-processing) during processing and return not `Null` values if not `Null` values are available. +Both parts the `arg` and the `min` behave as [aggregate functions](/docs/en/sql-reference/aggregate-functions/index.md), they both [skip `Null`](/docs/en/sql-reference/aggregate-functions/index.md#null-processing) during processing and return not `Null` values if not `Null` values are available. **Syntax** @@ -111,4 +111,4 @@ select argMin(a, tuple(b)) from test; **See also** -- [Tuple](../../data-types/tuple.md) +- [Tuple](/docs/en/sql-reference/data-types/tuple.md) From 35ef14482d785226a660c62fea558fdb91a1d26f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A1nos=20Benjamin=20Antal?= Date: Wed, 7 Jun 2023 10:11:49 +0000 Subject: [PATCH 0600/1072] Fix keyword capitalization --- docs/en/sql-reference/aggregate-functions/reference/argmin.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/en/sql-reference/aggregate-functions/reference/argmin.md b/docs/en/sql-reference/aggregate-functions/reference/argmin.md index 47d4ab398de..fdfce0833e0 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/argmin.md +++ b/docs/en/sql-reference/aggregate-functions/reference/argmin.md @@ -103,9 +103,9 @@ SELECT argMin((a, b), (b, a)), min(tuple(b, a)) FROM test; │ (NULL,NULL) │ (NULL,NULL) │ -- argMin returns (NULL,NULL) here because `Tuple` allows to don't skip `NULL` and min(tuple(b, a)) in this case is minimal value for this dataset └──────────────────────────────────┴──────────────────┘ -select argMin(a, tuple(b)) from test; +SELECT argMin(a, tuple(b)) FROM test; ┌─argMax(a, tuple(b))─┐ -│ d │ -- `Tuple` can be used in `min` to not skip rows with `NULL` values as b. +│ d │ -- `Tuple` can be used in `min` to not skip rows with `NULL` values as b. └─────────────────────┘ ``` From 71ae54f089f51c396616842743e8ba0a7f38bc59 Mon Sep 17 00:00:00 2001 From: Alexander Sapin Date: Wed, 7 Jun 2023 13:34:01 +0200 Subject: [PATCH 0601/1072] Fix args --- .../ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp index 07173e65448..20c60cfe8f5 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp @@ -52,7 +52,6 @@ public: options.Prefix = path_prefix; options.PageSizeHint = static_cast(max_list_size); - LOG_DEBUG(&Poco::Logger::get("DEBUG"), "ITER PREFIX {}", path_prefix); } private: @@ -63,7 +62,6 @@ private: auto blob_list_response = client->ListBlobs(options); auto blobs_list = blob_list_response.Blobs; - LOG_DEBUG(&Poco::Logger::get("DEBUG"), "BLOB LIST SIZE {}", blobs_list.size()); for (const auto & blob : blobs_list) { batch.emplace_back( @@ -77,13 +75,9 @@ private: } if (!blob_list_response.NextPageToken.HasValue() || blob_list_response.NextPageToken.Value().empty()) - { - LOG_DEBUG(&Poco::Logger::get("DEBUG"), "RETURN FALSE {}", blobs_list.size()); return false; - } options.ContinuationToken = blob_list_response.NextPageToken; - LOG_DEBUG(&Poco::Logger::get("DEBUG"), "RETURN TRUE {}", blobs_list.size()); return true; } @@ -222,6 +216,7 @@ std::unique_ptr AzureObjectStorage::readObjects( /// NOL settings_ptr->max_single_read_retries, settings_ptr->max_single_download_retries, /* use_external_buffer */true, + /* restricted_seek */true, read_until_position); }; From 036ddcd47baf88ab0c360efe647e01060d1ce636 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Wed, 7 Jun 2023 13:48:08 +0200 Subject: [PATCH 0602/1072] Fix excessive memory usage for FINAL (due to too much streams usage) (#50429) Previously it could create MergeTreeInOrder for each mark, however this could be very suboptimal, due to each MergeTreeInOrder has some memory overhead. Now, by collapsing all marks for one part together it is more memory effiecient. I've tried the query from the altinity wiki [1] and it decreases memory usage twice: SELECT * FROM repl_tbl FINAL WHERE key IN (SELECT toUInt32(number) FROM numbers(1000000) WHERE number % 50000 = 0) FORMAT Null - upstream: MemoryTracker: Peak memory usage (for query): 520.27 MiB. - patched: MemoryTracker: Peak memory usage (for query): 260.95 MiB. [1]: https://kb.altinity.com/engines/mergetree-table-engine-family/replacingmergetree/#multiple-keys And it could be not 2x and even more or less, it depends on the gaps in marks for reading (for example in my setup the memory usage increased a lot, from ~16GiB of RAM to >64GiB due to lots of marks and gaps). Signed-off-by: Azat Khuzhin --- src/Processors/QueryPlan/PartsSplitter.cpp | 35 ++++++++++----- ...inal_streams_data_skipping_index.reference | 43 +++++++++++++++++++ ...2780_final_streams_data_skipping_index.sql | 28 ++++++++++++ 3 files changed, 95 insertions(+), 11 deletions(-) create mode 100644 tests/queries/0_stateless/02780_final_streams_data_skipping_index.reference create mode 100644 tests/queries/0_stateless/02780_final_streams_data_skipping_index.sql diff --git a/src/Processors/QueryPlan/PartsSplitter.cpp b/src/Processors/QueryPlan/PartsSplitter.cpp index 936182f8c00..9796e696f6c 100644 --- a/src/Processors/QueryPlan/PartsSplitter.cpp +++ b/src/Processors/QueryPlan/PartsSplitter.cpp @@ -126,7 +126,9 @@ std::pair, std::vector> split(RangesInDat return marks_in_current_layer < intersected_parts * 2; }; - result_layers.emplace_back(); + auto & current_layer = result_layers.emplace_back(); + /// Map part_idx into index inside layer, used to merge marks from the same part into one reader + std::unordered_map part_idx_in_layer; while (rows_in_current_layer < rows_per_layer || layers_intersection_is_too_big() || result_layers.size() == max_layers) { @@ -140,11 +142,16 @@ std::pair, std::vector> split(RangesInDat if (current.event == PartsRangesIterator::EventType::RangeEnd) { - result_layers.back().emplace_back( - parts[part_idx].data_part, - parts[part_idx].alter_conversions, - parts[part_idx].part_index_in_query, - MarkRanges{{current_part_range_begin[part_idx], current.range.end}}); + const auto & mark = MarkRange{current_part_range_begin[part_idx], current.range.end}; + auto it = part_idx_in_layer.emplace(std::make_pair(part_idx, current_layer.size())); + if (it.second) + current_layer.emplace_back( + parts[part_idx].data_part, + parts[part_idx].alter_conversions, + parts[part_idx].part_index_in_query, + MarkRanges{mark}); + else + current_layer[it.first->second].ranges.push_back(mark); current_part_range_begin.erase(part_idx); current_part_range_end.erase(part_idx); @@ -170,11 +177,17 @@ std::pair, std::vector> split(RangesInDat } for (const auto & [part_idx, last_mark] : current_part_range_end) { - result_layers.back().emplace_back( - parts[part_idx].data_part, - parts[part_idx].alter_conversions, - parts[part_idx].part_index_in_query, - MarkRanges{{current_part_range_begin[part_idx], last_mark + 1}}); + const auto & mark = MarkRange{current_part_range_begin[part_idx], last_mark + 1}; + auto it = part_idx_in_layer.emplace(std::make_pair(part_idx, current_layer.size())); + + if (it.second) + result_layers.back().emplace_back( + parts[part_idx].data_part, + parts[part_idx].alter_conversions, + parts[part_idx].part_index_in_query, + MarkRanges{mark}); + else + current_layer[it.first->second].ranges.push_back(mark); current_part_range_begin[part_idx] = current_part_range_end[part_idx]; } diff --git a/tests/queries/0_stateless/02780_final_streams_data_skipping_index.reference b/tests/queries/0_stateless/02780_final_streams_data_skipping_index.reference new file mode 100644 index 00000000000..d7a540ae479 --- /dev/null +++ b/tests/queries/0_stateless/02780_final_streams_data_skipping_index.reference @@ -0,0 +1,43 @@ +-- { echoOn } +EXPLAIN PIPELINE SELECT * FROM data FINAL WHERE v1 >= now() - INTERVAL 180 DAY +SETTINGS max_threads=2, max_final_threads=2, force_data_skipping_indices='v1_index', use_skip_indexes_if_final=1 +FORMAT LineAsString; +(Expression) +ExpressionTransform × 2 + (Filter) + FilterTransform × 2 + (ReadFromMergeTree) + ExpressionTransform × 2 + AggregatingSortedTransform 2 → 1 + ExpressionTransform × 2 + FilterSortedStreamByRange × 2 + Description: filter values in [(999424), +inf) + ExpressionTransform × 2 + MergeTreeInOrder × 2 0 → 1 + AggregatingSortedTransform + ExpressionTransform + FilterSortedStreamByRange + Description: filter values in [-inf, (999424)) + ExpressionTransform + MergeTreeInOrder 0 → 1 +EXPLAIN PIPELINE SELECT * FROM data FINAL WHERE v1 >= now() - INTERVAL 180 DAY +SETTINGS max_threads=2, max_final_threads=2, force_data_skipping_indices='v1_index', use_skip_indexes_if_final=0 +FORMAT LineAsString; +(Expression) +ExpressionTransform × 2 + (Filter) + FilterTransform × 2 + (ReadFromMergeTree) + ExpressionTransform × 2 + AggregatingSortedTransform 2 → 1 + ExpressionTransform × 2 + FilterSortedStreamByRange × 2 + Description: filter values in [(999424), +inf) + ExpressionTransform × 2 + MergeTreeInOrder × 2 0 → 1 + AggregatingSortedTransform + ExpressionTransform + FilterSortedStreamByRange + Description: filter values in [-inf, (999424)) + ExpressionTransform + MergeTreeInOrder 0 → 1 diff --git a/tests/queries/0_stateless/02780_final_streams_data_skipping_index.sql b/tests/queries/0_stateless/02780_final_streams_data_skipping_index.sql new file mode 100644 index 00000000000..7de7a58e2e1 --- /dev/null +++ b/tests/queries/0_stateless/02780_final_streams_data_skipping_index.sql @@ -0,0 +1,28 @@ +-- Tags: no-random-merge-tree-settings, no-random-settings + +DROP TABLE IF EXISTS data; + +CREATE TABLE data +( + key Int, + v1 DateTime, + INDEX v1_index v1 TYPE minmax GRANULARITY 1 +) ENGINE=AggregatingMergeTree() +ORDER BY key +SETTINGS index_granularity=8192, min_bytes_for_wide_part=0, min_rows_for_wide_part=0; + +SYSTEM STOP MERGES data; + +-- generate 50% of marks that cannot be skipped with v1_index +-- this will create a gap in marks +INSERT INTO data SELECT number, if(number/8192 % 2 == 0, now(), now() - INTERVAL 200 DAY) FROM numbers(1e6); +INSERT INTO data SELECT number+1e6, if(number/8192 % 2 == 0, now(), now() - INTERVAL 200 DAY) FROM numbers(1e6); + +-- { echoOn } +EXPLAIN PIPELINE SELECT * FROM data FINAL WHERE v1 >= now() - INTERVAL 180 DAY +SETTINGS max_threads=2, max_final_threads=2, force_data_skipping_indices='v1_index', use_skip_indexes_if_final=1 +FORMAT LineAsString; + +EXPLAIN PIPELINE SELECT * FROM data FINAL WHERE v1 >= now() - INTERVAL 180 DAY +SETTINGS max_threads=2, max_final_threads=2, force_data_skipping_indices='v1_index', use_skip_indexes_if_final=0 +FORMAT LineAsString; From 81cd3defd79331fc0af016c4c40a957be15a227a Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Wed, 7 Jun 2023 12:29:09 +0000 Subject: [PATCH 0603/1072] Fix expected results --- tests/queries/0_stateless/02354_annoy_index.reference | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/queries/0_stateless/02354_annoy_index.reference b/tests/queries/0_stateless/02354_annoy_index.reference index 5bd1377d6f4..45515bc7733 100644 --- a/tests/queries/0_stateless/02354_annoy_index.reference +++ b/tests/queries/0_stateless/02354_annoy_index.reference @@ -9,7 +9,6 @@ ORDER BY type, L2Distance 1 [0,0,10] 5 [0,0,10.2] 4 [0,0,9.7] -Reference ARRAYs with non-matching dimension are rejected WHERE type, L2Distance, check that index is used Expression ((Projection + Before ORDER BY)) Limit (preliminary LIMIT (without OFFSET)) From 87ac6b8b637e9931c32fffb8a273101295c161ba Mon Sep 17 00:00:00 2001 From: avogar Date: Wed, 7 Jun 2023 12:49:28 +0000 Subject: [PATCH 0604/1072] Fix reading negative decimals in avro format --- .../Formats/Impl/AvroRowInputFormat.cpp | 10 ++++++++-- .../0_stateless/02782_avro_decimals.reference | 12 ++++++++++++ .../queries/0_stateless/data_avro/decimals.avro | Bin 295 -> 353 bytes 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/src/Processors/Formats/Impl/AvroRowInputFormat.cpp b/src/Processors/Formats/Impl/AvroRowInputFormat.cpp index 60e541a0109..c8e4a499b81 100644 --- a/src/Processors/Formats/Impl/AvroRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/AvroRowInputFormat.cpp @@ -184,8 +184,14 @@ static AvroDeserializer::DeserializeFn createDecimalDeserializeFn(const avro::No field_type_size, tmp.size()); else if (tmp.size() != field_type_size) - /// Add padding with 0-bytes. - tmp = std::string(field_type_size - tmp.size(), '\0') + tmp; + { + /// Extent value to required size by adding paddinf. + /// Check if value is negative or positive. + if (tmp[0] & 128) + tmp = std::string(field_type_size - tmp.size(), 0xff) + tmp; + else + tmp = std::string(field_type_size - tmp.size(), 0) + tmp; + } typename DecimalType::FieldType field; ReadBufferFromString buf(tmp); diff --git a/tests/queries/0_stateless/02782_avro_decimals.reference b/tests/queries/0_stateless/02782_avro_decimals.reference index ed46f1c3758..a16e5e4ac56 100644 --- a/tests/queries/0_stateless/02782_avro_decimals.reference +++ b/tests/queries/0_stateless/02782_avro_decimals.reference @@ -11,3 +11,15 @@ d Decimal(14, 4) 12345678.1234 123456789.1234 1234567890.1234 +0 +-1 +-1.1 +-12.12 +-123.123 +-1234.1234 +-12345.1234 +-123456.1234 +-1234567.1234 +-12345678.1234 +-123456789.1234 +-1234567890.1234 diff --git a/tests/queries/0_stateless/data_avro/decimals.avro b/tests/queries/0_stateless/data_avro/decimals.avro index 5c29ac235d59003696735c8c8092eed5bcce41b2..630d6864047e9638da9dc1437780c41bd820ca12 100644 GIT binary patch delta 103 zcmZ3^^pI)78DSNbwU@s*w@gW!@aK?AzgoA!6UK@6EIpYRSZ;h^xyr%zPi+F*TeBvP w|IQouIDRbgn8&d#OI(lZuYwBWKCZ`(=`4Oc{~q+Qe7nc<$6Jnr7T_unL4xsVKrqKWq{1<*wS0^Syk From cf947e6e01036d51ef0337378b12e868a07ecce2 Mon Sep 17 00:00:00 2001 From: avogar Date: Wed, 7 Jun 2023 12:50:16 +0000 Subject: [PATCH 0605/1072] Fix typo --- src/Processors/Formats/Impl/AvroRowInputFormat.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Processors/Formats/Impl/AvroRowInputFormat.cpp b/src/Processors/Formats/Impl/AvroRowInputFormat.cpp index c8e4a499b81..1ec7491658e 100644 --- a/src/Processors/Formats/Impl/AvroRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/AvroRowInputFormat.cpp @@ -185,7 +185,7 @@ static AvroDeserializer::DeserializeFn createDecimalDeserializeFn(const avro::No tmp.size()); else if (tmp.size() != field_type_size) { - /// Extent value to required size by adding paddinf. + /// Extent value to required size by adding padding. /// Check if value is negative or positive. if (tmp[0] & 128) tmp = std::string(field_type_size - tmp.size(), 0xff) + tmp; From db1c03d6db270a4a6b059d4b7f09c5d264f13081 Mon Sep 17 00:00:00 2001 From: vdimir Date: Fri, 2 Jun 2023 15:54:39 +0000 Subject: [PATCH 0606/1072] Cleanup moving parts --- src/Storages/MergeTree/MergeTreeData.cpp | 26 +++++++++----- src/Storages/MergeTree/MergeTreeData.h | 1 + .../MergeTree/MergeTreePartsMover.cpp | 36 ++++++++++++++----- src/Storages/MergeTree/MergeTreePartsMover.h | 10 +++++- 4 files changed, 55 insertions(+), 18 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index e806e1bb93f..047f063cb7c 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -1998,8 +1998,18 @@ static bool isOldPartDirectory(const DiskPtr & disk, const String & directory_pa return true; } - size_t MergeTreeData::clearOldTemporaryDirectories(size_t custom_directories_lifetime_seconds, const NameSet & valid_prefixes) +{ + size_t cleared_count = 0; + + cleared_count += clearOldTemporaryDirectories(relative_data_path, custom_directories_lifetime_seconds, valid_prefixes); + + /// Clear _all_ parts from the `moving` directory + cleared_count += clearOldTemporaryDirectories(fs::path(relative_data_path) / "moving", custom_directories_lifetime_seconds, {""}); + return cleared_count; +} + +size_t MergeTreeData::clearOldTemporaryDirectories(const String & root_path, size_t custom_directories_lifetime_seconds, const NameSet & valid_prefixes) { /// If the method is already called from another thread, then we don't need to do anything. std::unique_lock lock(clear_old_temporary_directories_mutex, std::defer_lock); @@ -2018,7 +2028,7 @@ size_t MergeTreeData::clearOldTemporaryDirectories(size_t custom_directories_lif if (disk->isBroken()) continue; - for (auto it = disk->iterateDirectory(relative_data_path); it->isValid(); it->next()) + for (auto it = disk->iterateDirectory(root_path); it->isValid(); it->next()) { const std::string & basename = it->name(); bool start_with_valid_prefix = false; @@ -7802,7 +7812,7 @@ MovePartsOutcome MergeTreeData::moveParts(const CurrentlyMovingPartsTaggerPtr & for (const auto & moving_part : moving_tagger->parts_to_move) { Stopwatch stopwatch; - MutableDataPartPtr cloned_part; + MergeTreePartsMover::TemporaryClonedPart cloned_part; ProfileEventsScope profile_events_scope; auto write_part_log = [&](const ExecutionStatus & execution_status) @@ -7812,7 +7822,7 @@ MovePartsOutcome MergeTreeData::moveParts(const CurrentlyMovingPartsTaggerPtr & execution_status, stopwatch.elapsed(), moving_part.part->name, - cloned_part, + cloned_part.part, {moving_part.part}, nullptr, profile_events_scope.getSnapshot()); @@ -7854,7 +7864,7 @@ MovePartsOutcome MergeTreeData::moveParts(const CurrentlyMovingPartsTaggerPtr & if (lock->isLocked()) { cloned_part = parts_mover.clonePart(moving_part); - parts_mover.swapClonedPart(cloned_part); + parts_mover.swapClonedPart(cloned_part.part); break; } else if (wait_for_move_if_zero_copy) @@ -7881,15 +7891,15 @@ MovePartsOutcome MergeTreeData::moveParts(const CurrentlyMovingPartsTaggerPtr & else /// Ordinary move as it should be { cloned_part = parts_mover.clonePart(moving_part); - parts_mover.swapClonedPart(cloned_part); + parts_mover.swapClonedPart(cloned_part.part); } write_part_log({}); } catch (...) { write_part_log(ExecutionStatus::fromCurrentException("", true)); - if (cloned_part) - cloned_part->remove(); + if (cloned_part.part) + cloned_part.part->remove(); throw; } diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index 2f254f9a787..444bd8f47ac 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -676,6 +676,7 @@ public: /// Delete all directories which names begin with "tmp" /// Must be called with locked lockForShare() because it's using relative_data_path. size_t clearOldTemporaryDirectories(size_t custom_directories_lifetime_seconds, const NameSet & valid_prefixes = {"tmp_", "tmp-fetch_"}); + size_t clearOldTemporaryDirectories(const String & root_path, size_t custom_directories_lifetime_seconds, const NameSet & valid_prefixes); size_t clearEmptyParts(); diff --git a/src/Storages/MergeTree/MergeTreePartsMover.cpp b/src/Storages/MergeTree/MergeTreePartsMover.cpp index e1da57744b3..08815fa1f0c 100644 --- a/src/Storages/MergeTree/MergeTreePartsMover.cpp +++ b/src/Storages/MergeTree/MergeTreePartsMover.cpp @@ -11,6 +11,7 @@ namespace DB namespace ErrorCodes { extern const int ABORTED; + extern const int DIRECTORY_ALREADY_EXISTS; } namespace @@ -203,7 +204,7 @@ bool MergeTreePartsMover::selectPartsForMove( return false; } -MergeTreeMutableDataPartPtr MergeTreePartsMover::clonePart(const MergeTreeMoveEntry & moving_part) const +MergeTreePartsMover::TemporaryClonedPart MergeTreePartsMover::clonePart(const MergeTreeMoveEntry & moving_part) const { if (moves_blocker.isCancelled()) throw Exception(ErrorCodes::ABORTED, "Cancelled moving parts."); @@ -222,8 +223,10 @@ MergeTreeMutableDataPartPtr MergeTreePartsMover::clonePart(const MergeTreeMoveEn String relative_path = part->getDataPartStorage().getPartDirectory(); if (disk->exists(path_to_clone + relative_path)) { - LOG_WARNING(log, "Path {} already exists. Will remove it and clone again.", fullPath(disk, path_to_clone + relative_path)); - disk->removeRecursive(fs::path(path_to_clone) / relative_path / ""); + throw Exception(ErrorCodes::DIRECTORY_ALREADY_EXISTS, + "Cannot clone part {} from '{}' to '{}': path '{}' already exists", + part->name, part->getDataPartStorage().getDiskName(), disk->getName(), + fullPath(disk, path_to_clone + relative_path)); } disk->createDirectories(path_to_clone); @@ -240,14 +243,22 @@ MergeTreeMutableDataPartPtr MergeTreePartsMover::clonePart(const MergeTreeMoveEn { cloned_part_storage = part->makeCloneOnDisk(disk, MergeTreeData::MOVING_DIR_NAME); } + String data_part_directory = cloned_part_storage->getFullPath(); + + TemporaryClonedPart cloned_part; + cloned_part.temporary_directory_lock = data->getTemporaryPartDirectoryHolder(data_part_directory); MergeTreeDataPartBuilder builder(*data, part->name, cloned_part_storage); - auto cloned_part = std::move(builder).withPartFormatFromDisk().build(); - LOG_TRACE(log, "Part {} was cloned to {}", part->name, cloned_part->getDataPartStorage().getFullPath()); + cloned_part.part = std::move(builder).withPartFormatFromDisk().build(); - cloned_part->loadColumnsChecksumsIndexes(true, true); - cloned_part->loadVersionMetadata(); - cloned_part->modification_time = cloned_part->getDataPartStorage().getLastModified().epochTime(); + String part_directory = cloned_part.part->getDataPartStorage().getFullPath(); + + LOG_TRACE(log, "Part {} was cloned to {}", part->name, data_part_directory); + + cloned_part.part->loadColumnsChecksumsIndexes(true, true); + cloned_part.part->loadVersionMetadata(); + cloned_part.part->modification_time = cloned_part.part->getDataPartStorage().getLastModified().epochTime(); + cloned_part.part->is_temp = true; return cloned_part; } @@ -262,10 +273,17 @@ void MergeTreePartsMover::swapClonedPart(const MergeTreeMutableDataPartPtr & clo /// It's ok, because we don't block moving parts for merges or mutations if (!active_part || active_part->name != cloned_part->name) { - LOG_INFO(log, "Failed to swap {}. Active part doesn't exist. Possible it was merged or mutated. Will remove copy on path '{}'.", cloned_part->name, cloned_part->getDataPartStorage().getFullPath()); + LOG_INFO(log, + "Failed to swap {}. Active part doesn't exist (containing part {}). " + "Possible it was merged or mutated. Will remove copy on path '{}'", + cloned_part->name, + active_part ? active_part->name : "doesn't exist", + cloned_part->getDataPartStorage().getFullPath()); return; } + cloned_part->is_temp = false; + /// Don't remove new directory but throw an error because it may contain part which is currently in use. cloned_part->renameTo(active_part->name, false); diff --git a/src/Storages/MergeTree/MergeTreePartsMover.h b/src/Storages/MergeTree/MergeTreePartsMover.h index 1cee98bcba9..dde2ff1a630 100644 --- a/src/Storages/MergeTree/MergeTreePartsMover.h +++ b/src/Storages/MergeTree/MergeTreePartsMover.h @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -43,12 +44,19 @@ private: using AllowedMovingPredicate = std::function &, String * reason)>; public: + explicit MergeTreePartsMover(MergeTreeData * data_) : data(data_) , log(&Poco::Logger::get("MergeTreePartsMover")) { } + struct TemporaryClonedPart + { + MergeTreeMutableDataPartPtr part; + scope_guard temporary_directory_lock; + }; + /// Select parts for background moves according to storage_policy configuration. /// Returns true if at least one part was selected for move. bool selectPartsForMove( @@ -57,7 +65,7 @@ public: const std::lock_guard & moving_parts_lock); /// Copies part to selected reservation in detached folder. Throws exception if part already exists. - MergeTreeMutableDataPartPtr clonePart(const MergeTreeMoveEntry & moving_part) const; + TemporaryClonedPart clonePart(const MergeTreeMoveEntry & moving_part) const; /// Replaces cloned part from detached directory into active data parts set. /// Replacing part changes state to DeleteOnDestroy and will be removed from disk after destructor of From 19bb802b04cb9c31585b24f6ebe4765f0dc46c87 Mon Sep 17 00:00:00 2001 From: vdimir Date: Fri, 2 Jun 2023 17:41:16 +0000 Subject: [PATCH 0607/1072] Set temporary_directories_lifetime to integration tests with MOVE --- .../test_consistant_parts_after_move_partition/test.py | 2 +- tests/integration/test_encrypted_disk/test.py | 2 +- tests/integration/test_merge_tree_azure_blob_storage/test.py | 1 + tests/integration/test_merge_tree_hdfs/test.py | 3 ++- tests/integration/test_merge_tree_s3/test.py | 1 + .../test_move_partition_to_disk_on_cluster/test.py | 2 +- .../configs/config.d/storage_configuration.xml | 4 ++++ .../configs/config.d/storage_configuration.xml | 3 ++- .../test_replicated_merge_tree_hdfs_zero_copy/test.py | 2 +- tests/integration/test_s3_zero_copy_replication/test.py | 2 +- tests/integration/test_ttl_move/test.py | 2 +- tests/integration/test_zero_copy_fetch/test.py | 2 +- 12 files changed, 17 insertions(+), 9 deletions(-) diff --git a/tests/integration/test_consistant_parts_after_move_partition/test.py b/tests/integration/test_consistant_parts_after_move_partition/test.py index 63a51472773..0b19e194e0e 100644 --- a/tests/integration/test_consistant_parts_after_move_partition/test.py +++ b/tests/integration/test_consistant_parts_after_move_partition/test.py @@ -18,7 +18,7 @@ def initialize_database(nodes, shard): CREATE TABLE `{database}`.dest (p UInt64, d UInt64) ENGINE = ReplicatedMergeTree('/clickhouse/{database}/tables/test_consistent_shard2{shard}/replicated', '{replica}') ORDER BY d PARTITION BY p - SETTINGS min_replicated_logs_to_keep=3, max_replicated_logs_to_keep=5, cleanup_delay_period=0, cleanup_delay_period_random_add=0; + SETTINGS min_replicated_logs_to_keep=3, max_replicated_logs_to_keep=5, cleanup_delay_period=0, cleanup_delay_period_random_add=0, temporary_directories_lifetime=1; """.format( shard=shard, replica=node.name, database=CLICKHOUSE_DATABASE ) diff --git a/tests/integration/test_encrypted_disk/test.py b/tests/integration/test_encrypted_disk/test.py index 9f5415f4bea..fbf2b59785b 100644 --- a/tests/integration/test_encrypted_disk/test.py +++ b/tests/integration/test_encrypted_disk/test.py @@ -96,7 +96,7 @@ def test_part_move(policy, destination_disks): data String ) ENGINE=MergeTree() ORDER BY id - SETTINGS storage_policy='{}' + SETTINGS storage_policy='{}', temporary_directories_lifetime=1 """.format( policy ) diff --git a/tests/integration/test_merge_tree_azure_blob_storage/test.py b/tests/integration/test_merge_tree_azure_blob_storage/test.py index 8bf4df17c39..761b5257a34 100644 --- a/tests/integration/test_merge_tree_azure_blob_storage/test.py +++ b/tests/integration/test_merge_tree_azure_blob_storage/test.py @@ -66,6 +66,7 @@ def create_table(node, table_name, **additional_settings): "storage_policy": "blob_storage_policy", "old_parts_lifetime": 1, "index_granularity": 512, + "temporary_directories_lifetime": 1, } settings.update(additional_settings) diff --git a/tests/integration/test_merge_tree_hdfs/test.py b/tests/integration/test_merge_tree_hdfs/test.py index c79986c34f0..d1a145c00c1 100644 --- a/tests/integration/test_merge_tree_hdfs/test.py +++ b/tests/integration/test_merge_tree_hdfs/test.py @@ -29,7 +29,8 @@ def create_table(cluster, table_name, additional_settings=None): SETTINGS storage_policy='hdfs', old_parts_lifetime=0, - index_granularity=512 + index_granularity=512, + temporary_directories_lifetime=1 """.format( table_name ) diff --git a/tests/integration/test_merge_tree_s3/test.py b/tests/integration/test_merge_tree_s3/test.py index 2ccd517923a..3ab31f4728b 100644 --- a/tests/integration/test_merge_tree_s3/test.py +++ b/tests/integration/test_merge_tree_s3/test.py @@ -75,6 +75,7 @@ def create_table(node, table_name, **additional_settings): "storage_policy": "s3", "old_parts_lifetime": 0, "index_granularity": 512, + "temporary_directories_lifetime": 1, } settings.update(additional_settings) diff --git a/tests/integration/test_move_partition_to_disk_on_cluster/test.py b/tests/integration/test_move_partition_to_disk_on_cluster/test.py index 90753fc8ce3..c639e080cdf 100644 --- a/tests/integration/test_move_partition_to_disk_on_cluster/test.py +++ b/tests/integration/test_move_partition_to_disk_on_cluster/test.py @@ -46,7 +46,7 @@ def test_move_partition_to_disk_on_cluster(start_cluster): "(x UInt64) " "ENGINE=ReplicatedMergeTree('/clickhouse/tables/test_local_table', '{replica}') " "ORDER BY tuple()" - "SETTINGS storage_policy = 'jbod_with_external';", + "SETTINGS storage_policy = 'jbod_with_external', temporary_directories_lifetime=1;", ) node1.query("INSERT INTO test_local_table VALUES (0)") diff --git a/tests/integration/test_multiple_disks/configs/config.d/storage_configuration.xml b/tests/integration/test_multiple_disks/configs/config.d/storage_configuration.xml index ef40bfb0a0e..e7a87fb77b1 100644 --- a/tests/integration/test_multiple_disks/configs/config.d/storage_configuration.xml +++ b/tests/integration/test_multiple_disks/configs/config.d/storage_configuration.xml @@ -123,4 +123,8 @@
+ + 1 + + diff --git a/tests/integration/test_rename_column/configs/config.d/storage_configuration.xml b/tests/integration/test_rename_column/configs/config.d/storage_configuration.xml index da297e40037..12a598c64b5 100644 --- a/tests/integration/test_rename_column/configs/config.d/storage_configuration.xml +++ b/tests/integration/test_rename_column/configs/config.d/storage_configuration.xml @@ -24,9 +24,10 @@ - + 0 + 1 diff --git a/tests/integration/test_replicated_merge_tree_hdfs_zero_copy/test.py b/tests/integration/test_replicated_merge_tree_hdfs_zero_copy/test.py index bd1c890950a..eb3d62eb718 100644 --- a/tests/integration/test_replicated_merge_tree_hdfs_zero_copy/test.py +++ b/tests/integration/test_replicated_merge_tree_hdfs_zero_copy/test.py @@ -128,7 +128,7 @@ def test_hdfs_zero_copy_replication_single_move(cluster, storage_policy, init_ob CREATE TABLE single_node_move_test (dt DateTime, id Int64) ENGINE=ReplicatedMergeTree('/clickhouse/tables/{cluster}/{shard}/single_node_move_test', '{replica}') ORDER BY (dt, id) - SETTINGS storage_policy='$policy' + SETTINGS storage_policy='$policy',temporary_directories_lifetime=1 """ ).substitute(policy=storage_policy) ) diff --git a/tests/integration/test_s3_zero_copy_replication/test.py b/tests/integration/test_s3_zero_copy_replication/test.py index 100f062de2f..bc13c127610 100644 --- a/tests/integration/test_s3_zero_copy_replication/test.py +++ b/tests/integration/test_s3_zero_copy_replication/test.py @@ -163,7 +163,7 @@ def test_s3_zero_copy_on_hybrid_storage(started_cluster): CREATE TABLE hybrid_test ON CLUSTER test_cluster (id UInt32, value String) ENGINE=ReplicatedMergeTree('/clickhouse/tables/hybrid_test', '{}') ORDER BY id - SETTINGS storage_policy='hybrid' + SETTINGS storage_policy='hybrid',temporary_directories_lifetime=1 """.format( "{replica}" ) diff --git a/tests/integration/test_ttl_move/test.py b/tests/integration/test_ttl_move/test.py index 7635d784fef..a2f28e21666 100644 --- a/tests/integration/test_ttl_move/test.py +++ b/tests/integration/test_ttl_move/test.py @@ -1549,7 +1549,7 @@ def test_double_move_while_select(started_cluster, name, positive): ) ENGINE = MergeTree ORDER BY tuple() PARTITION BY n - SETTINGS storage_policy='small_jbod_with_external' + SETTINGS storage_policy='small_jbod_with_external',temporary_directories_lifetime=1 """.format( name=name ) diff --git a/tests/integration/test_zero_copy_fetch/test.py b/tests/integration/test_zero_copy_fetch/test.py index 9b9aa5e0da7..4f3d42096c3 100644 --- a/tests/integration/test_zero_copy_fetch/test.py +++ b/tests/integration/test_zero_copy_fetch/test.py @@ -45,7 +45,7 @@ CREATE TABLE test1 (EventDate Date, CounterID UInt32) ENGINE = ReplicatedMergeTree('/clickhouse-tables/test1', 'r1') PARTITION BY toMonday(EventDate) ORDER BY (CounterID, EventDate) -SETTINGS index_granularity = 8192, storage_policy = 's3'""" +SETTINGS index_granularity = 8192, storage_policy = 's3', temporary_directories_lifetime=1""" ) node1.query( From 09fecace434aaeb1c54049f94a855a2843766145 Mon Sep 17 00:00:00 2001 From: vdimir Date: Fri, 2 Jun 2023 17:43:08 +0000 Subject: [PATCH 0608/1072] upd --- src/Storages/MergeTree/MergeTreeData.cpp | 3 --- src/Storages/MergeTree/MergeTreePartsMover.cpp | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 047f063cb7c..0e542aa3407 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -7898,9 +7898,6 @@ MovePartsOutcome MergeTreeData::moveParts(const CurrentlyMovingPartsTaggerPtr & catch (...) { write_part_log(ExecutionStatus::fromCurrentException("", true)); - if (cloned_part.part) - cloned_part.part->remove(); - throw; } } diff --git a/src/Storages/MergeTree/MergeTreePartsMover.cpp b/src/Storages/MergeTree/MergeTreePartsMover.cpp index 08815fa1f0c..2c3b3d1a621 100644 --- a/src/Storages/MergeTree/MergeTreePartsMover.cpp +++ b/src/Storages/MergeTree/MergeTreePartsMover.cpp @@ -255,10 +255,10 @@ MergeTreePartsMover::TemporaryClonedPart MergeTreePartsMover::clonePart(const Me LOG_TRACE(log, "Part {} was cloned to {}", part->name, data_part_directory); + cloned_part.part->is_temp = true; cloned_part.part->loadColumnsChecksumsIndexes(true, true); cloned_part.part->loadVersionMetadata(); cloned_part.part->modification_time = cloned_part.part->getDataPartStorage().getLastModified().epochTime(); - cloned_part.part->is_temp = true; return cloned_part; } From 80f918d4b77cbad22aeb0371ac2f7881fe603550 Mon Sep 17 00:00:00 2001 From: vdimir Date: Mon, 5 Jun 2023 18:22:41 +0000 Subject: [PATCH 0609/1072] Fixes for cleanup moving parts --- src/Storages/MergeTree/IMergeTreeDataPart.cpp | 13 ++++++-- src/Storages/MergeTree/MergeTreeData.cpp | 4 +-- .../MergeTree/MergeTreePartsMover.cpp | 31 +++++++++---------- src/Storages/MergeTree/MergeTreePartsMover.h | 2 +- 4 files changed, 28 insertions(+), 22 deletions(-) diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index d27b03fff44..9084b5790af 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -492,13 +492,17 @@ void IMergeTreeDataPart::removeIfNeeded() if (is_temp) { - String file_name = fileName(getDataPartStorage().getPartDirectory()); + const auto & part_directory = getDataPartStorage().getPartDirectory(); + + String file_name = fileName(part_directory); if (file_name.empty()) throw Exception(ErrorCodes::LOGICAL_ERROR, "relative_path {} of part {} is invalid or not set", getDataPartStorage().getPartDirectory(), name); - if (!startsWith(file_name, "tmp") && !endsWith(file_name, ".tmp_proj")) + const auto part_parent_directory = directoryPath(part_directory); + bool is_moving_part = part_parent_directory.ends_with("moving/"); + if (!startsWith(file_name, "tmp") && !endsWith(file_name, ".tmp_proj") && !is_moving_part) { LOG_ERROR( storage.log, @@ -507,6 +511,11 @@ void IMergeTreeDataPart::removeIfNeeded() path); return; } + + if (is_moving_part) + { + LOG_TRACE(storage.log, "Removing unneeded moved part from {}", path); + } } remove(); diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 0e542aa3407..7fe3efaf6d5 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -7864,7 +7864,7 @@ MovePartsOutcome MergeTreeData::moveParts(const CurrentlyMovingPartsTaggerPtr & if (lock->isLocked()) { cloned_part = parts_mover.clonePart(moving_part); - parts_mover.swapClonedPart(cloned_part.part); + parts_mover.swapClonedPart(cloned_part); break; } else if (wait_for_move_if_zero_copy) @@ -7891,7 +7891,7 @@ MovePartsOutcome MergeTreeData::moveParts(const CurrentlyMovingPartsTaggerPtr & else /// Ordinary move as it should be { cloned_part = parts_mover.clonePart(moving_part); - parts_mover.swapClonedPart(cloned_part.part); + parts_mover.swapClonedPart(cloned_part); } write_part_log({}); } diff --git a/src/Storages/MergeTree/MergeTreePartsMover.cpp b/src/Storages/MergeTree/MergeTreePartsMover.cpp index 2c3b3d1a621..656167de986 100644 --- a/src/Storages/MergeTree/MergeTreePartsMover.cpp +++ b/src/Storages/MergeTree/MergeTreePartsMover.cpp @@ -213,6 +213,8 @@ MergeTreePartsMover::TemporaryClonedPart MergeTreePartsMover::clonePart(const Me auto part = moving_part.part; auto disk = moving_part.reserved_space->getDisk(); LOG_DEBUG(log, "Cloning part {} from '{}' to '{}'", part->name, part->getDataPartStorage().getDiskName(), disk->getName()); + TemporaryClonedPart cloned_part; + cloned_part.temporary_directory_lock = data->getTemporaryPartDirectoryHolder(part->name); MutableDataPartStoragePtr cloned_part_storage; if (disk->supportZeroCopyReplication() && settings->allow_remote_fs_zero_copy_replication) @@ -243,17 +245,10 @@ MergeTreePartsMover::TemporaryClonedPart MergeTreePartsMover::clonePart(const Me { cloned_part_storage = part->makeCloneOnDisk(disk, MergeTreeData::MOVING_DIR_NAME); } - String data_part_directory = cloned_part_storage->getFullPath(); - - TemporaryClonedPart cloned_part; - cloned_part.temporary_directory_lock = data->getTemporaryPartDirectoryHolder(data_part_directory); MergeTreeDataPartBuilder builder(*data, part->name, cloned_part_storage); cloned_part.part = std::move(builder).withPartFormatFromDisk().build(); - - String part_directory = cloned_part.part->getDataPartStorage().getFullPath(); - - LOG_TRACE(log, "Part {} was cloned to {}", part->name, data_part_directory); + LOG_TRACE(log, "Part {} was cloned to {}", part->name, cloned_part.part->getDataPartStorage().getFullPath()); cloned_part.part->is_temp = true; cloned_part.part->loadColumnsChecksumsIndexes(true, true); @@ -263,34 +258,36 @@ MergeTreePartsMover::TemporaryClonedPart MergeTreePartsMover::clonePart(const Me } -void MergeTreePartsMover::swapClonedPart(const MergeTreeMutableDataPartPtr & cloned_part) const +void MergeTreePartsMover::swapClonedPart(TemporaryClonedPart & cloned_part) const { if (moves_blocker.isCancelled()) throw Exception(ErrorCodes::ABORTED, "Cancelled moving parts."); - auto active_part = data->getActiveContainingPart(cloned_part->name); + auto active_part = data->getActiveContainingPart(cloned_part.part->name); /// It's ok, because we don't block moving parts for merges or mutations - if (!active_part || active_part->name != cloned_part->name) + if (!active_part || active_part->name != cloned_part.part->name) { LOG_INFO(log, "Failed to swap {}. Active part doesn't exist (containing part {}). " "Possible it was merged or mutated. Will remove copy on path '{}'", - cloned_part->name, + cloned_part.part->name, active_part ? active_part->name : "doesn't exist", - cloned_part->getDataPartStorage().getFullPath()); + cloned_part.part->getDataPartStorage().getFullPath()); return; } - cloned_part->is_temp = false; + cloned_part.part->is_temp = false; /// Don't remove new directory but throw an error because it may contain part which is currently in use. - cloned_part->renameTo(active_part->name, false); + cloned_part.part->renameTo(active_part->name, false); /// TODO what happen if server goes down here? - data->swapActivePart(cloned_part); + data->swapActivePart(cloned_part.part); - LOG_TRACE(log, "Part {} was moved to {}", cloned_part->name, cloned_part->getDataPartStorage().getFullPath()); + LOG_TRACE(log, "Part {} was moved to {}", cloned_part.part->name, cloned_part.part->getDataPartStorage().getFullPath()); + + cloned_part.temporary_directory_lock = {}; } } diff --git a/src/Storages/MergeTree/MergeTreePartsMover.h b/src/Storages/MergeTree/MergeTreePartsMover.h index dde2ff1a630..82fd271ee5f 100644 --- a/src/Storages/MergeTree/MergeTreePartsMover.h +++ b/src/Storages/MergeTree/MergeTreePartsMover.h @@ -72,7 +72,7 @@ public: /// IMergeTreeDataPart called. If replacing part doesn't exists or not active (committed) than /// cloned part will be removed and log message will be reported. It may happen in case of concurrent /// merge or mutation. - void swapClonedPart(const MergeTreeMutableDataPartPtr & cloned_parts) const; + void swapClonedPart(TemporaryClonedPart & cloned_part) const; /// Can stop background moves and moves from queries ActionBlocker moves_blocker; From b410a4d44ce6f1ac1048efb565f87bae0e97c183 Mon Sep 17 00:00:00 2001 From: vdimir Date: Mon, 5 Jun 2023 18:23:24 +0000 Subject: [PATCH 0610/1072] Add test test_alter_moving_garbage --- .../test_alter_moving_garbage/__init__.py | 0 .../configs/config.d/storage_conf.xml | 26 ++++++ .../configs/config.xml | 7 ++ .../test_alter_moving_garbage/test.py | 90 +++++++++++++++++++ 4 files changed, 123 insertions(+) create mode 100644 tests/integration/test_alter_moving_garbage/__init__.py create mode 100644 tests/integration/test_alter_moving_garbage/configs/config.d/storage_conf.xml create mode 100644 tests/integration/test_alter_moving_garbage/configs/config.xml create mode 100644 tests/integration/test_alter_moving_garbage/test.py diff --git a/tests/integration/test_alter_moving_garbage/__init__.py b/tests/integration/test_alter_moving_garbage/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_alter_moving_garbage/configs/config.d/storage_conf.xml b/tests/integration/test_alter_moving_garbage/configs/config.d/storage_conf.xml new file mode 100644 index 00000000000..659f59a41b2 --- /dev/null +++ b/tests/integration/test_alter_moving_garbage/configs/config.d/storage_conf.xml @@ -0,0 +1,26 @@ + + + + + + + s3 + http://minio1:9001/root/data/ + minio + minio123 + + + + + + + default + + + s3 + + + + + + diff --git a/tests/integration/test_alter_moving_garbage/configs/config.xml b/tests/integration/test_alter_moving_garbage/configs/config.xml new file mode 100644 index 00000000000..f4be5ab6b7c --- /dev/null +++ b/tests/integration/test_alter_moving_garbage/configs/config.xml @@ -0,0 +1,7 @@ + + 9000 + 127.0.0.1 + 500 + ./clickhouse/ + users.xml + diff --git a/tests/integration/test_alter_moving_garbage/test.py b/tests/integration/test_alter_moving_garbage/test.py new file mode 100644 index 00000000000..b369c9ad377 --- /dev/null +++ b/tests/integration/test_alter_moving_garbage/test.py @@ -0,0 +1,90 @@ +import logging +import time + +import pytest +import threading + +from helpers.client import QueryRuntimeException +from helpers.cluster import ClickHouseCluster + + +@pytest.fixture(scope="module") +def cluster(): + try: + cluster = ClickHouseCluster(__file__) + cluster.add_instance( + "node1", + main_configs=[ + "configs/config.d/storage_conf.xml", + ], + with_minio=True, + ) + logging.info("Starting cluster...") + cluster.start() + logging.info("Cluster started") + + yield cluster + finally: + cluster.shutdown() + + +def create_table(node, table_name, **additional_settings): + settings = { + "storage_policy": "two_disks", + "old_parts_lifetime": 1, + "index_granularity": 512, + "temporary_directories_lifetime": 0, + "merge_tree_clear_old_temporary_directories_interval_seconds": 1, + } + settings.update(additional_settings) + + create_table_statement = f""" + CREATE TABLE {table_name} ( + dt Date, + id Int64, + data String, + INDEX min_max (id) TYPE minmax GRANULARITY 3 + ) ENGINE=MergeTree() + PARTITION BY dt + ORDER BY (dt, id) + SETTINGS {",".join((k+"="+repr(v) for k, v in settings.items()))}""" + + node.query(create_table_statement) + + +def test_create_table(cluster): + node = cluster.instances["node1"] + create_table(node, "test_table") + node.query( + "INSERT INTO test_table SELECT toDate('2021-01-01') + INTERVAL number % 10 DAY, number, toString(sipHash64(number)) FROM numbers(100_000)" + ) + + stop_alter = False + + def alter(): + d = 0 + node.query(f"ALTER TABLE test_table ADD COLUMN col0 String") + while not stop_alter: + d = d + 1 + node.query(f"DELETE FROM test_table WHERE id < {d}") + time.sleep(0.1) + + alter_thread = threading.Thread(target=alter) + alter_thread.start() + + for i in range(1, 10): + partition = f"2021-01-{i:02d}" + try: + node.query( + f"ALTER TABLE test_table MOVE PARTITION '{partition}' TO DISK 's3'", + ) + except QueryRuntimeException as e: + # PART_IS_TEMPORARILY_LOCKED + assert 384 == e.returncode + continue + + # clear old temporary directories wakes up every 1 second + time.sleep(0.5) + + stop_alter = True + alter_thread.join() From ea1aa4bd9e312a36b578fff3ec3573ff0844d9d5 Mon Sep 17 00:00:00 2001 From: Han Fei Date: Wed, 7 Jun 2023 16:02:16 +0200 Subject: [PATCH 0611/1072] update comment --- src/Client/Suggest.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Client/Suggest.cpp b/src/Client/Suggest.cpp index e249aa1bb04..4e38add0ef5 100644 --- a/src/Client/Suggest.cpp +++ b/src/Client/Suggest.cpp @@ -101,7 +101,7 @@ static String getLoadSuggestionQuery(Int32 suggestion_limit, bool basic_suggesti add_column("name", "columns", true, suggestion_limit); } - /// FIXME: Forbid this query using new analyzer because of bug https://github.com/ClickHouse/ClickHouse/pull/50430#issuecomment-1576860893 + /// FIXME: Forbid this query using new analyzer because of bug https://github.com/ClickHouse/ClickHouse/issues/50669 /// We should remove this restriction after resolving this bug. query = "SELECT DISTINCT arrayJoin(extractAll(name, '[\\\\w_]{2,}')) AS res FROM (" + query + ") WHERE notEmpty(res) SETTINGS allow_experimental_analyzer=0"; return query; From 4fd64a28b20966246743ec6408a3018e8724249e Mon Sep 17 00:00:00 2001 From: Han Fei Date: Wed, 7 Jun 2023 16:07:18 +0200 Subject: [PATCH 0612/1072] and add more tests --- .../01763_filter_push_down_bugs.reference | 23 +++++++++++++++---- .../01763_filter_push_down_bugs.sql | 6 +++++ 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/tests/queries/0_stateless/01763_filter_push_down_bugs.reference b/tests/queries/0_stateless/01763_filter_push_down_bugs.reference index 7df35e2948d..db9cd7a2d16 100644 --- a/tests/queries/0_stateless/01763_filter_push_down_bugs.reference +++ b/tests/queries/0_stateless/01763_filter_push_down_bugs.reference @@ -9,13 +9,11 @@ String1_0 String2_0 String3_0 String4_0 1 Expression ((Projection + Before ORDER BY)) Filter (WHERE) Join (JOIN FillRightFirst) - Filter (( + Before JOIN)) + Expression (Before JOIN) ReadFromMergeTree (default.t1) Indexes: PrimaryKey - Keys: - id - Condition: (id in [101, 101]) + Condition: true Parts: 1/1 Granules: 1/1 Expression ((Joined actions + (Rename joined columns + (Projection + Before ORDER BY)))) @@ -25,3 +23,20 @@ Expression ((Projection + Before ORDER BY)) Condition: true Parts: 1/1 Granules: 1/1 +Expression ((Project names + Projection)) + Filter ((WHERE + DROP unused columns after JOIN)) + Join (JOIN FillRightFirst) + Expression (Change column names to column identifiers) + ReadFromMergeTree (default.t1) + Indexes: + PrimaryKey + Condition: true + Parts: 1/1 + Granules: 1/1 + Expression ((Change column names to column identifiers + (Project names + (Projection + Change column names to column identifiers)))) + ReadFromMergeTree (default.t2) + Indexes: + PrimaryKey + Condition: true + Parts: 1/1 + Granules: 1/1 diff --git a/tests/queries/0_stateless/01763_filter_push_down_bugs.sql b/tests/queries/0_stateless/01763_filter_push_down_bugs.sql index 5f7f4379714..9a5ef4727c5 100644 --- a/tests/queries/0_stateless/01763_filter_push_down_bugs.sql +++ b/tests/queries/0_stateless/01763_filter_push_down_bugs.sql @@ -58,5 +58,11 @@ EXPLAIN indexes=1 SELECT id, delete_time FROM t1 FROM t2 ) AS d WHERE create_time < delete_time AND id = 101 SETTINGS allow_experimental_analyzer=0; +EXPLAIN indexes=1 SELECT id, delete_time FROM t1 + CROSS JOIN ( + SELECT delete_time + FROM t2 +) AS d WHERE create_time < delete_time AND id = 101 SETTINGS allow_experimental_analyzer=1; + DROP TABLE IF EXISTS t1; DROP TABLE IF EXISTS t2; From 0dd75d7648d3ba12b9593b312ce428e1b12799f8 Mon Sep 17 00:00:00 2001 From: Victor Krasnov Date: Wed, 7 Jun 2023 17:50:20 +0300 Subject: [PATCH 0613/1072] Add 02783_parseDateTimeBestEffort_syslog test --- ...3_parseDateTimeBestEffort_syslog.reference | 20 +++++ .../02783_parseDateTimeBestEffort_syslog.sql | 83 +++++++++++++++++++ 2 files changed, 103 insertions(+) create mode 100644 tests/queries/0_stateless/02783_parseDateTimeBestEffort_syslog.reference create mode 100644 tests/queries/0_stateless/02783_parseDateTimeBestEffort_syslog.sql diff --git a/tests/queries/0_stateless/02783_parseDateTimeBestEffort_syslog.reference b/tests/queries/0_stateless/02783_parseDateTimeBestEffort_syslog.reference new file mode 100644 index 00000000000..7409b413260 --- /dev/null +++ b/tests/queries/0_stateless/02783_parseDateTimeBestEffort_syslog.reference @@ -0,0 +1,20 @@ +parseDateTimeBestEffort + dt_ref res res_sam res_auc res_null res_null_sam res_null_auc res_zero res_zero_sam res_zero_auc + + Jun 7 04:55:00 2023-06-07 04:55:00 2022-06-07 04:55:00 2023-06-07 04:55:00 2023-06-07 04:55:00 2022-06-07 04:55:00 2023-06-07 04:55:00 2023-06-07 04:55:00 2022-06-07 04:55:00 2023-06-07 04:55:00 + Jun 7 04:56:00 2022-06-07 04:56:00 2022-06-07 04:56:00 2023-06-07 04:56:00 2022-06-07 04:56:00 2022-06-07 04:56:00 2023-06-07 04:56:00 2022-06-07 04:56:00 2022-06-07 04:56:00 2023-06-07 04:56:00 +parseDateTimeBestEffortUS + dt_ref res res_sam res_auc res_null res_null_sam res_null_auc res_zero res_zero_sam res_zero_auc + + Jun 7 04:55:00 2023-06-07 04:55:00 2022-06-07 04:55:00 2023-06-07 04:55:00 2023-06-07 04:55:00 2022-06-07 04:55:00 2023-06-07 04:55:00 2023-06-07 04:55:00 2022-06-07 04:55:00 2023-06-07 04:55:00 + Jun 7 04:56:00 2022-06-07 04:56:00 2022-06-07 04:56:00 2023-06-07 04:56:00 2022-06-07 04:56:00 2022-06-07 04:56:00 2023-06-07 04:56:00 2022-06-07 04:56:00 2022-06-07 04:56:00 2023-06-07 04:56:00 +parseDateTime64BestEffort + dt_ref res res_sam res_auc res_null res_null_sam res_null_auc res_zero res_zero_sam res_zero_auc + + Jun 7 04:55:00 2023-06-07 04:55:00.000 2022-06-07 04:55:00.000 2023-06-07 04:55:00.000 2023-06-07 04:55:00.000 2022-06-07 04:55:00.000 2023-06-07 04:55:00.000 2023-06-07 04:55:00.000 2022-06-07 04:55:00.000 2023-06-07 04:55:00.000 + Jun 7 04:56:00 2022-06-07 04:56:00.000 2022-06-07 04:56:00.000 2023-06-07 04:56:00.000 2022-06-07 04:56:00.000 2022-06-07 04:56:00.000 2023-06-07 04:56:00.000 2022-06-07 04:56:00.000 2022-06-07 04:56:00.000 2023-06-07 04:56:00.000 +parseDateTime64BestEffortUS + dt_ref res res_sam res_auc res_null res_null_sam res_null_auc res_zero res_zero_sam res_zero_auc + + Jun 7 04:55:00 2023-06-07 04:55:00.000 2022-06-07 04:55:00.000 2023-06-07 04:55:00.000 2023-06-07 04:55:00.000 2022-06-07 04:55:00.000 2023-06-07 04:55:00.000 2023-06-07 04:55:00.000 2022-06-07 04:55:00.000 2023-06-07 04:55:00.000 + Jun 7 04:56:00 2022-06-07 04:56:00.000 2022-06-07 04:56:00.000 2023-06-07 04:56:00.000 2022-06-07 04:56:00.000 2022-06-07 04:56:00.000 2023-06-07 04:56:00.000 2022-06-07 04:56:00.000 2022-06-07 04:56:00.000 2023-06-07 04:56:00.000 diff --git a/tests/queries/0_stateless/02783_parseDateTimeBestEffort_syslog.sql b/tests/queries/0_stateless/02783_parseDateTimeBestEffort_syslog.sql new file mode 100644 index 00000000000..91ae230205b --- /dev/null +++ b/tests/queries/0_stateless/02783_parseDateTimeBestEffort_syslog.sql @@ -0,0 +1,83 @@ +SELECT 'parseDateTimeBestEffort'; + +WITH + now() AS ts_now, + '2023-06-07 04:55:30' AS ref_point, + dateDiff('second', toDateTime(ref_point), ts_now) AS impedimenta, + formatDateTime(ts_around, '%b %e %T') AS dt_curr +SELECT + formatDateTime(ts_around - impedimenta, '%b %e %H:%i:%s') AS dt_ref, + parseDateTimeBestEffort(dt_curr) - impedimenta AS res, + parseDateTimeBestEffort(dt_curr, 'US/Samoa') - impedimenta AS res_sam, + parseDateTimeBestEffort(dt_curr, 'Pacific/Auckland') - impedimenta AS res_auc, + parseDateTimeBestEffortOrNull(dt_curr) - impedimenta AS res_null, + parseDateTimeBestEffortOrNull(dt_curr, 'US/Samoa') - impedimenta AS res_null_sam, + parseDateTimeBestEffortOrNull(dt_curr, 'Pacific/Auckland') - impedimenta AS res_null_auc, + parseDateTimeBestEffortOrZero(dt_curr) - impedimenta AS res_zero, + parseDateTimeBestEffortOrZero(dt_curr, 'US/Samoa') - impedimenta AS res_zero_sam, + parseDateTimeBestEffortOrZero(dt_curr, 'Pacific/Auckland') - impedimenta AS res_zero_auc +FROM (SELECT arrayJoin([ts_now - 30, ts_now + 30]) AS ts_around) +FORMAT PrettySpaceNoEscapes; + +SELECT 'parseDateTimeBestEffortUS'; + +WITH + now() AS ts_now, + '2023-06-07 04:55:30' AS ref_point, + dateDiff('second', toDateTime(ref_point), ts_now) AS impedimenta, + formatDateTime(ts_around, '%b %e %T') AS dt_curr +SELECT + formatDateTime(ts_around - impedimenta, '%b %e %H:%i:%s') AS dt_ref, + parseDateTimeBestEffortUS(dt_curr) - impedimenta AS res, + parseDateTimeBestEffortUS(dt_curr, 'US/Samoa') - impedimenta AS res_sam, + parseDateTimeBestEffortUS(dt_curr, 'Pacific/Auckland') - impedimenta AS res_auc, + parseDateTimeBestEffortUSOrNull(dt_curr) - impedimenta AS res_null, + parseDateTimeBestEffortUSOrNull(dt_curr, 'US/Samoa') - impedimenta AS res_null_sam, + parseDateTimeBestEffortUSOrNull(dt_curr, 'Pacific/Auckland') - impedimenta AS res_null_auc, + parseDateTimeBestEffortUSOrZero(dt_curr) - impedimenta AS res_zero, + parseDateTimeBestEffortUSOrZero(dt_curr, 'US/Samoa') - impedimenta AS res_zero_sam, + parseDateTimeBestEffortUSOrZero(dt_curr, 'Pacific/Auckland') - impedimenta AS res_zero_auc +FROM (SELECT arrayJoin([ts_now - 30, ts_now + 30]) AS ts_around) +FORMAT PrettySpaceNoEscapes; + +SELECT 'parseDateTime64BestEffort'; + +WITH + now() AS ts_now, + '2023-06-07 04:55:30' AS ref_point, + dateDiff('second', toDateTime(ref_point), ts_now) AS impedimenta, + formatDateTime(ts_around, '%b %e %T') AS dt_curr +SELECT + formatDateTime(ts_around - impedimenta, '%b %e %H:%i:%s') AS dt_ref, + parseDateTime64BestEffort(dt_curr) - impedimenta AS res, + parseDateTime64BestEffort(dt_curr, 3, 'US/Samoa') - impedimenta AS res_sam, + parseDateTime64BestEffort(dt_curr, 3, 'Pacific/Auckland') - impedimenta AS res_auc, + parseDateTime64BestEffortOrNull(dt_curr) - impedimenta AS res_null, + parseDateTime64BestEffortOrNull(dt_curr, 3, 'US/Samoa') - impedimenta AS res_null_sam, + parseDateTime64BestEffortOrNull(dt_curr, 3, 'Pacific/Auckland') - impedimenta AS res_null_auc, + parseDateTime64BestEffortOrZero(dt_curr) - impedimenta AS res_zero, + parseDateTime64BestEffortOrZero(dt_curr, 3, 'US/Samoa') - impedimenta AS res_zero_sam, + parseDateTime64BestEffortOrZero(dt_curr, 3, 'Pacific/Auckland') - impedimenta AS res_zero_auc +FROM (SELECT arrayJoin([ts_now - 30, ts_now + 30]) AS ts_around) +FORMAT PrettySpaceNoEscapes; + +SELECT 'parseDateTime64BestEffortUS'; + +WITH + now() AS ts_now, + '2023-06-07 04:55:30' AS ref_point, + dateDiff('second', toDateTime(ref_point), ts_now) AS impedimenta, + formatDateTime(ts_around, '%b %e %T') AS dt_curr +SELECT + formatDateTime(ts_around - impedimenta, '%b %e %H:%i:%s') AS dt_ref, + parseDateTime64BestEffortUS(dt_curr) - impedimenta AS res, + parseDateTime64BestEffortUS(dt_curr, 3, 'US/Samoa') - impedimenta AS res_sam, + parseDateTime64BestEffortUS(dt_curr, 3, 'Pacific/Auckland') - impedimenta AS res_auc, + parseDateTime64BestEffortUSOrNull(dt_curr) - impedimenta AS res_null, + parseDateTime64BestEffortUSOrNull(dt_curr, 3, 'US/Samoa') - impedimenta AS res_null_sam, + parseDateTime64BestEffortUSOrNull(dt_curr, 3, 'Pacific/Auckland') - impedimenta AS res_null_auc, + parseDateTime64BestEffortUSOrZero(dt_curr) - impedimenta AS res_zero, + parseDateTime64BestEffortUSOrZero(dt_curr, 3, 'US/Samoa') - impedimenta AS res_zero_sam, + parseDateTime64BestEffortUSOrZero(dt_curr, 3, 'Pacific/Auckland') - impedimenta AS res_zero_auc +FROM (SELECT arrayJoin([ts_now - 30, ts_now + 30]) AS ts_around) +FORMAT PrettySpaceNoEscapes; From bf6900f64ca7614a686dfaa56f87c84c43408506 Mon Sep 17 00:00:00 2001 From: Alexander Gololobov <440544+davenger@users.noreply.github.com> Date: Wed, 7 Jun 2023 17:08:18 +0200 Subject: [PATCH 0614/1072] Write 1 part and do not use OPTIMIZE FINAL --- ...e_row_level_policy_lightweight_delete.sql.j2 | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/tests/queries/0_stateless/02461_prewhere_row_level_policy_lightweight_delete.sql.j2 b/tests/queries/0_stateless/02461_prewhere_row_level_policy_lightweight_delete.sql.j2 index 1e4258cef7e..0ec6b2ed144 100644 --- a/tests/queries/0_stateless/02461_prewhere_row_level_policy_lightweight_delete.sql.j2 +++ b/tests/queries/0_stateless/02461_prewhere_row_level_policy_lightweight_delete.sql.j2 @@ -8,25 +8,10 @@ ORDER BY (SiteId, DateVisit) SETTINGS index_granularity = {{ index_granularity }}, min_bytes_for_wide_part = 0; -- Insert some data to have 110K rows in the range 2022-08-10 .. 2022-08-20 and some more rows before and after that range -insert into url_na_log select 209, '2022-08-09' from numbers(10000); -insert into url_na_log select 209, '2022-08-10' from numbers(10000); -insert into url_na_log select 209, '2022-08-11' from numbers(10000); -insert into url_na_log select 209, '2022-08-12' from numbers(10000); -insert into url_na_log select 209, '2022-08-13' from numbers(10000); -insert into url_na_log select 209, '2022-08-14' from numbers(10000); -insert into url_na_log select 209, '2022-08-15' from numbers(10000); -insert into url_na_log select 209, '2022-08-16' from numbers(10000); -insert into url_na_log select 209, '2022-08-17' from numbers(10000); -insert into url_na_log select 209, '2022-08-18' from numbers(10000); -insert into url_na_log select 209, '2022-08-19' from numbers(10000); -insert into url_na_log select 209, '2022-08-20' from numbers(10000); -insert into url_na_log select 209, '2022-08-21' from numbers(10000); - +insert into url_na_log select 209, ('2022-08-09'::Date + INTERVAL intDiv(number,10000) DAY) from numbers(130000) SETTINGS max_insert_block_size=200000; SET mutations_sync=2; -OPTIMIZE TABLE url_na_log FINAL; - -- { echoOn } SELECT count() FROM url_na_log; From a67dd6e47947295bd853ee1d26ad66a94861dabe Mon Sep 17 00:00:00 2001 From: Alexander Sapin Date: Wed, 7 Jun 2023 17:25:48 +0200 Subject: [PATCH 0615/1072] Readuntilend --- src/Disks/IO/ReadBufferFromAzureBlobStorage.cpp | 16 ++++++++++++++++ src/Disks/IO/ReadBufferFromAzureBlobStorage.h | 1 + 2 files changed, 17 insertions(+) diff --git a/src/Disks/IO/ReadBufferFromAzureBlobStorage.cpp b/src/Disks/IO/ReadBufferFromAzureBlobStorage.cpp index a086eb0a6df..129bb97be09 100644 --- a/src/Disks/IO/ReadBufferFromAzureBlobStorage.cpp +++ b/src/Disks/IO/ReadBufferFromAzureBlobStorage.cpp @@ -56,6 +56,22 @@ ReadBufferFromAzureBlobStorage::ReadBufferFromAzureBlobStorage( } } + +void ReadBufferFromAzureBlobStorage::setReadUntilEnd() +{ + if (read_until_position) + { + read_until_position = 0; + if (initialized) + { + offset = getPosition(); + resetWorkingBuffer(); + initialized = false; + } + } + +} + void ReadBufferFromAzureBlobStorage::setReadUntilPosition(size_t position) { read_until_position = position; diff --git a/src/Disks/IO/ReadBufferFromAzureBlobStorage.h b/src/Disks/IO/ReadBufferFromAzureBlobStorage.h index 599ecba1dd1..4e21f543653 100644 --- a/src/Disks/IO/ReadBufferFromAzureBlobStorage.h +++ b/src/Disks/IO/ReadBufferFromAzureBlobStorage.h @@ -38,6 +38,7 @@ public: String getFileName() const override { return path; } void setReadUntilPosition(size_t position) override; + void setReadUntilEnd() override; bool supportsRightBoundedReads() const override { return true; } From cf65ac499c8f8cc8e49fa9654bc45723a99d3a6d Mon Sep 17 00:00:00 2001 From: Alexander Sapin Date: Wed, 7 Jun 2023 17:34:26 +0200 Subject: [PATCH 0616/1072] Fix iterator --- src/Disks/ObjectStorages/ObjectStorageIteratorAsync.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.cpp b/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.cpp index fd6452b7c2a..f91c19f2fb9 100644 --- a/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.cpp +++ b/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.cpp @@ -25,7 +25,6 @@ void IObjectStorageIteratorAsync::nextBatch() current_batch = std::move(next_batch.batch); accumulated_size.fetch_add(current_batch.size(), std::memory_order_relaxed); current_batch_iterator = current_batch.begin(); - LOG_DEBUG(&Poco::Logger::get("DEBUG"), "HAS NEXT {}", next_batch.has_next); if (next_batch.has_next) outcome_future = scheduleBatch(); else @@ -78,6 +77,7 @@ bool IObjectStorageIteratorAsync::isValid() if (!is_initialized) nextBatch(); + std::lock_guard lock(mutex); return current_batch_iterator != current_batch.end(); } @@ -86,16 +86,17 @@ RelativePathWithMetadata IObjectStorageIteratorAsync::current() if (!isValid()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Trying to access invalid iterator"); + std::lock_guard lock(mutex); return *current_batch_iterator; } RelativePathsWithMetadata IObjectStorageIteratorAsync::currentBatch() { - std::lock_guard lock(mutex); if (!isValid()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Trying to access invalid iterator"); + std::lock_guard lock(mutex); return current_batch; } From b567dc2a1dce59d3ced34463601e63326d56aa50 Mon Sep 17 00:00:00 2001 From: Han Fei Date: Wed, 7 Jun 2023 17:48:06 +0200 Subject: [PATCH 0617/1072] fix test --- .../0_stateless/01763_filter_push_down_bugs.reference | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/queries/0_stateless/01763_filter_push_down_bugs.reference b/tests/queries/0_stateless/01763_filter_push_down_bugs.reference index db9cd7a2d16..c8045dd26f5 100644 --- a/tests/queries/0_stateless/01763_filter_push_down_bugs.reference +++ b/tests/queries/0_stateless/01763_filter_push_down_bugs.reference @@ -9,11 +9,13 @@ String1_0 String2_0 String3_0 String4_0 1 Expression ((Projection + Before ORDER BY)) Filter (WHERE) Join (JOIN FillRightFirst) - Expression (Before JOIN) + Filter (( + Before JOIN)) ReadFromMergeTree (default.t1) Indexes: PrimaryKey - Condition: true + Keys: + id + Condition: (id in [101, 101]) Parts: 1/1 Granules: 1/1 Expression ((Joined actions + (Rename joined columns + (Projection + Before ORDER BY)))) @@ -30,7 +32,9 @@ Expression ((Project names + Projection)) ReadFromMergeTree (default.t1) Indexes: PrimaryKey - Condition: true + Keys: + id + Condition: (id in [101, 101]) Parts: 1/1 Granules: 1/1 Expression ((Change column names to column identifiers + (Project names + (Projection + Change column names to column identifiers)))) From b78e330129575d9b77f9da5cb6511eee56d5deaf Mon Sep 17 00:00:00 2001 From: Alexander Sapin Date: Wed, 7 Jun 2023 18:09:55 +0200 Subject: [PATCH 0618/1072] Better test --- .../integration/test_storage_azure_blob_storage/test.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/integration/test_storage_azure_blob_storage/test.py b/tests/integration/test_storage_azure_blob_storage/test.py index 8a0a68f5200..319500e6226 100644 --- a/tests/integration/test_storage_azure_blob_storage/test.py +++ b/tests/integration/test_storage_azure_blob_storage/test.py @@ -126,6 +126,7 @@ def test_simple_write_named_collection_1(cluster): ) print(get_azure_file_content("test_simple_write_named.csv")) assert get_azure_file_content("test_simple_write_named.csv") == '1,"a"\n' + azure_query(node, "TRUNCATE TABLE test_simple_write_named_collection_1") def test_simple_write_named_collection_2(cluster): @@ -487,6 +488,14 @@ def test_simple_write_named_collection_1_table_function(cluster): print(get_azure_file_content("test_simple_write_named.csv")) assert get_azure_file_content("test_simple_write_named.csv") == '1,"a"\n' + azure_query( + node, + "CREATE TABLE drop_table (key UInt64, data String) Engine = AzureBlobStorage(azure_conf1)", + ) + + azure_query( + node, "TRUNCATE TABLE drop_table", + ) def test_simple_write_named_collection_2_table_function(cluster): node = cluster.instances["node"] From 8d67296e3aee48f92ec82fd55637dfe16122c02d Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Wed, 7 Jun 2023 16:22:42 +0000 Subject: [PATCH 0619/1072] Automatic style fix --- tests/integration/test_storage_azure_blob_storage/test.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_storage_azure_blob_storage/test.py b/tests/integration/test_storage_azure_blob_storage/test.py index 319500e6226..2c052c005ca 100644 --- a/tests/integration/test_storage_azure_blob_storage/test.py +++ b/tests/integration/test_storage_azure_blob_storage/test.py @@ -494,9 +494,11 @@ def test_simple_write_named_collection_1_table_function(cluster): ) azure_query( - node, "TRUNCATE TABLE drop_table", + node, + "TRUNCATE TABLE drop_table", ) + def test_simple_write_named_collection_2_table_function(cluster): node = cluster.instances["node"] From c12c62a7fb7072772f423c907c467c39762ec5a7 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Wed, 7 Jun 2023 18:22:53 +0200 Subject: [PATCH 0620/1072] Update src/Parsers/ParserCreateIndexQuery.cpp Co-authored-by: Nikolay Degterinsky <43110995+evillique@users.noreply.github.com> --- src/Parsers/ParserCreateIndexQuery.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/Parsers/ParserCreateIndexQuery.cpp b/src/Parsers/ParserCreateIndexQuery.cpp index e878b347e62..7323c5da141 100644 --- a/src/Parsers/ParserCreateIndexQuery.cpp +++ b/src/Parsers/ParserCreateIndexQuery.cpp @@ -42,9 +42,6 @@ bool ParserCreateIndexDeclaration::parseImpl(Pos & pos, ASTPtr & node, Expected return false; } - if (!granularity_p.parse(pos, granularity, expected)) - return false; - auto index = std::make_shared(); index->part_of_create_index_query = true; index->set(index->expr, expr); From 2df0e0c66962ce91e47eb4c98bae4fabe1ce9cc1 Mon Sep 17 00:00:00 2001 From: serxa Date: Wed, 7 Jun 2023 16:25:52 +0000 Subject: [PATCH 0621/1072] Unify priorities for connections --- src/Client/ConnectionPool.cpp | 4 ++-- src/Client/ConnectionPool.h | 13 +++++++------ src/Client/ConnectionPoolWithFailover.cpp | 2 +- src/Client/ConnectionPoolWithFailover.h | 2 +- src/Common/GetPriorityForLoadBalancing.cpp | 15 ++++++++------- src/Common/GetPriorityForLoadBalancing.h | 2 +- src/Common/PoolWithFailoverBase.h | 9 +++++---- src/Common/ZooKeeper/ZooKeeper.cpp | 2 +- src/Common/ZooKeeper/ZooKeeper.h | 2 +- src/Databases/DatabaseReplicated.cpp | 2 +- src/Functions/hasColumnInTable.cpp | 2 +- .../tests/gtest_resource_manager_static.cpp | 4 ++-- src/Interpreters/Cluster.cpp | 2 +- src/Interpreters/Cluster.h | 5 +++-- src/Interpreters/ClusterDiscovery.cpp | 2 +- src/TableFunctions/TableFunctionRemote.cpp | 2 +- 16 files changed, 37 insertions(+), 33 deletions(-) diff --git a/src/Client/ConnectionPool.cpp b/src/Client/ConnectionPool.cpp index 8433b0833fa..5cabb1465d1 100644 --- a/src/Client/ConnectionPool.cpp +++ b/src/Client/ConnectionPool.cpp @@ -18,7 +18,7 @@ ConnectionPoolPtr ConnectionPoolFactory::get( String client_name, Protocol::Compression compression, Protocol::Secure secure, - Int64 priority) + Priority priority) { Key key{ max_connections, host, port, default_database, user, password, quota_key, cluster, cluster_secret, client_name, compression, secure, priority}; @@ -74,7 +74,7 @@ size_t ConnectionPoolFactory::KeyHash::operator()(const ConnectionPoolFactory::K hash_combine(seed, hash_value(k.client_name)); hash_combine(seed, hash_value(k.compression)); hash_combine(seed, hash_value(k.secure)); - hash_combine(seed, hash_value(k.priority)); + hash_combine(seed, hash_value(k.priority.value)); return seed; } diff --git a/src/Client/ConnectionPool.h b/src/Client/ConnectionPool.h index aacd0a063c7..b6d03daacfb 100644 --- a/src/Client/ConnectionPool.h +++ b/src/Client/ConnectionPool.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -34,7 +35,7 @@ public: const Settings * settings = nullptr, bool force_connected = true) = 0; - virtual Int64 getPriority() const { return 1; } + virtual Priority getPriority() const { return Priority{1}; } }; using ConnectionPoolPtr = std::shared_ptr; @@ -60,7 +61,7 @@ public: const String & client_name_, Protocol::Compression compression_, Protocol::Secure secure_, - Int64 priority_ = 1) + Priority priority_ = Priority{1}) : Base(max_connections_, &Poco::Logger::get("ConnectionPool (" + host_ + ":" + toString(port_) + ")")), host(host_), @@ -103,7 +104,7 @@ public: return host + ":" + toString(port); } - Int64 getPriority() const override + Priority getPriority() const override { return priority; } @@ -134,7 +135,7 @@ private: String client_name; Protocol::Compression compression; /// Whether to compress data when interacting with the server. Protocol::Secure secure; /// Whether to encrypt data when interacting with the server. - Int64 priority; /// priority from + Priority priority; /// priority from }; /** @@ -157,7 +158,7 @@ public: String client_name; Protocol::Compression compression; Protocol::Secure secure; - Int64 priority; + Priority priority; }; struct KeyHash @@ -180,7 +181,7 @@ public: String client_name, Protocol::Compression compression, Protocol::Secure secure, - Int64 priority); + Priority priority); private: mutable std::mutex mutex; using ConnectionPoolWeakPtr = std::weak_ptr; diff --git a/src/Client/ConnectionPoolWithFailover.cpp b/src/Client/ConnectionPoolWithFailover.cpp index 129bc10bc27..feb4c01c374 100644 --- a/src/Client/ConnectionPoolWithFailover.cpp +++ b/src/Client/ConnectionPoolWithFailover.cpp @@ -71,7 +71,7 @@ IConnectionPool::Entry ConnectionPoolWithFailover::get(const ConnectionTimeouts return Base::get(max_ignored_errors, fallback_to_stale_replicas, try_get_entry, get_priority); } -Int64 ConnectionPoolWithFailover::getPriority() const +Priority ConnectionPoolWithFailover::getPriority() const { return (*std::max_element(nested_pools.begin(), nested_pools.end(), [](const auto & a, const auto & b) { diff --git a/src/Client/ConnectionPoolWithFailover.h b/src/Client/ConnectionPoolWithFailover.h index 0273ce41589..75a0dafd977 100644 --- a/src/Client/ConnectionPoolWithFailover.h +++ b/src/Client/ConnectionPoolWithFailover.h @@ -48,7 +48,7 @@ public: const Settings * settings, bool force_connected) override; /// From IConnectionPool - Int64 getPriority() const override; /// From IConnectionPool + Priority getPriority() const override; /// From IConnectionPool /** Allocates up to the specified number of connections to work. * Connections provide access to different replicas of one shard. diff --git a/src/Common/GetPriorityForLoadBalancing.cpp b/src/Common/GetPriorityForLoadBalancing.cpp index 5da60fb1bae..c4d36acc70c 100644 --- a/src/Common/GetPriorityForLoadBalancing.cpp +++ b/src/Common/GetPriorityForLoadBalancing.cpp @@ -1,4 +1,5 @@ #include +#include namespace DB { @@ -8,23 +9,23 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } -std::function GetPriorityForLoadBalancing::getPriorityFunc(LoadBalancing load_balance, size_t offset, size_t pool_size) const +std::function GetPriorityForLoadBalancing::getPriorityFunc(LoadBalancing load_balance, size_t offset, size_t pool_size) const { - std::function get_priority; + std::function get_priority; switch (load_balance) { case LoadBalancing::NEAREST_HOSTNAME: if (hostname_differences.empty()) throw Exception(ErrorCodes::LOGICAL_ERROR, "It's a bug: hostname_differences is not initialized"); - get_priority = [this](size_t i) { return hostname_differences[i]; }; + get_priority = [this](size_t i) { return Priority{static_cast(hostname_differences[i])}; }; break; case LoadBalancing::IN_ORDER: - get_priority = [](size_t i) { return i; }; + get_priority = [](size_t i) { return Priority{static_cast(i)}; }; break; case LoadBalancing::RANDOM: break; case LoadBalancing::FIRST_OR_RANDOM: - get_priority = [offset](size_t i) -> size_t { return i != offset; }; + get_priority = [offset](size_t i) { return i != offset ? Priority{1} : Priority{0}; }; break; case LoadBalancing::ROUND_ROBIN: if (last_used >= pool_size) @@ -38,8 +39,8 @@ std::function GetPriorityForLoadBalancing::getPriorityFunc * */ get_priority = [this, pool_size](size_t i) { - ++i; - return i < last_used ? pool_size - i : i - last_used; + ++i; // To make `i` indexing start with 1 instead of 0 as `last_used` does + return Priority{static_cast(i < last_used ? pool_size - i : i - last_used)}; }; break; } diff --git a/src/Common/GetPriorityForLoadBalancing.h b/src/Common/GetPriorityForLoadBalancing.h index e57b02b5e90..8052185ac13 100644 --- a/src/Common/GetPriorityForLoadBalancing.h +++ b/src/Common/GetPriorityForLoadBalancing.h @@ -21,7 +21,7 @@ public: return !(*this == other); } - std::function getPriorityFunc(LoadBalancing load_balance, size_t offset, size_t pool_size) const; + std::function getPriorityFunc(LoadBalancing load_balance, size_t offset, size_t pool_size) const; std::vector hostname_differences; /// Distances from name of this host to the names of hosts of pools. diff --git a/src/Common/PoolWithFailoverBase.h b/src/Common/PoolWithFailoverBase.h index 646e10d6443..c6f44a7701a 100644 --- a/src/Common/PoolWithFailoverBase.h +++ b/src/Common/PoolWithFailoverBase.h @@ -13,6 +13,7 @@ #include #include #include +#include namespace DB @@ -34,7 +35,7 @@ namespace ProfileEvents /// This class provides a pool with fault tolerance. It is used for pooling of connections to replicated DB. /// Initialized by several PoolBase objects. /// When a connection is requested, tries to create or choose an alive connection from one of the nested pools. -/// Pools are tried in the order consistent with lexicographical order of (error count, priority, random number) tuples. +/// Pools are tried in the order consistent with lexicographical order of (error count, slowdown count, config priority, priority, random number) tuples. /// Number of tries for a single pool is limited by max_tries parameter. /// The client can set nested pool priority by passing a GetPriority functor. /// @@ -113,7 +114,7 @@ public: /// The client can provide this functor to affect load balancing - the index of a pool is passed to /// this functor. The pools with lower result value will be tried first. - using GetPriorityFunc = std::function; + using GetPriorityFunc = std::function; /// Returns at least min_entries and at most max_entries connections (at most one connection per nested pool). /// The method will throw if it is unable to get min_entries alive connections or @@ -336,9 +337,9 @@ struct PoolWithFailoverBase::PoolState /// The number of slowdowns that led to changing replica in HedgedRequestsFactory UInt64 slowdown_count = 0; /// Priority from the configuration. - Int64 config_priority = 1; + Priority config_priority{1}; /// Priority from the GetPriorityFunc. - Int64 priority = 0; + Priority priority{0}; UInt64 random = 0; void randomize() diff --git a/src/Common/ZooKeeper/ZooKeeper.cpp b/src/Common/ZooKeeper/ZooKeeper.cpp index a587ad6caf4..62807fe2433 100644 --- a/src/Common/ZooKeeper/ZooKeeper.cpp +++ b/src/Common/ZooKeeper/ZooKeeper.cpp @@ -179,7 +179,7 @@ ZooKeeper::ZooKeeper(const Poco::Util::AbstractConfiguration & config, const std std::vector ZooKeeper::shuffleHosts() const { - std::function get_priority = args.get_priority_load_balancing.getPriorityFunc(args.get_priority_load_balancing.load_balancing, 0, args.hosts.size()); + std::function get_priority = args.get_priority_load_balancing.getPriorityFunc(args.get_priority_load_balancing.load_balancing, 0, args.hosts.size()); std::vector shuffle_hosts; for (size_t i = 0; i < args.hosts.size(); ++i) { diff --git a/src/Common/ZooKeeper/ZooKeeper.h b/src/Common/ZooKeeper/ZooKeeper.h index 96f9914b597..d48ca0a4ef5 100644 --- a/src/Common/ZooKeeper/ZooKeeper.h +++ b/src/Common/ZooKeeper/ZooKeeper.h @@ -49,7 +49,7 @@ constexpr size_t MULTI_BATCH_SIZE = 100; struct ShuffleHost { String host; - Int64 priority = 0; + Priority priority; UInt64 random = 0; void randomize() diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 9bbf5b9565d..583607bda1d 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -252,7 +252,7 @@ ClusterPtr DatabaseReplicated::getClusterImpl() const treat_local_as_remote, treat_local_port_as_remote, cluster_auth_info.cluster_secure_connection, - /*priority=*/ 1, + Priority{1}, TSA_SUPPRESS_WARNING_FOR_READ(database_name), /// FIXME cluster_auth_info.cluster_secret}; diff --git a/src/Functions/hasColumnInTable.cpp b/src/Functions/hasColumnInTable.cpp index 4676b4083b7..66ed515e490 100644 --- a/src/Functions/hasColumnInTable.cpp +++ b/src/Functions/hasColumnInTable.cpp @@ -137,7 +137,7 @@ ColumnPtr FunctionHasColumnInTable::executeImpl(const ColumnsWithTypeAndName & a treat_local_as_remote, treat_local_port_as_remote, /* secure= */ false, - /* priority= */ 1, + /* priority= */ Priority{1}, /* cluster_name= */ "", /* password= */ "" }; diff --git a/src/IO/Resource/tests/gtest_resource_manager_static.cpp b/src/IO/Resource/tests/gtest_resource_manager_static.cpp index 091f6923714..976eac41a49 100644 --- a/src/IO/Resource/tests/gtest_resource_manager_static.cpp +++ b/src/IO/Resource/tests/gtest_resource_manager_static.cpp @@ -44,8 +44,8 @@ TEST(IOResourceStaticResourceManager, Smoke) TEST(IOResourceStaticResourceManager, Prioritization) { - std::optional last_priority; - auto check = [&] (Int64 priority) + std::optional last_priority; + auto check = [&] (Priority priority) { // Lock is not required here because this is called during request execution and we have max_requests = 1 if (last_priority) diff --git a/src/Interpreters/Cluster.cpp b/src/Interpreters/Cluster.cpp index 8c30dbe230f..edbef77ef02 100644 --- a/src/Interpreters/Cluster.cpp +++ b/src/Interpreters/Cluster.cpp @@ -108,7 +108,7 @@ Cluster::Address::Address( password = config.getString(config_prefix + ".password", ""); default_database = config.getString(config_prefix + ".default_database", ""); secure = ConfigHelper::getBool(config, config_prefix + ".secure", false, /* empty_as */true) ? Protocol::Secure::Enable : Protocol::Secure::Disable; - priority = config.getInt(config_prefix + ".priority", 1); + priority = Priority{config.getInt(config_prefix + ".priority", 1)}; const char * port_type = secure == Protocol::Secure::Enable ? "tcp_port_secure" : "tcp_port"; auto default_port = config.getInt(port_type, 0); diff --git a/src/Interpreters/Cluster.h b/src/Interpreters/Cluster.h index 4798384f29c..de10a445d01 100644 --- a/src/Interpreters/Cluster.h +++ b/src/Interpreters/Cluster.h @@ -4,6 +4,7 @@ #include #include #include +#include #include @@ -44,7 +45,7 @@ struct ClusterConnectionParameters bool treat_local_as_remote; bool treat_local_port_as_remote; bool secure = false; - Int64 priority = 1; + Priority priority{1}; String cluster_name; String cluster_secret; }; @@ -131,7 +132,7 @@ public: Protocol::Compression compression = Protocol::Compression::Enable; Protocol::Secure secure = Protocol::Secure::Disable; - Int64 priority = 1; + Priority priority{1}; Address() = default; diff --git a/src/Interpreters/ClusterDiscovery.cpp b/src/Interpreters/ClusterDiscovery.cpp index 884e3b87343..553488edf50 100644 --- a/src/Interpreters/ClusterDiscovery.cpp +++ b/src/Interpreters/ClusterDiscovery.cpp @@ -246,7 +246,7 @@ ClusterPtr ClusterDiscovery::makeCluster(const ClusterInfo & cluster_info) /* treat_local_as_remote= */ false, /* treat_local_port_as_remote= */ false, /// should be set only for clickhouse-local, but cluster discovery is not used there /* secure= */ secure, - /* priority= */ 1, + /* priority= */ Priority{1}, /* cluster_name= */ "", /* password= */ ""}; auto cluster = std::make_shared( diff --git a/src/TableFunctions/TableFunctionRemote.cpp b/src/TableFunctions/TableFunctionRemote.cpp index b2f09adf773..4143014a7b3 100644 --- a/src/TableFunctions/TableFunctionRemote.cpp +++ b/src/TableFunctions/TableFunctionRemote.cpp @@ -262,7 +262,7 @@ void TableFunctionRemote::parseArguments(const ASTPtr & ast_function, ContextPtr treat_local_as_remote, treat_local_port_as_remote, secure, - /* priority= */ 1, + /* priority= */ Priority{1}, /* cluster_name= */ "", /* password= */ "" }; From 32372967e9814e629cbad2ce2ff57f82aba86e97 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Wed, 7 Jun 2023 16:55:14 +0200 Subject: [PATCH 0622/1072] fix --- src/Storages/StorageReplicatedMergeTree.cpp | 10 +++------- src/Storages/StorageReplicatedMergeTree.h | 2 +- .../0_stateless/02432_s3_parallel_parts_cleanup.sql | 6 +----- .../0_stateless/02448_clone_replica_lost_part.sql | 9 ++++++--- 4 files changed, 11 insertions(+), 16 deletions(-) diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 6edd7531ec1..36bc3476e91 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -6743,14 +6743,12 @@ size_t StorageReplicatedMergeTree::clearOldPartsAndRemoveFromZK() if (parts.empty()) return total_parts_to_remove; - size_t res = 0; - NOEXCEPT_SCOPE({ res = clearOldPartsAndRemoveFromZKImpl(zookeeper, std::move(parts)); }); - return res; + NOEXCEPT_SCOPE({ clearOldPartsAndRemoveFromZKImpl(zookeeper, std::move(parts)); }); + return total_parts_to_remove; } -size_t StorageReplicatedMergeTree::clearOldPartsAndRemoveFromZKImpl(zkutil::ZooKeeperPtr zookeeper, DataPartsVector && parts) +void StorageReplicatedMergeTree::clearOldPartsAndRemoveFromZKImpl(zkutil::ZooKeeperPtr zookeeper, DataPartsVector && parts) { - DataPartsVector parts_to_delete_only_from_filesystem; // Only duplicates DataPartsVector parts_to_delete_completely; // All parts except duplicates DataPartsVector parts_to_retry_deletion; // Parts that should be retried due to network problems @@ -6861,8 +6859,6 @@ size_t StorageReplicatedMergeTree::clearOldPartsAndRemoveFromZKImpl(zkutil::ZooK /// Otherwise nobody will try to remove them again (see grabOldParts). delete_parts_from_fs_and_rollback_in_case_of_error(parts_to_remove_from_filesystem, "old"); } - - return total_parts_to_remove; } diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h index 368d7d1b948..290266ca00c 100644 --- a/src/Storages/StorageReplicatedMergeTree.h +++ b/src/Storages/StorageReplicatedMergeTree.h @@ -344,7 +344,7 @@ private: /// Delete old parts from disk and from ZooKeeper. Returns the number of removed parts size_t clearOldPartsAndRemoveFromZK(); - size_t clearOldPartsAndRemoveFromZKImpl(zkutil::ZooKeeperPtr zookeeper, DataPartsVector && parts); + void clearOldPartsAndRemoveFromZKImpl(zkutil::ZooKeeperPtr zookeeper, DataPartsVector && parts); template friend class ReplicatedMergeTreeSinkImpl; diff --git a/tests/queries/0_stateless/02432_s3_parallel_parts_cleanup.sql b/tests/queries/0_stateless/02432_s3_parallel_parts_cleanup.sql index 3f8aa545298..948ec9e9e8a 100644 --- a/tests/queries/0_stateless/02432_s3_parallel_parts_cleanup.sql +++ b/tests/queries/0_stateless/02432_s3_parallel_parts_cleanup.sql @@ -38,11 +38,7 @@ select count(), sum(n), sum(m) from rmt; -- New table can assign merges/mutations and can remove old parts create table rmt2 (n int, m int, k String) engine=ReplicatedMergeTree('/test/02432/{database}', '2') order by tuple() settings storage_policy = 's3_cache', allow_remote_fs_zero_copy_replication=1, -<<<<<<< HEAD - max_part_removal_threads=10, concurrent_part_removal_threshold=1, cleanup_delay_period=1, cleanup_delay_period_random_add=1, cleanup_thread_preferred_points_per_iteration=0, -======= - concurrent_part_removal_threshold=1, cleanup_delay_period=1, cleanup_delay_period_random_add=1, ->>>>>>> master + concurrent_part_removal_threshold=1, cleanup_delay_period=1, cleanup_delay_period_random_add=1, cleanup_thread_preferred_points_per_iteration=0, min_bytes_for_wide_part=0, min_rows_for_wide_part=0, max_replicated_merges_in_queue=1, old_parts_lifetime=0; diff --git a/tests/queries/0_stateless/02448_clone_replica_lost_part.sql b/tests/queries/0_stateless/02448_clone_replica_lost_part.sql index 1e99e1869cc..eb4d0f255a7 100644 --- a/tests/queries/0_stateless/02448_clone_replica_lost_part.sql +++ b/tests/queries/0_stateless/02448_clone_replica_lost_part.sql @@ -7,11 +7,11 @@ drop table if exists rmt2; create table rmt1 (n int) engine=ReplicatedMergeTree('/test/02448/{database}/rmt', '1') order by tuple() settings min_replicated_logs_to_keep=1, max_replicated_logs_to_keep=2, cleanup_delay_period=0, cleanup_delay_period_random_add=1, cleanup_thread_preferred_points_per_iteration=0, old_parts_lifetime=0, max_parts_to_merge_at_once=4, - merge_selecting_sleep_ms=100, max_merge_selecting_sleep_ms=500; + merge_selecting_sleep_ms=1000, max_merge_selecting_sleep_ms=2000; create table rmt2 (n int) engine=ReplicatedMergeTree('/test/02448/{database}/rmt', '2') order by tuple() settings min_replicated_logs_to_keep=1, max_replicated_logs_to_keep=2, cleanup_delay_period=0, cleanup_delay_period_random_add=1, cleanup_thread_preferred_points_per_iteration=0, old_parts_lifetime=0, max_parts_to_merge_at_once=4, - merge_selecting_sleep_ms=100, max_merge_selecting_sleep_ms=500; + merge_selecting_sleep_ms=1000, max_merge_selecting_sleep_ms=2000; -- insert part only on one replica system stop replicated sends rmt1; @@ -141,7 +141,10 @@ system sync replica rmt2; -- merge through gap optimize table rmt2; -- give it a chance to cleanup log -select sleep(2) format Null; -- increases probability of reproducing the issue + +select sleepEachRow(2) from url('http://localhost:8123/?param_tries={1..10}&query=' || encodeURLComponent( + 'select value from system.zookeeper where path=''//test/02448/' || currentDatabase() || '/rmt/replicas/1/is_lost'' and value=''1''' + ), 'LineAsString', 's String') settings max_threads=1 format Null; -- rmt1 will mimic rmt2, but will not be able to fetch parts for a while system stop replicated sends rmt2; From 3cdbccd9ea2eefa230d70fb23506fcbe5c01148b Mon Sep 17 00:00:00 2001 From: serxa Date: Wed, 7 Jun 2023 16:50:04 +0000 Subject: [PATCH 0623/1072] Unify priorities: `IExecutableTask`s --- src/Storages/MergeTree/IExecutableTask.h | 5 +++-- src/Storages/MergeTree/MergeFromLogEntryTask.cpp | 2 +- src/Storages/MergeTree/MergeFromLogEntryTask.h | 4 ++-- src/Storages/MergeTree/MergePlainMergeTreeTask.h | 6 +++--- src/Storages/MergeTree/MergeTreeBackgroundExecutor.h | 2 +- src/Storages/MergeTree/MutateFromLogEntryTask.cpp | 2 +- src/Storages/MergeTree/MutateFromLogEntryTask.h | 4 ++-- src/Storages/MergeTree/MutatePlainMergeTreeTask.h | 6 +++--- src/Storages/MergeTree/MutateTask.cpp | 7 ++++--- src/Storages/MergeTree/tests/gtest_executor.cpp | 10 +++++----- 10 files changed, 25 insertions(+), 23 deletions(-) diff --git a/src/Storages/MergeTree/IExecutableTask.h b/src/Storages/MergeTree/IExecutableTask.h index 9617960c182..d0c2d4a840e 100644 --- a/src/Storages/MergeTree/IExecutableTask.h +++ b/src/Storages/MergeTree/IExecutableTask.h @@ -5,6 +5,7 @@ #include #include +#include namespace DB { @@ -32,7 +33,7 @@ public: virtual bool executeStep() = 0; virtual void onCompleted() = 0; virtual StorageID getStorageID() = 0; - virtual UInt64 getPriority() = 0; + virtual Priority getPriority() = 0; virtual ~IExecutableTask() = default; }; @@ -63,7 +64,7 @@ public: void onCompleted() override { job_result_callback(!res); } StorageID getStorageID() override { return id; } - UInt64 getPriority() override + Priority getPriority() override { throw Exception(ErrorCodes::LOGICAL_ERROR, "getPriority() method is not supported by LambdaAdapter"); } diff --git a/src/Storages/MergeTree/MergeFromLogEntryTask.cpp b/src/Storages/MergeTree/MergeFromLogEntryTask.cpp index a5b4a29cb18..5cee402f88c 100644 --- a/src/Storages/MergeTree/MergeFromLogEntryTask.cpp +++ b/src/Storages/MergeTree/MergeFromLogEntryTask.cpp @@ -291,7 +291,7 @@ ReplicatedMergeMutateTaskBase::PrepareResult MergeFromLogEntryTask::prepare() /// Adjust priority for (auto & item : future_merged_part->parts) - priority += item->getBytesOnDisk(); + priority.value += item->getBytesOnDisk(); return {true, true, [this, stopwatch = *stopwatch_ptr] (const ExecutionStatus & execution_status) { diff --git a/src/Storages/MergeTree/MergeFromLogEntryTask.h b/src/Storages/MergeTree/MergeFromLogEntryTask.h index 9d870cd9a4b..2c559c06d7e 100644 --- a/src/Storages/MergeTree/MergeFromLogEntryTask.h +++ b/src/Storages/MergeTree/MergeFromLogEntryTask.h @@ -22,7 +22,7 @@ public: StorageReplicatedMergeTree & storage_, IExecutableTask::TaskResultCallback & task_result_callback_); - UInt64 getPriority() override { return priority; } + Priority getPriority() override { return priority; } protected: /// Both return false if we can't execute merge. @@ -44,7 +44,7 @@ private: StopwatchUniquePtr stopwatch_ptr{nullptr}; MergeTreeData::MutableDataPartPtr part; - UInt64 priority{0}; + Priority priority; MergeTaskPtr merge_task; }; diff --git a/src/Storages/MergeTree/MergePlainMergeTreeTask.h b/src/Storages/MergeTree/MergePlainMergeTreeTask.h index 369b4390da7..95df8c90c9b 100644 --- a/src/Storages/MergeTree/MergePlainMergeTreeTask.h +++ b/src/Storages/MergeTree/MergePlainMergeTreeTask.h @@ -34,13 +34,13 @@ public: , task_result_callback(task_result_callback_) { for (auto & item : merge_mutate_entry->future_part->parts) - priority += item->getBytesOnDisk(); + priority.value += item->getBytesOnDisk(); } bool executeStep() override; void onCompleted() override; StorageID getStorageID() override; - UInt64 getPriority() override { return priority; } + Priority getPriority() override { return priority; } void setCurrentTransaction(MergeTreeTransactionHolder && txn_holder_, MergeTreeTransactionPtr && txn_) { @@ -77,7 +77,7 @@ private: using MergeListEntryPtr = std::unique_ptr; MergeListEntryPtr merge_list_entry; - UInt64 priority{0}; + Priority priority; std::function write_part_log; std::function transfer_profile_counters_to_initial_query; diff --git a/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h index 8142e383d0c..552ed32e2d2 100644 --- a/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h +++ b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h @@ -63,7 +63,7 @@ struct TaskRuntimeData /// This scenario in not possible in reality. Poco::Event is_done{/*autoreset=*/false}; /// This is equal to task->getPriority() not to do useless virtual calls in comparator - UInt64 priority{0}; + Priority priority; /// By default priority queue will have max element at top static bool comparePtrByPriority(const TaskRuntimeDataPtr & lhs, const TaskRuntimeDataPtr & rhs) diff --git a/src/Storages/MergeTree/MutateFromLogEntryTask.cpp b/src/Storages/MergeTree/MutateFromLogEntryTask.cpp index d0b85ee65b8..42dccef7e6f 100644 --- a/src/Storages/MergeTree/MutateFromLogEntryTask.cpp +++ b/src/Storages/MergeTree/MutateFromLogEntryTask.cpp @@ -184,7 +184,7 @@ ReplicatedMergeMutateTaskBase::PrepareResult MutateFromLogEntryTask::prepare() /// Adjust priority for (auto & item : future_mutated_part->parts) - priority += item->getBytesOnDisk(); + priority.value += item->getBytesOnDisk(); return {true, true, [this] (const ExecutionStatus & execution_status) { diff --git a/src/Storages/MergeTree/MutateFromLogEntryTask.h b/src/Storages/MergeTree/MutateFromLogEntryTask.h index 2a2cc308f85..42b4debcbf2 100644 --- a/src/Storages/MergeTree/MutateFromLogEntryTask.h +++ b/src/Storages/MergeTree/MutateFromLogEntryTask.h @@ -27,7 +27,7 @@ public: {} - UInt64 getPriority() override { return priority; } + Priority getPriority() override { return priority; } private: @@ -40,7 +40,7 @@ private: return mutate_task->execute(); } - UInt64 priority{0}; + Priority priority; TableLockHolder table_lock_holder{nullptr}; ReservationSharedPtr reserved_space{nullptr}; diff --git a/src/Storages/MergeTree/MutatePlainMergeTreeTask.h b/src/Storages/MergeTree/MutatePlainMergeTreeTask.h index 823ea6d7a0f..bd03c276256 100644 --- a/src/Storages/MergeTree/MutatePlainMergeTreeTask.h +++ b/src/Storages/MergeTree/MutatePlainMergeTreeTask.h @@ -36,13 +36,13 @@ public: , task_result_callback(task_result_callback_) { for (auto & part : merge_mutate_entry->future_part->parts) - priority += part->getBytesOnDisk(); + priority.value += part->getBytesOnDisk(); } bool executeStep() override; void onCompleted() override; StorageID getStorageID() override; - UInt64 getPriority() override { return priority; } + Priority getPriority() override { return priority; } private: @@ -66,7 +66,7 @@ private: std::unique_ptr stopwatch; MergeTreeData::MutableDataPartPtr new_part; - UInt64 priority{0}; + Priority priority; using MergeListEntryPtr = std::unique_ptr; MergeListEntryPtr merge_list_entry; diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index 76096d00641..31b3c249177 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -1,5 +1,6 @@ #include +#include "Common/Priority.h" #include #include #include @@ -961,7 +962,7 @@ public: void onCompleted() override { throw Exception(ErrorCodes::LOGICAL_ERROR, "Not implemented"); } StorageID getStorageID() override { throw Exception(ErrorCodes::LOGICAL_ERROR, "Not implemented"); } - UInt64 getPriority() override { throw Exception(ErrorCodes::LOGICAL_ERROR, "Not implemented"); } + Priority getPriority() override { throw Exception(ErrorCodes::LOGICAL_ERROR, "Not implemented"); } bool executeStep() override { @@ -1283,7 +1284,7 @@ public: void onCompleted() override { throw Exception(ErrorCodes::LOGICAL_ERROR, "Not implemented"); } StorageID getStorageID() override { throw Exception(ErrorCodes::LOGICAL_ERROR, "Not implemented"); } - UInt64 getPriority() override { throw Exception(ErrorCodes::LOGICAL_ERROR, "Not implemented"); } + Priority getPriority() override { throw Exception(ErrorCodes::LOGICAL_ERROR, "Not implemented"); } bool executeStep() override { @@ -1412,7 +1413,7 @@ public: void onCompleted() override { throw Exception(ErrorCodes::LOGICAL_ERROR, "Not implemented"); } StorageID getStorageID() override { throw Exception(ErrorCodes::LOGICAL_ERROR, "Not implemented"); } - UInt64 getPriority() override { throw Exception(ErrorCodes::LOGICAL_ERROR, "Not implemented"); } + Priority getPriority() override { throw Exception(ErrorCodes::LOGICAL_ERROR, "Not implemented"); } bool executeStep() override { diff --git a/src/Storages/MergeTree/tests/gtest_executor.cpp b/src/Storages/MergeTree/tests/gtest_executor.cpp index 3a4f147b456..5815b74284a 100644 --- a/src/Storages/MergeTree/tests/gtest_executor.cpp +++ b/src/Storages/MergeTree/tests/gtest_executor.cpp @@ -51,7 +51,7 @@ public: throw std::runtime_error("Unlucky..."); } - UInt64 getPriority() override { return 0; } + Priority getPriority() override { return {}; } private: std::mt19937 generator; @@ -65,11 +65,11 @@ using StepFunc = std::function; class LambdaExecutableTask : public IExecutableTask { public: - explicit LambdaExecutableTask(const String & name_, size_t step_count_, StepFunc step_func_ = {}, UInt64 priority_ = 0) + explicit LambdaExecutableTask(const String & name_, size_t step_count_, StepFunc step_func_ = {}, Int64 priority_value = 0) : name(name_) , step_count(step_count_) , step_func(step_func_) - , priority(priority_) + , priority{priority_value} {} bool executeStep() override @@ -86,13 +86,13 @@ public: void onCompleted() override {} - UInt64 getPriority() override { return priority; } + Priority getPriority() override { return priority; } private: String name; size_t step_count; StepFunc step_func; - UInt64 priority; + Priority priority; }; From 6f6d806f927ba1e2c539a92e78906334bee20ce4 Mon Sep 17 00:00:00 2001 From: vdimir Date: Tue, 6 Jun 2023 15:17:06 +0000 Subject: [PATCH 0624/1072] Upd test test_alter_moving_garbage --- .../configs/config.d/storage_conf.xml | 2 -- .../test_alter_moving_garbage/test.py | 31 +++++++++++++------ 2 files changed, 21 insertions(+), 12 deletions(-) diff --git a/tests/integration/test_alter_moving_garbage/configs/config.d/storage_conf.xml b/tests/integration/test_alter_moving_garbage/configs/config.d/storage_conf.xml index 659f59a41b2..f6898ed1d7e 100644 --- a/tests/integration/test_alter_moving_garbage/configs/config.d/storage_conf.xml +++ b/tests/integration/test_alter_moving_garbage/configs/config.d/storage_conf.xml @@ -1,6 +1,4 @@ - - diff --git a/tests/integration/test_alter_moving_garbage/test.py b/tests/integration/test_alter_moving_garbage/test.py index b369c9ad377..4a42b73b8d4 100644 --- a/tests/integration/test_alter_moving_garbage/test.py +++ b/tests/integration/test_alter_moving_garbage/test.py @@ -28,7 +28,7 @@ def cluster(): cluster.shutdown() -def create_table(node, table_name, **additional_settings): +def create_table(node, table_name, additional_settings): settings = { "storage_policy": "two_disks", "old_parts_lifetime": 1, @@ -52,21 +52,32 @@ def create_table(node, table_name, **additional_settings): node.query(create_table_statement) -def test_create_table(cluster): +@pytest.mark.parametrize("allow_remote_fs_zero_copy_replication", [False, True]) +def test_create_table(cluster, allow_remote_fs_zero_copy_replication): node = cluster.instances["node1"] - create_table(node, "test_table") + + additional_settings = {} + table_name = "test_table" + + if allow_remote_fs_zero_copy_replication: + # different names for logs readability + table_name = "test_table_zero_copy" + additional_settings["allow_remote_fs_zero_copy_replication"] = 1 + + create_table(node, table_name, additional_settings) + node.query( - "INSERT INTO test_table SELECT toDate('2021-01-01') + INTERVAL number % 10 DAY, number, toString(sipHash64(number)) FROM numbers(100_000)" + f"INSERT INTO {table_name} SELECT toDate('2021-01-01') + INTERVAL number % 10 DAY, number, toString(sipHash64(number)) FROM numbers(100_000)" ) stop_alter = False def alter(): d = 0 - node.query(f"ALTER TABLE test_table ADD COLUMN col0 String") + node.query(f"ALTER TABLE {table_name} ADD COLUMN col0 String") while not stop_alter: d = d + 1 - node.query(f"DELETE FROM test_table WHERE id < {d}") + node.query(f"DELETE FROM {table_name} WHERE id < {d}") time.sleep(0.1) alter_thread = threading.Thread(target=alter) @@ -76,12 +87,12 @@ def test_create_table(cluster): partition = f"2021-01-{i:02d}" try: node.query( - f"ALTER TABLE test_table MOVE PARTITION '{partition}' TO DISK 's3'", + f"ALTER TABLE {table_name} MOVE PARTITION '{partition}' TO DISK 's3'", ) except QueryRuntimeException as e: - # PART_IS_TEMPORARILY_LOCKED - assert 384 == e.returncode - continue + if "PART_IS_TEMPORARILY_LOCKED" in str(e): + continue + raise e # clear old temporary directories wakes up every 1 second time.sleep(0.5) From 989540e5b1ae4c10605e7609a7906e789ad755a4 Mon Sep 17 00:00:00 2001 From: vdimir Date: Wed, 7 Jun 2023 17:37:32 +0000 Subject: [PATCH 0625/1072] Upd test_alter_moving_garbage: use replicated engine --- .../configs/config.d/remote_servers.xml | 16 +++ .../test_alter_moving_garbage/test.py | 99 ++++++++++++++----- 2 files changed, 88 insertions(+), 27 deletions(-) create mode 100644 tests/integration/test_alter_moving_garbage/configs/config.d/remote_servers.xml diff --git a/tests/integration/test_alter_moving_garbage/configs/config.d/remote_servers.xml b/tests/integration/test_alter_moving_garbage/configs/config.d/remote_servers.xml new file mode 100644 index 00000000000..45713eaed59 --- /dev/null +++ b/tests/integration/test_alter_moving_garbage/configs/config.d/remote_servers.xml @@ -0,0 +1,16 @@ + + + + + + node1 + 9000 + + + node2 + 9000 + + + + + diff --git a/tests/integration/test_alter_moving_garbage/test.py b/tests/integration/test_alter_moving_garbage/test.py index 4a42b73b8d4..dc3f6c35ead 100644 --- a/tests/integration/test_alter_moving_garbage/test.py +++ b/tests/integration/test_alter_moving_garbage/test.py @@ -3,22 +3,29 @@ import time import pytest import threading +import random from helpers.client import QueryRuntimeException from helpers.cluster import ClickHouseCluster +# two replicas in remote_servers.xml +REPLICA_COUNT = 2 @pytest.fixture(scope="module") def cluster(): try: cluster = ClickHouseCluster(__file__) - cluster.add_instance( - "node1", - main_configs=[ - "configs/config.d/storage_conf.xml", - ], - with_minio=True, - ) + for i in range(1, REPLICA_COUNT + 1): + cluster.add_instance( + f"node{i}", + main_configs=[ + "configs/config.d/storage_conf.xml", + "configs/config.d/remote_servers.xml", + ], + with_minio=True, + with_zookeeper=True, + ) + logging.info("Starting cluster...") cluster.start() logging.info("Cluster started") @@ -28,7 +35,7 @@ def cluster(): cluster.shutdown() -def create_table(node, table_name, additional_settings): +def create_table(node, table_name, replicated, additional_settings): settings = { "storage_policy": "two_disks", "old_parts_lifetime": 1, @@ -38,55 +45,91 @@ def create_table(node, table_name, additional_settings): } settings.update(additional_settings) + table_engine = ( + f"ReplicatedMergeTree('/clickhouse/tables/0/{table_name}', '{node.name}')" + if replicated + else "MergeTree()" + ) + create_table_statement = f""" CREATE TABLE {table_name} ( dt Date, id Int64, data String, INDEX min_max (id) TYPE minmax GRANULARITY 3 - ) ENGINE=MergeTree() + ) ENGINE = {table_engine} PARTITION BY dt ORDER BY (dt, id) SETTINGS {",".join((k+"="+repr(v) for k, v in settings.items()))}""" - node.query(create_table_statement) + if replicated: + node.query_with_retry(create_table_statement) + else: + node.query(create_table_statement) -@pytest.mark.parametrize("allow_remote_fs_zero_copy_replication", [False, True]) -def test_create_table(cluster, allow_remote_fs_zero_copy_replication): - node = cluster.instances["node1"] +@pytest.mark.parametrize( + "allow_remote_fs_zero_copy_replication,replicated_engine", + [(False, False), (False, True), (True, True)], +) +def test_create_table( + cluster, allow_remote_fs_zero_copy_replication, replicated_engine +): + if replicated_engine: + nodes = list(cluster.instances.values()) + else: + nodes = [cluster.instances["node1"]] additional_settings = {} - table_name = "test_table" + # different names for logs readability + table_name = "test_table" if allow_remote_fs_zero_copy_replication: - # different names for logs readability table_name = "test_table_zero_copy" additional_settings["allow_remote_fs_zero_copy_replication"] = 1 + if replicated_engine: + table_name = table_name + "_replicated" - create_table(node, table_name, additional_settings) + for node in nodes: + create_table(node, table_name, replicated_engine, additional_settings) - node.query( - f"INSERT INTO {table_name} SELECT toDate('2021-01-01') + INTERVAL number % 10 DAY, number, toString(sipHash64(number)) FROM numbers(100_000)" - ) + for i in range(1, 11): + partition = f"2021-01-{i:02d}" + random.choice(nodes).query( + f"INSERT INTO {table_name} SELECT toDate('{partition}'), number as id, toString(sipHash64(number, {i})) FROM numbers(10_000)" + ) + + def check_count(): + if replicated_engine: + return random.choice(nodes).query_with_retry( + f"SELECT countDistinct(dt, data) FROM clusterAllReplicas(test_cluster, default.{table_name}) WHERE id % 100 = 0" + ) + else: + return random.choice(nodes).query( + f"SELECT countDistinct(dt, data) FROM {table_name} WHERE id % 100 = 0" + ) + + assert check_count() == "1000\n" stop_alter = False def alter(): - d = 0 - node.query(f"ALTER TABLE {table_name} ADD COLUMN col0 String") - while not stop_alter: - d = d + 1 - node.query(f"DELETE FROM {table_name} WHERE id < {d}") + random.choice(nodes).query(f"ALTER TABLE {table_name} ADD COLUMN col0 String") + for d in range(1, 100): + if stop_alter: + break + # I managed to reproduce issue with DELETE, but it can be any other lightweight mutation + # Do not delete rows with id % 100 = 0, because they are used in check_count to check that data is not corrupted + random.choice(nodes).query(f"DELETE FROM {table_name} WHERE id % 100 = {d}") time.sleep(0.1) alter_thread = threading.Thread(target=alter) alter_thread.start() - for i in range(1, 10): + for i in range(1, 11): partition = f"2021-01-{i:02d}" try: - node.query( + random.choice(nodes).query( f"ALTER TABLE {table_name} MOVE PARTITION '{partition}' TO DISK 's3'", ) except QueryRuntimeException as e: @@ -94,8 +137,10 @@ def test_create_table(cluster, allow_remote_fs_zero_copy_replication): continue raise e - # clear old temporary directories wakes up every 1 second + # Function to clear old temporary directories wakes up every 1 second, sleep to make sure it is called time.sleep(0.5) stop_alter = True alter_thread.join() + + assert check_count() == "1000\n" From e1ceb01ad0b8988ec70818472740fdeff5acf112 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Wed, 7 Jun 2023 19:45:27 +0200 Subject: [PATCH 0626/1072] Parallel replicas for short queries (#50639) --- src/Core/Settings.h | 1 + src/Storages/MergeTree/MergeTreeData.cpp | 39 +++++++++++++++++++ src/Storages/MergeTree/MergeTreeData.h | 7 ++++ tests/broken_tests.txt | 2 +- ...lel_replicas_automatic_disabling.reference | 2 + ..._parallel_replicas_automatic_disabling.sql | 15 +++++++ 6 files changed, 65 insertions(+), 1 deletion(-) create mode 100644 tests/queries/0_stateless/02784_parallel_replicas_automatic_disabling.reference create mode 100644 tests/queries/0_stateless/02784_parallel_replicas_automatic_disabling.sql diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 0037acedede..a87e321bed2 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -160,6 +160,7 @@ class IColumn; M(UInt64, allow_experimental_parallel_reading_from_replicas, 0, "Use all the replicas from a shard for SELECT query execution. Reading is parallelized and coordinated dynamically. 0 - disabled, 1 - enabled, silently disable them in case of failure, 2 - enabled, throw an exception in case of failure", 0) \ M(Float, parallel_replicas_single_task_marks_count_multiplier, 2, "A multiplier which will be added during calculation for minimal number of marks to retrieve from coordinator. This will be applied only for remote replicas.", 0) \ M(Bool, parallel_replicas_for_non_replicated_merge_tree, false, "If true, ClickHouse will use parallel replicas algorithm also for non-replicated MergeTree tables", 0) \ + M(UInt64, parallel_replicas_min_number_of_granules_to_enable, 0, "If the number of marks to read is less than the value of this setting - parallel replicas will be disabled", 0) \ \ M(Bool, skip_unavailable_shards, false, "If true, ClickHouse silently skips unavailable shards and nodes unresolvable through DNS. Shard is marked as unavailable when none of the replicas can be reached.", 0) \ \ diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index e806e1bb93f..2f6870f8b41 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -7154,6 +7154,9 @@ QueryProcessingStage::Enum MergeTreeData::getQueryProcessingStage( /// Parallel replicas if (query_context->canUseParallelReplicasOnInitiator() && to_stage >= QueryProcessingStage::WithMergeableState) { + if (!canUseParallelReplicasBasedOnPKAnalysis(query_context, storage_snapshot, query_info)) + return QueryProcessingStage::Enum::FetchColumns; + /// ReplicatedMergeTree if (supportsReplication()) return QueryProcessingStage::Enum::WithMergeableState; @@ -7179,6 +7182,42 @@ QueryProcessingStage::Enum MergeTreeData::getQueryProcessingStage( } +bool MergeTreeData::canUseParallelReplicasBasedOnPKAnalysis( + ContextPtr query_context, + const StorageSnapshotPtr & storage_snapshot, + SelectQueryInfo & query_info) const +{ + const auto & snapshot_data = assert_cast(*storage_snapshot->data); + const auto & parts = snapshot_data.parts; + + MergeTreeDataSelectExecutor reader(*this); + auto result_ptr = reader.estimateNumMarksToRead( + parts, + query_info.prewhere_info, + storage_snapshot->getMetadataForQuery()->getColumns().getAll().getNames(), + storage_snapshot->metadata, + storage_snapshot->metadata, + query_info, + /*added_filter_nodes*/ActionDAGNodes{}, + query_context, + query_context->getSettingsRef().max_threads); + + if (result_ptr->error()) + std::rethrow_exception(std::get(result_ptr->result)); + + LOG_TRACE(log, "Estimated number of granules to read is {}", result_ptr->marks()); + + bool decision = result_ptr->marks() >= query_context->getSettingsRef().parallel_replicas_min_number_of_granules_to_enable; + + if (!decision) + LOG_DEBUG(log, "Parallel replicas will be disabled, because the estimated number of granules to read {} is less than the threshold which is {}", + result_ptr->marks(), + query_context->getSettingsRef().parallel_replicas_min_number_of_granules_to_enable); + + return decision; +} + + MergeTreeData & MergeTreeData::checkStructureAndGetMergeTreeData(IStorage & source_table, const StorageMetadataPtr & src_snapshot, const StorageMetadataPtr & my_snapshot) const { MergeTreeData * src_data = dynamic_cast(&source_table); diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index 2f254f9a787..b1e1e43bd0b 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -1536,6 +1536,13 @@ private: static MutableDataPartPtr asMutableDeletingPart(const DataPartPtr & part); mutable TemporaryParts temporary_parts; + + /// Estimate the number of marks to read to make a decision whether to enable parallel replicas (distributed processing) or not + /// Note: it could be very rough. + bool canUseParallelReplicasBasedOnPKAnalysis( + ContextPtr query_context, + const StorageSnapshotPtr & storage_snapshot, + SelectQueryInfo & query_info) const; }; /// RAII struct to record big parts that are submerging or emerging. diff --git a/tests/broken_tests.txt b/tests/broken_tests.txt index 02935712325..7ee497973b8 100644 --- a/tests/broken_tests.txt +++ b/tests/broken_tests.txt @@ -137,4 +137,4 @@ 02534_s3_cluster_insert_select_schema_inference 02764_parallel_replicas_plain_merge_tree 02765_parallel_replicas_final_modifier - +02784_parallel_replicas_automatic_disabling diff --git a/tests/queries/0_stateless/02784_parallel_replicas_automatic_disabling.reference b/tests/queries/0_stateless/02784_parallel_replicas_automatic_disabling.reference new file mode 100644 index 00000000000..af81158ecae --- /dev/null +++ b/tests/queries/0_stateless/02784_parallel_replicas_automatic_disabling.reference @@ -0,0 +1,2 @@ +10 +1 diff --git a/tests/queries/0_stateless/02784_parallel_replicas_automatic_disabling.sql b/tests/queries/0_stateless/02784_parallel_replicas_automatic_disabling.sql new file mode 100644 index 00000000000..b2f674ddb64 --- /dev/null +++ b/tests/queries/0_stateless/02784_parallel_replicas_automatic_disabling.sql @@ -0,0 +1,15 @@ +DROP TABLE IF EXISTS test_parallel_replicas_automatic_disabling; +CREATE TABLE test_parallel_replicas_automatic_disabling (n UInt64) ENGINE=MergeTree() ORDER BY tuple(); +INSERT INTO test_parallel_replicas_automatic_disabling SELECT * FROM numbers(10); + +SYSTEM FLUSH LOGS; + +SET skip_unavailable_shards=1, allow_experimental_parallel_reading_from_replicas=1, max_parallel_replicas=3, use_hedged_requests=0, cluster_for_parallel_replicas='parallel_replicas', parallel_replicas_for_non_replicated_merge_tree=1, parallel_replicas_min_number_of_granules_to_enable=10000; +SET send_logs_level='error'; +SELECT count() FROM test_parallel_replicas_automatic_disabling WHERE NOT ignore(*); + +SYSTEM FLUSH LOGS; + +SELECT count() > 0 FROM system.text_log WHERE event_time >= now() - INTERVAL 2 MINUTE AND message LIKE '%Parallel replicas will be disabled, because the estimated number of granules to read%'; + +DROP TABLE test_parallel_replicas_automatic_disabling; From c0f2141bd0432c7ebcab5ee0ef033141194fd59d Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Wed, 7 Jun 2023 19:51:41 +0200 Subject: [PATCH 0627/1072] Revert "date_trunc function to always return DateTime type" (#50670) --- src/Functions/date_trunc.cpp | 98 ++++++++----------- .../00189_time_zones_long.reference | 56 +++++------ ...21_datetime64_compatibility_long.reference | 8 +- 3 files changed, 71 insertions(+), 91 deletions(-) diff --git a/src/Functions/date_trunc.cpp b/src/Functions/date_trunc.cpp index 87fff0b7f3c..016b8f4da5e 100644 --- a/src/Functions/date_trunc.cpp +++ b/src/Functions/date_trunc.cpp @@ -1,6 +1,6 @@ #include -#include #include +#include #include #include #include @@ -25,7 +25,7 @@ class FunctionDateTrunc : public IFunction public: static constexpr auto name = "dateTrunc"; - explicit FunctionDateTrunc(ContextPtr context_) : context(context_) { } + explicit FunctionDateTrunc(ContextPtr context_) : context(context_) {} static FunctionPtr create(ContextPtr context) { return std::make_shared(context); } @@ -39,58 +39,51 @@ public: { /// The first argument is a constant string with the name of datepart. - intermediate_type_is_date = false; + auto result_type_is_date = false; String datepart_param; - auto check_first_argument = [&] - { + auto check_first_argument = [&] { const ColumnConst * datepart_column = checkAndGetColumnConst(arguments[0].column.get()); if (!datepart_column) - throw Exception( - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "First argument for function {} must be constant string: " - "name of datepart", - getName()); + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "First argument for function {} must be constant string: " + "name of datepart", getName()); datepart_param = datepart_column->getValue(); if (datepart_param.empty()) - throw Exception( - ErrorCodes::BAD_ARGUMENTS, "First argument (name of datepart) for function {} cannot be empty", getName()); + throw Exception(ErrorCodes::BAD_ARGUMENTS, "First argument (name of datepart) for function {} cannot be empty", + getName()); if (!IntervalKind::tryParseString(datepart_param, datepart_kind)) throw Exception(ErrorCodes::BAD_ARGUMENTS, "{} doesn't look like datepart name in {}", datepart_param, getName()); - intermediate_type_is_date = (datepart_kind == IntervalKind::Year) || (datepart_kind == IntervalKind::Quarter) - || (datepart_kind == IntervalKind::Month) || (datepart_kind == IntervalKind::Week); + result_type_is_date = (datepart_kind == IntervalKind::Year) + || (datepart_kind == IntervalKind::Quarter) || (datepart_kind == IntervalKind::Month) + || (datepart_kind == IntervalKind::Week); }; bool second_argument_is_date = false; - auto check_second_argument = [&] - { + auto check_second_argument = [&] { if (!isDate(arguments[1].type) && !isDateTime(arguments[1].type) && !isDateTime64(arguments[1].type)) - throw Exception( - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Illegal type {} of 2nd argument of function {}. " - "Should be a date or a date with time", - arguments[1].type->getName(), - getName()); + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of 2nd argument of function {}. " + "Should be a date or a date with time", arguments[1].type->getName(), getName()); second_argument_is_date = isDate(arguments[1].type); - if (second_argument_is_date - && ((datepart_kind == IntervalKind::Hour) || (datepart_kind == IntervalKind::Minute) - || (datepart_kind == IntervalKind::Second))) + if (second_argument_is_date && ((datepart_kind == IntervalKind::Hour) + || (datepart_kind == IntervalKind::Minute) || (datepart_kind == IntervalKind::Second))) throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type Date of argument for function {}", getName()); }; - auto check_timezone_argument = [&] - { + auto check_timezone_argument = [&] { if (!WhichDataType(arguments[2].type).isString()) - throw Exception( - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Illegal type {} of argument of function {}. " + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument of function {}. " "This argument is optional and must be a constant string with timezone name", - arguments[2].type->getName(), - getName()); + arguments[2].type->getName(), getName()); + + if (second_argument_is_date && result_type_is_date) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "The timezone argument of function {} with datepart '{}' " + "is allowed only when the 2nd argument has the type DateTime", + getName(), datepart_param); }; if (arguments.size() == 2) @@ -106,14 +99,15 @@ public: } else { - throw Exception( - ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Number of arguments for function {} doesn't match: passed {}, should be 2 or 3", - getName(), - arguments.size()); + getName(), arguments.size()); } - return std::make_shared(extractTimeZoneNameFromFunctionArguments(arguments, 2, 1)); + if (result_type_is_date) + return std::make_shared(); + else + return std::make_shared(extractTimeZoneNameFromFunctionArguments(arguments, 2, 1)); } bool useDefaultImplementationForConstants() const override { return true; } @@ -130,40 +124,26 @@ public: auto to_start_of_interval = FunctionFactory::instance().get("toStartOfInterval", context); - ColumnPtr truncated_column; - auto date_type = std::make_shared(); - if (arguments.size() == 2) - truncated_column = to_start_of_interval->build(temp_columns) - ->execute(temp_columns, intermediate_type_is_date ? date_type : result_type, input_rows_count); - else - { - temp_columns[2] = arguments[2]; - truncated_column = to_start_of_interval->build(temp_columns) - ->execute(temp_columns, intermediate_type_is_date ? date_type : result_type, input_rows_count); - } + return to_start_of_interval->build(temp_columns)->execute(temp_columns, result_type, input_rows_count); - if (!intermediate_type_is_date) - return truncated_column; - - ColumnsWithTypeAndName temp_truncated_column(1); - temp_truncated_column[0] = {truncated_column, date_type, ""}; - - auto to_date_time_or_default = FunctionFactory::instance().get("toDateTime", context); - return to_date_time_or_default->build(temp_truncated_column)->execute(temp_truncated_column, result_type, input_rows_count); + temp_columns[2] = arguments[2]; + return to_start_of_interval->build(temp_columns)->execute(temp_columns, result_type, input_rows_count); } - bool hasInformationAboutMonotonicity() const override { return true; } + bool hasInformationAboutMonotonicity() const override + { + return true; + } Monotonicity getMonotonicityForRange(const IDataType &, const Field &, const Field &) const override { - return {.is_monotonic = true, .is_always_monotonic = true}; + return { .is_monotonic = true, .is_always_monotonic = true }; } private: ContextPtr context; mutable IntervalKind::Kind datepart_kind = IntervalKind::Kind::Second; - mutable bool intermediate_type_is_date = false; }; } diff --git a/tests/queries/0_stateless/00189_time_zones_long.reference b/tests/queries/0_stateless/00189_time_zones_long.reference index a4287217a19..7d2ad3c8a01 100644 --- a/tests/queries/0_stateless/00189_time_zones_long.reference +++ b/tests/queries/0_stateless/00189_time_zones_long.reference @@ -258,18 +258,18 @@ toUnixTimestamp 1426415400 1426415400 date_trunc -2019-01-01 00:00:00 -2020-01-01 00:00:00 -2020-01-01 00:00:00 -2019-10-01 00:00:00 -2020-01-01 00:00:00 -2020-01-01 00:00:00 -2019-12-01 00:00:00 -2020-01-01 00:00:00 -2020-01-01 00:00:00 -2019-12-30 00:00:00 -2019-12-30 00:00:00 -2019-12-30 00:00:00 +2019-01-01 +2020-01-01 +2020-01-01 +2019-10-01 +2020-01-01 +2020-01-01 +2019-12-01 +2020-01-01 +2020-01-01 +2019-12-30 +2019-12-30 +2019-12-30 2019-12-31 00:00:00 2020-01-01 00:00:00 2020-01-02 00:00:00 @@ -282,18 +282,18 @@ date_trunc 2019-12-31 20:11:22 2020-01-01 12:11:22 2020-01-02 05:11:22 -2019-01-01 00:00:00 -2020-01-01 00:00:00 -2020-01-01 00:00:00 -2019-10-01 00:00:00 -2020-01-01 00:00:00 -2020-01-01 00:00:00 -2019-12-01 00:00:00 -2020-01-01 00:00:00 -2020-01-01 00:00:00 -2019-12-30 00:00:00 -2019-12-30 00:00:00 -2019-12-30 00:00:00 +2019-01-01 +2020-01-01 +2020-01-01 +2019-10-01 +2020-01-01 +2020-01-01 +2019-12-01 +2020-01-01 +2020-01-01 +2019-12-30 +2019-12-30 +2019-12-30 2019-12-31 00:00:00 2020-01-01 00:00:00 2020-01-02 00:00:00 @@ -306,8 +306,8 @@ date_trunc 2019-12-31 20:11:22 2020-01-01 12:11:22 2020-01-02 05:11:22 -2020-01-01 00:00:00 -2020-01-01 00:00:00 -2020-01-01 00:00:00 -2019-12-30 00:00:00 +2020-01-01 +2020-01-01 +2020-01-01 +2019-12-30 2020-01-01 00:00:00 diff --git a/tests/queries/0_stateless/00921_datetime64_compatibility_long.reference b/tests/queries/0_stateless/00921_datetime64_compatibility_long.reference index 2f56230db37..a946a114bf4 100644 --- a/tests/queries/0_stateless/00921_datetime64_compatibility_long.reference +++ b/tests/queries/0_stateless/00921_datetime64_compatibility_long.reference @@ -135,13 +135,13 @@ Code: 43 ------------------------------------------ SELECT date_trunc(\'year\', N, \'Asia/Istanbul\') Code: 43 -"DateTime('Asia/Istanbul')","2019-01-01 00:00:00" -"DateTime('Asia/Istanbul')","2019-01-01 00:00:00" +"Date","2019-01-01" +"Date","2019-01-01" ------------------------------------------ SELECT date_trunc(\'month\', N, \'Asia/Istanbul\') Code: 43 -"DateTime('Asia/Istanbul')","2019-09-01 00:00:00" -"DateTime('Asia/Istanbul')","2019-09-01 00:00:00" +"Date","2019-09-01" +"Date","2019-09-01" ------------------------------------------ SELECT date_trunc(\'day\', N, \'Asia/Istanbul\') "DateTime('Asia/Istanbul')","2019-09-16 00:00:00" From 7d5b98f0288fdebedf1984fce168260023448676 Mon Sep 17 00:00:00 2001 From: Alexander Sapin Date: Wed, 7 Jun 2023 20:03:17 +0200 Subject: [PATCH 0628/1072] Remove logging add sleeps --- src/Storages/StorageAzureBlob.cpp | 8 -------- src/TableFunctions/TableFunctionAzureBlobStorage.cpp | 2 -- tests/integration/test_storage_azure_blob_storage/test.py | 1 + 3 files changed, 1 insertion(+), 10 deletions(-) diff --git a/src/Storages/StorageAzureBlob.cpp b/src/Storages/StorageAzureBlob.cpp index 17374ba2d92..4901f6701fb 100644 --- a/src/Storages/StorageAzureBlob.cpp +++ b/src/Storages/StorageAzureBlob.cpp @@ -124,8 +124,6 @@ void StorageAzureBlob::processNamedCollectionResult(StorageAzureBlob::Configurat StorageAzureBlob::Configuration StorageAzureBlob::getConfiguration(ASTs & engine_args, ContextPtr local_context, bool get_format_from_file) { - LOG_INFO(&Poco::Logger::get("StorageAzureBlob"), "get_format_from_file = {}", get_format_from_file); - StorageAzureBlob::Configuration configuration; /// Supported signatures: @@ -857,7 +855,6 @@ StorageAzureBlobSource::Iterator::Iterator( } else { - LOG_DEBUG(&Poco::Logger::get("DEBUG"), "GLOBS BRANCH"); const String key_prefix = blob_path_with_globs->substr(0, blob_path_with_globs->find_first_of("*?{")); /// We don't have to list bucket, because there is no asterisks. @@ -870,11 +867,8 @@ StorageAzureBlobSource::Iterator::Iterator( return; } - LOG_DEBUG(&Poco::Logger::get("DEBUG"), "KEY PREFIX {}", key_prefix); object_storage_iterator = object_storage->iterate(key_prefix); - LOG_DEBUG(&Poco::Logger::get("DEBUG"), "BLOBS BLOBS{}", *blob_path_with_globs); - LOG_DEBUG(&Poco::Logger::get("DEBUG"), "REGEXP PATTERN {}", makeRegexpPatternFromGlobs(*blob_path_with_globs)); matcher = std::make_unique(makeRegexpPatternFromGlobs(*blob_path_with_globs)); if (!matcher->ok()) @@ -1105,7 +1099,6 @@ String StorageAzureBlobSource::getName() const StorageAzureBlobSource::ReaderHolder StorageAzureBlobSource::createReader() { auto [current_key, info] = file_iterator->next(); - LOG_DEBUG(log, "KEY {} SIZE {}", current_key, info.size_bytes); if (current_key.empty()) return {}; @@ -1118,7 +1111,6 @@ StorageAzureBlobSource::ReaderHolder StorageAzureBlobSource::createReader() format_settings, std::nullopt, std::nullopt, /* is_remote_fs */ true, compression_method); - LOG_DEBUG(log, "FORMAT {}", format); QueryPipelineBuilder builder; builder.init(Pipe(input_format)); diff --git a/src/TableFunctions/TableFunctionAzureBlobStorage.cpp b/src/TableFunctions/TableFunctionAzureBlobStorage.cpp index 986ad07fdde..265092ddefa 100644 --- a/src/TableFunctions/TableFunctionAzureBlobStorage.cpp +++ b/src/TableFunctions/TableFunctionAzureBlobStorage.cpp @@ -78,7 +78,6 @@ StorageAzureBlob::Configuration TableFunctionAzureBlobStorage::parseArgumentsImp std::unordered_map engine_args_to_idx; configuration.connection_url = checkAndGetLiteralArgument(engine_args[0], "connection_string/storage_account_url"); - LOG_DEBUG(&Poco::Logger::get("DEBUG"), "CONFIGURATION {}", configuration.connection_url); configuration.is_connection_string = isConnectionString(configuration.connection_url); configuration.container = checkAndGetLiteralArgument(engine_args[1], "container"); @@ -194,7 +193,6 @@ void TableFunctionAzureBlobStorage::parseArguments(const ASTPtr & ast_function, auto & args = args_func.at(0)->children; configuration = parseArgumentsImpl(args, context); - LOG_DEBUG(&Poco::Logger::get("DEBUG"), "CONFIGURATION {}", configuration.connection_url); } ColumnsDescription TableFunctionAzureBlobStorage::getActualTableStructure(ContextPtr context) const diff --git a/tests/integration/test_storage_azure_blob_storage/test.py b/tests/integration/test_storage_azure_blob_storage/test.py index 319500e6226..a7ca049e809 100644 --- a/tests/integration/test_storage_azure_blob_storage/test.py +++ b/tests/integration/test_storage_azure_blob_storage/test.py @@ -47,6 +47,7 @@ def azure_query(node, query, try_num=3, settings={}): if error in str(ex): retry = True print(f"Try num: {i}. Having retriable error: {ex}") + time.sleep(i) break if not retry or i == try_num - 1: raise Exception(ex) From f3b5a87a66baa6ffed9cba8caa52e4c4a63cfc3d Mon Sep 17 00:00:00 2001 From: vdimir Date: Tue, 6 Jun 2023 19:03:23 +0000 Subject: [PATCH 0629/1072] Fixing crash in anti/semi join --- src/Interpreters/HashJoin.cpp | 12 ++-- src/Interpreters/TableJoin.h | 15 ++++- src/Planner/PlannerJoinTree.cpp | 7 ++- .../02771_semi_join_use_nulls.reference | 0 .../02771_semi_join_use_nulls.sql.j2 | 60 +++++++++++++++++++ 5 files changed, 85 insertions(+), 9 deletions(-) create mode 100644 tests/queries/0_stateless/02771_semi_join_use_nulls.reference create mode 100644 tests/queries/0_stateless/02771_semi_join_use_nulls.sql.j2 diff --git a/src/Interpreters/HashJoin.cpp b/src/Interpreters/HashJoin.cpp index 9306c9b99eb..191dd74928d 100644 --- a/src/Interpreters/HashJoin.cpp +++ b/src/Interpreters/HashJoin.cpp @@ -679,7 +679,7 @@ void HashJoin::initRightBlockStructure(Block & saved_block_sample) /// We could remove key columns for LEFT | INNER HashJoin but we should keep them for JoinSwitcher (if any). bool save_key_columns = table_join->isEnabledAlgorithm(JoinAlgorithm::AUTO) || table_join->isEnabledAlgorithm(JoinAlgorithm::GRACE_HASH) || - isRightOrFull(kind) || + isRightOrFull(kind) || strictness == JoinStrictness::Semi || strictness == JoinStrictness::Anti || multiple_disjuncts; if (save_key_columns) { @@ -707,8 +707,9 @@ Block HashJoin::prepareRightBlock(const Block & block, const Block & saved_block for (const auto & sample_column : saved_block_sample_.getColumnsWithTypeAndName()) { ColumnWithTypeAndName column = block.getByName(sample_column.name); - if (sample_column.column->isNullable()) - JoinCommon::convertColumnToNullable(column); + + /// There's no optimization for right side const columns. Remove constness if any. + column.column = recursiveRemoveSparse(column.column->convertToFullColumnIfConst()); if (column.column->lowCardinality() && !sample_column.column->lowCardinality()) { @@ -716,8 +717,9 @@ Block HashJoin::prepareRightBlock(const Block & block, const Block & saved_block column.type = removeLowCardinality(column.type); } - /// There's no optimization for right side const columns. Remove constness if any. - column.column = recursiveRemoveSparse(column.column->convertToFullColumnIfConst()); + if (sample_column.column->isNullable()) + JoinCommon::convertColumnToNullable(column); + structured_block.insert(std::move(column)); } diff --git a/src/Interpreters/TableJoin.h b/src/Interpreters/TableJoin.h index 0e0c905e30c..4a020684793 100644 --- a/src/Interpreters/TableJoin.h +++ b/src/Interpreters/TableJoin.h @@ -232,8 +232,19 @@ public: bool allowParallelHashJoin() const; bool joinUseNulls() const { return join_use_nulls; } - bool forceNullableRight() const { return join_use_nulls && isLeftOrFull(kind()); } - bool forceNullableLeft() const { return join_use_nulls && isRightOrFull(kind()); } + + /// Join use nulls doen't make sense for semi and anti joins + /// Only columns from corresponding table should be used, values in other table are undefined. + bool forceNullableRight() const + { + return join_use_nulls && isLeftOrFull(kind()) && strictness() != JoinStrictness::Semi && strictness() != JoinStrictness::Anti; + } + + bool forceNullableLeft() const + { + return join_use_nulls && isRightOrFull(kind()) && strictness() != JoinStrictness::Semi && strictness() != JoinStrictness::Anti; + } + size_t defaultMaxBytes() const { return default_max_bytes; } size_t maxJoinedBlockRows() const { return max_joined_block_rows; } size_t maxRowsInRightBlock() const { return partial_merge_join_rows_in_right_blocks; } diff --git a/src/Planner/PlannerJoinTree.cpp b/src/Planner/PlannerJoinTree.cpp index 9672738ae6b..c0c6d301d88 100644 --- a/src/Planner/PlannerJoinTree.cpp +++ b/src/Planner/PlannerJoinTree.cpp @@ -873,10 +873,11 @@ JoinTreeQueryPlan buildQueryPlanForJoinNode(const QueryTreeNodePtr & join_table_ JoinClausesAndActions join_clauses_and_actions; JoinKind join_kind = join_node.getKind(); + JoinStrictness join_strictness = join_node.getStrictness(); std::optional join_constant; - if (join_node.getStrictness() == JoinStrictness::All) + if (join_strictness == JoinStrictness::All) join_constant = tryExtractConstantFromJoinNode(join_table_expression); if (join_constant) @@ -996,7 +997,9 @@ JoinTreeQueryPlan buildQueryPlanForJoinNode(const QueryTreeNodePtr & join_table_ plan_to_add_cast.addStep(std::move(cast_join_columns_step)); }; - if (join_use_nulls) + /// Join use nulls doen't make sense for semi and anti joins + /// Only columns from corresponding table should be used, values in other table are undefined. + if (join_use_nulls && join_strictness != JoinStrictness::Semi && join_strictness != JoinStrictness::Anti) { if (isFull(join_kind)) { diff --git a/tests/queries/0_stateless/02771_semi_join_use_nulls.reference b/tests/queries/0_stateless/02771_semi_join_use_nulls.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02771_semi_join_use_nulls.sql.j2 b/tests/queries/0_stateless/02771_semi_join_use_nulls.sql.j2 new file mode 100644 index 00000000000..308df9a6094 --- /dev/null +++ b/tests/queries/0_stateless/02771_semi_join_use_nulls.sql.j2 @@ -0,0 +1,60 @@ + +{% for allow_experimental_analyzer in [0, 1] -%} +{% for join_use_nulls in [0, 1] -%} +{% for kind in ['LEFT', 'RIGHT'] -%} +{% for strictness in ['SEMI', 'ANTI'] -%} +{% for maybe_materialize in ['', 'materialize'] -%} + +SET allow_experimental_analyzer = {{ allow_experimental_analyzer }}; + +SET join_use_nulls = {{ join_use_nulls }}; + +-- FORMAT Null is used because one side is undefined (e.g. for SEMI LEFT only columns from the left side contain values) + +SELECT id > 1, d.id FROM (SELECT {{ maybe_materialize }}(toLowCardinality(0)) AS id, 1 AS value) AS a +{{ strictness }} {{ kind }} JOIN (SELECT {{ maybe_materialize }}(toLowCardinality(0)) AS id) AS d +USING (id) +FORMAT Null +; + +SELECT id > 1, d.id FROM (SELECT {{ maybe_materialize }}(toLowCardinality(toNullable(0))) AS id, 1 AS value) AS a +{{ strictness }} {{ kind }} JOIN (SELECT {{ maybe_materialize }}(toLowCardinality(0)) AS id) AS d +USING (id) +FORMAT Null +; + +SELECT id > 1, d.id FROM (SELECT {{ maybe_materialize }}(toLowCardinality(0)) AS id, 1 AS value) AS a +{{ strictness }} {{ kind }} JOIN (SELECT {{ maybe_materialize }}(toLowCardinality(toNullable(0))) AS id) AS d +USING (id) +FORMAT Null +; + +SELECT id > 1, d.id FROM (SELECT {{ maybe_materialize }}(toLowCardinality(toNullable(0))) AS id, 1 AS value) AS a +{{ strictness }} {{ kind }} JOIN (SELECT {{ maybe_materialize }}(toLowCardinality(toNullable(0))) AS id) AS d +USING (id) +FORMAT Null +; + +SELECT id > 1, d.id FROM (SELECT {{ maybe_materialize }}(toLowCardinality(1)) AS id, 1 AS value) AS a +{{ strictness }} {{ kind }} JOIN (SELECT {{ maybe_materialize }}(toLowCardinality(2)) AS id) AS d +USING (id) +FORMAT Null +; + +SELECT id > 1, d.id FROM (SELECT {{ maybe_materialize }}(toLowCardinality(0)) AS id, 1 AS value) AS a +{{ strictness }} {{ kind }} JOIN (SELECT {{ maybe_materialize }}(toLowCardinality(0)) AS id) AS d +ON a.id = d.id +FORMAT Null +; + +SELECT id > 1, d.idd FROM (SELECT {{ maybe_materialize }}(toLowCardinality(0)) AS id, 1 AS value) AS a +{{ strictness }} {{ kind }} JOIN (SELECT {{ maybe_materialize }}(toLowCardinality(0)) AS idd) AS d +ON a.id = d.idd +FORMAT Null +; + +{% endfor -%} +{% endfor -%} +{% endfor -%} +{% endfor -%} +{% endfor -%} From a268250aff8fad35828641d6242c9d6d0e9ee3a3 Mon Sep 17 00:00:00 2001 From: vdimir Date: Wed, 7 Jun 2023 13:25:06 +0000 Subject: [PATCH 0630/1072] Fixes for LowCardinality Nullable in HashJoin --- src/Columns/ColumnLowCardinality.cpp | 5 + src/Columns/tests/gtest_low_cardinality.cpp | 13 ++ src/Interpreters/HashJoin.cpp | 2 +- src/Interpreters/JoinUtils.cpp | 18 +-- src/Interpreters/TableJoin.h | 6 +- src/Planner/PlannerJoinTree.cpp | 4 +- .../02771_semi_join_use_nulls.reference | 112 ++++++++++++++++++ .../02771_semi_join_use_nulls.sql.j2 | 9 -- 8 files changed, 140 insertions(+), 29 deletions(-) diff --git a/src/Columns/ColumnLowCardinality.cpp b/src/Columns/ColumnLowCardinality.cpp index 4f9ab8215be..9269ea4ee4d 100644 --- a/src/Columns/ColumnLowCardinality.cpp +++ b/src/Columns/ColumnLowCardinality.cpp @@ -313,6 +313,11 @@ MutableColumnPtr ColumnLowCardinality::cloneResized(size_t size) const MutableColumnPtr ColumnLowCardinality::cloneNullable() const { auto res = cloneFinalized(); + /* Compact required not to share dictionary. + * If `shared` flag is not set `cloneFinalized` will return shallow copy + * and `nestedToNullable` will mutate source column. + */ + assert_cast(*res).compactInplace(); assert_cast(*res).nestedToNullable(); return res; } diff --git a/src/Columns/tests/gtest_low_cardinality.cpp b/src/Columns/tests/gtest_low_cardinality.cpp index 3ffc88f6a7d..5e01279b7df 100644 --- a/src/Columns/tests/gtest_low_cardinality.cpp +++ b/src/Columns/tests/gtest_low_cardinality.cpp @@ -48,3 +48,16 @@ TEST(ColumnLowCardinality, Insert) testLowCardinalityNumberInsert(std::make_shared()); testLowCardinalityNumberInsert(std::make_shared()); } + +TEST(ColumnLowCardinality, Clone) +{ + auto data_type = std::make_shared(); + auto low_cardinality_type = std::make_shared(data_type); + auto column = low_cardinality_type->createColumn(); + ASSERT_FALSE(assert_cast(*column).nestedIsNullable()); + + auto nullable_column = assert_cast(*column).cloneNullable(); + + ASSERT_TRUE(assert_cast(*nullable_column).nestedIsNullable()); + ASSERT_FALSE(assert_cast(*column).nestedIsNullable()); +} diff --git a/src/Interpreters/HashJoin.cpp b/src/Interpreters/HashJoin.cpp index 191dd74928d..c4d1615a119 100644 --- a/src/Interpreters/HashJoin.cpp +++ b/src/Interpreters/HashJoin.cpp @@ -679,7 +679,7 @@ void HashJoin::initRightBlockStructure(Block & saved_block_sample) /// We could remove key columns for LEFT | INNER HashJoin but we should keep them for JoinSwitcher (if any). bool save_key_columns = table_join->isEnabledAlgorithm(JoinAlgorithm::AUTO) || table_join->isEnabledAlgorithm(JoinAlgorithm::GRACE_HASH) || - isRightOrFull(kind) || strictness == JoinStrictness::Semi || strictness == JoinStrictness::Anti || + isRightOrFull(kind) || multiple_disjuncts; if (save_key_columns) { diff --git a/src/Interpreters/JoinUtils.cpp b/src/Interpreters/JoinUtils.cpp index a05b58e14a1..67ee2a64264 100644 --- a/src/Interpreters/JoinUtils.cpp +++ b/src/Interpreters/JoinUtils.cpp @@ -160,16 +160,14 @@ static ColumnPtr tryConvertColumnToNullable(ColumnPtr col) if (col->lowCardinality()) { - auto mut_col = IColumn::mutate(std::move(col)); - ColumnLowCardinality * col_lc = assert_cast(mut_col.get()); - if (col_lc->nestedIsNullable()) + const ColumnLowCardinality & col_lc = assert_cast(*col); + if (col_lc.nestedIsNullable()) { - return mut_col; + return col; } - else if (col_lc->nestedCanBeInsideNullable()) + else if (col_lc.nestedCanBeInsideNullable()) { - col_lc->nestedToNullable(); - return mut_col; + return col_lc.cloneNullable(); } } else if (const ColumnConst * col_const = checkAndGetColumn(*col)) @@ -232,11 +230,7 @@ void removeColumnNullability(ColumnWithTypeAndName & column) if (column.column && column.column->lowCardinality()) { - auto mut_col = IColumn::mutate(std::move(column.column)); - ColumnLowCardinality * col_as_lc = typeid_cast(mut_col.get()); - if (col_as_lc && col_as_lc->nestedIsNullable()) - col_as_lc->nestedRemoveNullable(); - column.column = std::move(mut_col); + column.column = assert_cast(column.column.get())->cloneWithDefaultOnNull(); } } else diff --git a/src/Interpreters/TableJoin.h b/src/Interpreters/TableJoin.h index 4a020684793..6737cd8f13a 100644 --- a/src/Interpreters/TableJoin.h +++ b/src/Interpreters/TableJoin.h @@ -233,16 +233,14 @@ public: bool joinUseNulls() const { return join_use_nulls; } - /// Join use nulls doen't make sense for semi and anti joins - /// Only columns from corresponding table should be used, values in other table are undefined. bool forceNullableRight() const { - return join_use_nulls && isLeftOrFull(kind()) && strictness() != JoinStrictness::Semi && strictness() != JoinStrictness::Anti; + return join_use_nulls && isLeftOrFull(kind()); } bool forceNullableLeft() const { - return join_use_nulls && isRightOrFull(kind()) && strictness() != JoinStrictness::Semi && strictness() != JoinStrictness::Anti; + return join_use_nulls && isRightOrFull(kind()); } size_t defaultMaxBytes() const { return default_max_bytes; } diff --git a/src/Planner/PlannerJoinTree.cpp b/src/Planner/PlannerJoinTree.cpp index c0c6d301d88..d875cc38bce 100644 --- a/src/Planner/PlannerJoinTree.cpp +++ b/src/Planner/PlannerJoinTree.cpp @@ -997,9 +997,7 @@ JoinTreeQueryPlan buildQueryPlanForJoinNode(const QueryTreeNodePtr & join_table_ plan_to_add_cast.addStep(std::move(cast_join_columns_step)); }; - /// Join use nulls doen't make sense for semi and anti joins - /// Only columns from corresponding table should be used, values in other table are undefined. - if (join_use_nulls && join_strictness != JoinStrictness::Semi && join_strictness != JoinStrictness::Anti) + if (join_use_nulls) { if (isFull(join_kind)) { diff --git a/tests/queries/0_stateless/02771_semi_join_use_nulls.reference b/tests/queries/0_stateless/02771_semi_join_use_nulls.reference index e69de29bb2d..8d4b1a3a75e 100644 --- a/tests/queries/0_stateless/02771_semi_join_use_nulls.reference +++ b/tests/queries/0_stateless/02771_semi_join_use_nulls.reference @@ -0,0 +1,112 @@ +0 0 +0 0 +0 \N +0 0 +0 0 +0 0 +0 0 +0 0 +0 \N +0 0 +0 0 +0 0 +0 1 +0 1 +0 0 +0 0 +0 \N +0 0 +0 0 +0 0 +0 0 +0 0 +0 \N +0 0 +0 0 +0 0 +0 2 +0 2 +0 \N +0 0 +0 \N +0 0 +0 \N +0 \N +0 \N +0 0 +0 \N +0 0 +0 \N +0 \N +0 1 +0 1 +0 0 +0 0 +0 0 +0 0 +0 0 +0 0 +0 0 +0 0 +0 0 +0 0 +0 0 +0 0 +\N 2 +\N 2 +0 0 +0 0 +0 0 +0 0 +0 0 +0 0 +0 0 +0 0 +0 0 +0 0 +0 0 +0 0 +0 1 +0 1 +0 0 +0 0 +0 0 +0 0 +0 0 +0 0 +0 0 +0 0 +0 0 +0 0 +0 0 +0 0 +1 2 +1 2 +0 \N +0 0 +0 0 +0 0 +0 \N +0 \N +0 \N +0 0 +0 0 +0 0 +0 \N +0 \N +0 1 +0 1 +0 0 +0 0 +0 0 +0 0 +0 0 +0 0 +0 0 +0 0 +0 0 +0 0 +0 0 +0 0 +1 2 +1 2 diff --git a/tests/queries/0_stateless/02771_semi_join_use_nulls.sql.j2 b/tests/queries/0_stateless/02771_semi_join_use_nulls.sql.j2 index 308df9a6094..37b2e63761b 100644 --- a/tests/queries/0_stateless/02771_semi_join_use_nulls.sql.j2 +++ b/tests/queries/0_stateless/02771_semi_join_use_nulls.sql.j2 @@ -9,48 +9,39 @@ SET allow_experimental_analyzer = {{ allow_experimental_analyzer }}; SET join_use_nulls = {{ join_use_nulls }}; --- FORMAT Null is used because one side is undefined (e.g. for SEMI LEFT only columns from the left side contain values) - SELECT id > 1, d.id FROM (SELECT {{ maybe_materialize }}(toLowCardinality(0)) AS id, 1 AS value) AS a {{ strictness }} {{ kind }} JOIN (SELECT {{ maybe_materialize }}(toLowCardinality(0)) AS id) AS d USING (id) -FORMAT Null ; SELECT id > 1, d.id FROM (SELECT {{ maybe_materialize }}(toLowCardinality(toNullable(0))) AS id, 1 AS value) AS a {{ strictness }} {{ kind }} JOIN (SELECT {{ maybe_materialize }}(toLowCardinality(0)) AS id) AS d USING (id) -FORMAT Null ; SELECT id > 1, d.id FROM (SELECT {{ maybe_materialize }}(toLowCardinality(0)) AS id, 1 AS value) AS a {{ strictness }} {{ kind }} JOIN (SELECT {{ maybe_materialize }}(toLowCardinality(toNullable(0))) AS id) AS d USING (id) -FORMAT Null ; SELECT id > 1, d.id FROM (SELECT {{ maybe_materialize }}(toLowCardinality(toNullable(0))) AS id, 1 AS value) AS a {{ strictness }} {{ kind }} JOIN (SELECT {{ maybe_materialize }}(toLowCardinality(toNullable(0))) AS id) AS d USING (id) -FORMAT Null ; SELECT id > 1, d.id FROM (SELECT {{ maybe_materialize }}(toLowCardinality(1)) AS id, 1 AS value) AS a {{ strictness }} {{ kind }} JOIN (SELECT {{ maybe_materialize }}(toLowCardinality(2)) AS id) AS d USING (id) -FORMAT Null ; SELECT id > 1, d.id FROM (SELECT {{ maybe_materialize }}(toLowCardinality(0)) AS id, 1 AS value) AS a {{ strictness }} {{ kind }} JOIN (SELECT {{ maybe_materialize }}(toLowCardinality(0)) AS id) AS d ON a.id = d.id -FORMAT Null ; SELECT id > 1, d.idd FROM (SELECT {{ maybe_materialize }}(toLowCardinality(0)) AS id, 1 AS value) AS a {{ strictness }} {{ kind }} JOIN (SELECT {{ maybe_materialize }}(toLowCardinality(0)) AS idd) AS d ON a.id = d.idd -FORMAT Null ; {% endfor -%} From b11f744252b486ca0ba25deeb07181b4025e0edf Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Wed, 7 Jun 2023 20:33:08 +0200 Subject: [PATCH 0631/1072] Correctly disable async insert with deduplication when it's not needed (#50663) * Correctly disable async insert when it's not used * Better * Add comment * Better * Fix tests --------- Co-authored-by: Nikita Mikhaylov --- src/Core/ExternalTable.cpp | 2 +- src/Interpreters/GlobalSubqueriesVisitor.h | 2 +- src/Interpreters/InterpreterInsertQuery.cpp | 4 +-- .../Transforms/CreatingSetsTransform.cpp | 2 +- .../Transforms/buildPushingToViewsChain.cpp | 9 +++--- .../Transforms/buildPushingToViewsChain.h | 2 ++ src/Server/GRPCServer.cpp | 2 +- src/Server/TCPHandler.cpp | 2 +- src/Storages/HDFS/StorageHDFS.cpp | 2 +- src/Storages/HDFS/StorageHDFS.h | 2 +- src/Storages/Hive/StorageHive.cpp | 2 +- src/Storages/Hive/StorageHive.h | 2 +- src/Storages/IStorage.h | 5 +++- src/Storages/Kafka/StorageKafka.cpp | 2 +- src/Storages/Kafka/StorageKafka.h | 3 +- .../MeiliSearch/StorageMeiliSearch.cpp | 2 +- src/Storages/MeiliSearch/StorageMeiliSearch.h | 2 +- src/Storages/NATS/StorageNATS.cpp | 2 +- src/Storages/NATS/StorageNATS.h | 2 +- src/Storages/RabbitMQ/StorageRabbitMQ.cpp | 2 +- src/Storages/RabbitMQ/StorageRabbitMQ.h | 3 +- .../RocksDB/StorageEmbeddedRocksDB.cpp | 2 +- src/Storages/RocksDB/StorageEmbeddedRocksDB.h | 2 +- src/Storages/StorageBuffer.cpp | 2 +- src/Storages/StorageBuffer.h | 2 +- src/Storages/StorageDistributed.cpp | 4 +-- src/Storages/StorageDistributed.h | 2 +- src/Storages/StorageFile.cpp | 3 +- src/Storages/StorageFile.h | 3 +- src/Storages/StorageJoin.cpp | 4 +-- src/Storages/StorageJoin.h | 2 +- src/Storages/StorageKeeperMap.cpp | 2 +- src/Storages/StorageKeeperMap.h | 2 +- src/Storages/StorageLog.cpp | 2 +- src/Storages/StorageLog.h | 2 +- src/Storages/StorageMaterializedMySQL.h | 2 +- src/Storages/StorageMaterializedView.cpp | 4 +-- src/Storages/StorageMaterializedView.h | 2 +- src/Storages/StorageMemory.cpp | 2 +- src/Storages/StorageMemory.h | 2 +- src/Storages/StorageMergeTree.cpp | 2 +- src/Storages/StorageMergeTree.h | 2 +- src/Storages/StorageMongoDB.cpp | 2 +- src/Storages/StorageMongoDB.h | 3 +- src/Storages/StorageMySQL.cpp | 2 +- src/Storages/StorageMySQL.h | 2 +- src/Storages/StorageNull.h | 2 +- src/Storages/StoragePostgreSQL.cpp | 2 +- src/Storages/StoragePostgreSQL.h | 2 +- src/Storages/StorageProxy.h | 4 +-- src/Storages/StorageReplicatedMergeTree.cpp | 4 +-- src/Storages/StorageReplicatedMergeTree.h | 2 +- src/Storages/StorageS3.cpp | 2 +- src/Storages/StorageS3.h | 2 +- src/Storages/StorageSQLite.cpp | 2 +- src/Storages/StorageSQLite.h | 2 +- src/Storages/StorageSet.cpp | 2 +- src/Storages/StorageSet.h | 2 +- src/Storages/StorageStripeLog.cpp | 2 +- src/Storages/StorageStripeLog.h | 2 +- src/Storages/StorageTableFunction.h | 5 ++-- src/Storages/StorageURL.cpp | 2 +- src/Storages/StorageURL.h | 2 +- src/Storages/StorageXDBC.cpp | 2 +- src/Storages/StorageXDBC.h | 2 +- .../System/StorageSystemZooKeeper.cpp | 2 +- src/Storages/System/StorageSystemZooKeeper.h | 2 +- src/Storages/WindowView/StorageWindowView.cpp | 2 +- src/Storages/tests/gtest_storage_log.cpp | 2 +- ...sable_async_with_dedup_correctly.reference | 17 +++++++++++ ...2784_disable_async_with_dedup_correctly.sh | 29 +++++++++++++++++++ 71 files changed, 136 insertions(+), 78 deletions(-) create mode 100644 tests/queries/0_stateless/02784_disable_async_with_dedup_correctly.reference create mode 100755 tests/queries/0_stateless/02784_disable_async_with_dedup_correctly.sh diff --git a/src/Core/ExternalTable.cpp b/src/Core/ExternalTable.cpp index 0f880ed967f..676af280cad 100644 --- a/src/Core/ExternalTable.cpp +++ b/src/Core/ExternalTable.cpp @@ -167,7 +167,7 @@ void ExternalTablesHandler::handlePart(const Poco::Net::MessageHeader & header, auto temporary_table = TemporaryTableHolder(getContext(), ColumnsDescription{columns}, {}); auto storage = temporary_table.getTable(); getContext()->addExternalTable(data->table_name, std::move(temporary_table)); - auto sink = storage->write(ASTPtr(), storage->getInMemoryMetadataPtr(), getContext()); + auto sink = storage->write(ASTPtr(), storage->getInMemoryMetadataPtr(), getContext(), /*async_insert=*/false); /// Write data auto pipeline = QueryPipelineBuilder::getPipeline(std::move(*data->pipe)); diff --git a/src/Interpreters/GlobalSubqueriesVisitor.h b/src/Interpreters/GlobalSubqueriesVisitor.h index 08862032007..2901f2e23d0 100644 --- a/src/Interpreters/GlobalSubqueriesVisitor.h +++ b/src/Interpreters/GlobalSubqueriesVisitor.h @@ -170,7 +170,7 @@ public: else if (getContext()->getSettingsRef().use_index_for_in_with_subqueries) { auto external_table = external_storage_holder->getTable(); - auto table_out = external_table->write({}, external_table->getInMemoryMetadataPtr(), getContext()); + auto table_out = external_table->write({}, external_table->getInMemoryMetadataPtr(), getContext(), /*async_insert=*/false); auto io = interpreter->execute(); io.pipeline.complete(std::move(table_out)); CompletedPipelineExecutor executor(io.pipeline); diff --git a/src/Interpreters/InterpreterInsertQuery.cpp b/src/Interpreters/InterpreterInsertQuery.cpp index e87b16f0e9d..078499fb013 100644 --- a/src/Interpreters/InterpreterInsertQuery.cpp +++ b/src/Interpreters/InterpreterInsertQuery.cpp @@ -282,7 +282,7 @@ Chain InterpreterInsertQuery::buildSink( /// Otherwise we'll get duplicates when MV reads same rows again from Kafka. if (table->noPushingToViews() && !no_destination) { - auto sink = table->write(query_ptr, metadata_snapshot, context_ptr); + auto sink = table->write(query_ptr, metadata_snapshot, context_ptr, async_insert); sink->setRuntimeData(thread_status, elapsed_counter_ms); out.addSource(std::move(sink)); } @@ -290,7 +290,7 @@ Chain InterpreterInsertQuery::buildSink( { out = buildPushingToViewsChain(table, metadata_snapshot, context_ptr, query_ptr, no_destination, - thread_status_holder, running_group, elapsed_counter_ms); + thread_status_holder, running_group, elapsed_counter_ms, async_insert); } return out; diff --git a/src/Processors/Transforms/CreatingSetsTransform.cpp b/src/Processors/Transforms/CreatingSetsTransform.cpp index e3ae2d4fd4e..c6ac8bea5ba 100644 --- a/src/Processors/Transforms/CreatingSetsTransform.cpp +++ b/src/Processors/Transforms/CreatingSetsTransform.cpp @@ -91,7 +91,7 @@ void CreatingSetsTransform::startSubquery() if (subquery.table) /// TODO: make via port - table_out = QueryPipeline(subquery.table->write({}, subquery.table->getInMemoryMetadataPtr(), getContext())); + table_out = QueryPipeline(subquery.table->write({}, subquery.table->getInMemoryMetadataPtr(), getContext(), /*async_insert=*/false)); done_with_set = !subquery.set_in_progress; done_with_table = !subquery.table; diff --git a/src/Processors/Transforms/buildPushingToViewsChain.cpp b/src/Processors/Transforms/buildPushingToViewsChain.cpp index 31bab46b868..0d33f250b5d 100644 --- a/src/Processors/Transforms/buildPushingToViewsChain.cpp +++ b/src/Processors/Transforms/buildPushingToViewsChain.cpp @@ -196,6 +196,7 @@ Chain buildPushingToViewsChain( ThreadStatusesHolderPtr thread_status_holder, ThreadGroupPtr running_group, std::atomic_uint64_t * elapsed_counter_ms, + bool async_insert, const Block & live_view_header) { checkStackSize(); @@ -347,7 +348,7 @@ Chain buildPushingToViewsChain( out = buildPushingToViewsChain( view, view_metadata_snapshot, insert_context, ASTPtr(), /* no_destination= */ true, - thread_status_holder, running_group, view_counter_ms, storage_header); + thread_status_holder, running_group, view_counter_ms, async_insert, storage_header); } else if (auto * window_view = dynamic_cast(view.get())) { @@ -356,13 +357,13 @@ Chain buildPushingToViewsChain( out = buildPushingToViewsChain( view, view_metadata_snapshot, insert_context, ASTPtr(), /* no_destination= */ true, - thread_status_holder, running_group, view_counter_ms); + thread_status_holder, running_group, view_counter_ms, async_insert); } else out = buildPushingToViewsChain( view, view_metadata_snapshot, insert_context, ASTPtr(), /* no_destination= */ false, - thread_status_holder, running_group, view_counter_ms); + thread_status_holder, running_group, view_counter_ms, async_insert); views_data->views.emplace_back(ViewRuntimeData{ std::move(query), @@ -444,7 +445,7 @@ Chain buildPushingToViewsChain( /// Do not push to destination table if the flag is set else if (!no_destination) { - auto sink = storage->write(query_ptr, metadata_snapshot, context); + auto sink = storage->write(query_ptr, metadata_snapshot, context, async_insert); metadata_snapshot->check(sink->getHeader().getColumnsWithTypeAndName()); sink->setRuntimeData(thread_status, elapsed_counter_ms); result_chain.addSource(std::move(sink)); diff --git a/src/Processors/Transforms/buildPushingToViewsChain.h b/src/Processors/Transforms/buildPushingToViewsChain.h index 0f413bee5c6..53aceeda1cc 100644 --- a/src/Processors/Transforms/buildPushingToViewsChain.h +++ b/src/Processors/Transforms/buildPushingToViewsChain.h @@ -69,6 +69,8 @@ Chain buildPushingToViewsChain( ThreadGroupPtr running_group, /// Counter to measure time spent separately per view. Should be improved. std::atomic_uint64_t * elapsed_counter_ms, + /// True if it's part of async insert flush + bool async_insert, /// LiveView executes query itself, it needs source block structure. const Block & live_view_header = {}); diff --git a/src/Server/GRPCServer.cpp b/src/Server/GRPCServer.cpp index 7b8eaa21947..e335d247a82 100644 --- a/src/Server/GRPCServer.cpp +++ b/src/Server/GRPCServer.cpp @@ -1101,7 +1101,7 @@ namespace { /// The data will be written directly to the table. auto metadata_snapshot = storage->getInMemoryMetadataPtr(); - auto sink = storage->write(ASTPtr(), metadata_snapshot, query_context); + auto sink = storage->write(ASTPtr(), metadata_snapshot, query_context, /*async_insert=*/false); std::unique_ptr buf = std::make_unique(external_table.data().data(), external_table.data().size()); buf = wrapReadBufferWithCompressionMethod(std::move(buf), chooseCompressionMethod("", external_table.compression_type())); diff --git a/src/Server/TCPHandler.cpp b/src/Server/TCPHandler.cpp index 0522b6d8a48..1ded7d97248 100644 --- a/src/Server/TCPHandler.cpp +++ b/src/Server/TCPHandler.cpp @@ -1692,7 +1692,7 @@ bool TCPHandler::receiveData(bool scalar) } auto metadata_snapshot = storage->getInMemoryMetadataPtr(); /// The data will be written directly to the table. - QueryPipeline temporary_table_out(storage->write(ASTPtr(), metadata_snapshot, query_context)); + QueryPipeline temporary_table_out(storage->write(ASTPtr(), metadata_snapshot, query_context, /*async_insert=*/false)); PushingPipelineExecutor executor(temporary_table_out); executor.start(); executor.push(block); diff --git a/src/Storages/HDFS/StorageHDFS.cpp b/src/Storages/HDFS/StorageHDFS.cpp index 98c2579e355..194a8f982d8 100644 --- a/src/Storages/HDFS/StorageHDFS.cpp +++ b/src/Storages/HDFS/StorageHDFS.cpp @@ -624,7 +624,7 @@ Pipe StorageHDFS::read( return Pipe::unitePipes(std::move(pipes)); } -SinkToStoragePtr StorageHDFS::write(const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, ContextPtr context_) +SinkToStoragePtr StorageHDFS::write(const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, ContextPtr context_, bool /*async_insert*/) { String current_uri = uris.back(); diff --git a/src/Storages/HDFS/StorageHDFS.h b/src/Storages/HDFS/StorageHDFS.h index b123834e981..c6226c2618d 100644 --- a/src/Storages/HDFS/StorageHDFS.h +++ b/src/Storages/HDFS/StorageHDFS.h @@ -41,7 +41,7 @@ public: size_t max_block_size, size_t num_streams) override; - SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, ContextPtr context) override; + SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, ContextPtr context, bool async_insert) override; void truncate( const ASTPtr & query, diff --git a/src/Storages/Hive/StorageHive.cpp b/src/Storages/Hive/StorageHive.cpp index f554a14ec75..00c942fd56b 100644 --- a/src/Storages/Hive/StorageHive.cpp +++ b/src/Storages/Hive/StorageHive.cpp @@ -905,7 +905,7 @@ HiveFiles StorageHive::collectHiveFiles( return hive_files; } -SinkToStoragePtr StorageHive::write(const ASTPtr & /*query*/, const StorageMetadataPtr & /* metadata_snapshot*/, ContextPtr /*context*/) +SinkToStoragePtr StorageHive::write(const ASTPtr & /*query*/, const StorageMetadataPtr & /* metadata_snapshot*/, ContextPtr /*context*/, bool /*async_insert*/) { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method write is not implemented for StorageHive"); } diff --git a/src/Storages/Hive/StorageHive.h b/src/Storages/Hive/StorageHive.h index b4afb2421b1..604df70f4d0 100644 --- a/src/Storages/Hive/StorageHive.h +++ b/src/Storages/Hive/StorageHive.h @@ -61,7 +61,7 @@ public: size_t max_block_size, size_t num_streams) override; - SinkToStoragePtr write(const ASTPtr & /*query*/, const StorageMetadataPtr & metadata_snapshot, ContextPtr /*context*/) override; + SinkToStoragePtr write(const ASTPtr & /*query*/, const StorageMetadataPtr & metadata_snapshot, ContextPtr /*context*/, bool async_insert) override; NamesAndTypesList getVirtuals() const override; diff --git a/src/Storages/IStorage.h b/src/Storages/IStorage.h index c163e8107ac..d44772850fd 100644 --- a/src/Storages/IStorage.h +++ b/src/Storages/IStorage.h @@ -402,11 +402,14 @@ public: * passed in all parts of the returned streams. Storage metadata can be * changed during lifetime of the returned streams, but the snapshot is * guaranteed to be immutable. + * + * async_insert - set to true if the write is part of async insert flushing */ virtual SinkToStoragePtr write( const ASTPtr & /*query*/, const StorageMetadataPtr & /*metadata_snapshot*/, - ContextPtr /*context*/) + ContextPtr /*context*/, + bool /*async_insert*/) { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method write is not supported by storage {}", getName()); } diff --git a/src/Storages/Kafka/StorageKafka.cpp b/src/Storages/Kafka/StorageKafka.cpp index 7d504833a0a..2aba76c1a3f 100644 --- a/src/Storages/Kafka/StorageKafka.cpp +++ b/src/Storages/Kafka/StorageKafka.cpp @@ -374,7 +374,7 @@ Pipe StorageKafka::read( } -SinkToStoragePtr StorageKafka::write(const ASTPtr &, const StorageMetadataPtr & metadata_snapshot, ContextPtr local_context) +SinkToStoragePtr StorageKafka::write(const ASTPtr &, const StorageMetadataPtr & metadata_snapshot, ContextPtr local_context, bool /*async_insert*/) { auto modified_context = Context::createCopy(local_context); modified_context->applySettingsChanges(settings_adjustments); diff --git a/src/Storages/Kafka/StorageKafka.h b/src/Storages/Kafka/StorageKafka.h index 3559129cf74..09aa091ef18 100644 --- a/src/Storages/Kafka/StorageKafka.h +++ b/src/Storages/Kafka/StorageKafka.h @@ -60,7 +60,8 @@ public: SinkToStoragePtr write( const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, - ContextPtr context) override; + ContextPtr context, + bool async_insert) override; /// We want to control the number of rows in a chunk inserted into Kafka bool prefersLargeBlocks() const override { return false; } diff --git a/src/Storages/MeiliSearch/StorageMeiliSearch.cpp b/src/Storages/MeiliSearch/StorageMeiliSearch.cpp index e7350d38a20..5d77fc080a4 100644 --- a/src/Storages/MeiliSearch/StorageMeiliSearch.cpp +++ b/src/Storages/MeiliSearch/StorageMeiliSearch.cpp @@ -137,7 +137,7 @@ Pipe StorageMeiliSearch::read( return Pipe(std::make_shared(config, sample_block, max_block_size, route, kv_pairs_params)); } -SinkToStoragePtr StorageMeiliSearch::write(const ASTPtr & /*query*/, const StorageMetadataPtr & metadata_snapshot, ContextPtr local_context) +SinkToStoragePtr StorageMeiliSearch::write(const ASTPtr & /*query*/, const StorageMetadataPtr & metadata_snapshot, ContextPtr local_context, bool /*async_insert*/) { LOG_TRACE(log, "Trying update index: {}", config.index); return std::make_shared(config, metadata_snapshot->getSampleBlock(), local_context); diff --git a/src/Storages/MeiliSearch/StorageMeiliSearch.h b/src/Storages/MeiliSearch/StorageMeiliSearch.h index 41c1db53437..77cd2afb80a 100644 --- a/src/Storages/MeiliSearch/StorageMeiliSearch.h +++ b/src/Storages/MeiliSearch/StorageMeiliSearch.h @@ -26,7 +26,7 @@ public: size_t max_block_size, size_t num_streams) override; - SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, ContextPtr local_context) override; + SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, ContextPtr local_context, bool async_insert) override; static MeiliSearchConfiguration getConfiguration(ASTs engine_args, ContextPtr context); diff --git a/src/Storages/NATS/StorageNATS.cpp b/src/Storages/NATS/StorageNATS.cpp index aa4ec77b0d8..a3478069356 100644 --- a/src/Storages/NATS/StorageNATS.cpp +++ b/src/Storages/NATS/StorageNATS.cpp @@ -353,7 +353,7 @@ void StorageNATS::read( } -SinkToStoragePtr StorageNATS::write(const ASTPtr &, const StorageMetadataPtr & metadata_snapshot, ContextPtr local_context) +SinkToStoragePtr StorageNATS::write(const ASTPtr &, const StorageMetadataPtr & metadata_snapshot, ContextPtr local_context, bool /*async_insert*/) { auto modified_context = addSettings(local_context); std::string subject = modified_context->getSettingsRef().stream_like_engine_insert_queue.changed diff --git a/src/Storages/NATS/StorageNATS.h b/src/Storages/NATS/StorageNATS.h index 518d81fb145..efe54243ee9 100644 --- a/src/Storages/NATS/StorageNATS.h +++ b/src/Storages/NATS/StorageNATS.h @@ -51,7 +51,7 @@ public: size_t /* max_block_size */, size_t /* num_streams */) override; - SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, ContextPtr context) override; + SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, ContextPtr context, bool async_insert) override; /// We want to control the number of rows in a chunk inserted into NATS bool prefersLargeBlocks() const override { return false; } diff --git a/src/Storages/RabbitMQ/StorageRabbitMQ.cpp b/src/Storages/RabbitMQ/StorageRabbitMQ.cpp index e84f5c963a8..651c63e1b91 100644 --- a/src/Storages/RabbitMQ/StorageRabbitMQ.cpp +++ b/src/Storages/RabbitMQ/StorageRabbitMQ.cpp @@ -764,7 +764,7 @@ void StorageRabbitMQ::read( } -SinkToStoragePtr StorageRabbitMQ::write(const ASTPtr &, const StorageMetadataPtr & metadata_snapshot, ContextPtr local_context) +SinkToStoragePtr StorageRabbitMQ::write(const ASTPtr &, const StorageMetadataPtr & metadata_snapshot, ContextPtr local_context, bool /*async_insert*/) { auto producer = std::make_unique( configuration, routing_keys, exchange_name, exchange_type, producer_id.fetch_add(1), persistent, shutdown_called, log); diff --git a/src/Storages/RabbitMQ/StorageRabbitMQ.h b/src/Storages/RabbitMQ/StorageRabbitMQ.h index c531026d83a..dc410c4f298 100644 --- a/src/Storages/RabbitMQ/StorageRabbitMQ.h +++ b/src/Storages/RabbitMQ/StorageRabbitMQ.h @@ -57,7 +57,8 @@ public: SinkToStoragePtr write( const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, - ContextPtr context) override; + ContextPtr context, + bool async_insert) override; /// We want to control the number of rows in a chunk inserted into RabbitMQ bool prefersLargeBlocks() const override { return false; } diff --git a/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp b/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp index d1195a9132e..27e8de78b0f 100644 --- a/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp +++ b/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp @@ -461,7 +461,7 @@ Pipe StorageEmbeddedRocksDB::read( } SinkToStoragePtr StorageEmbeddedRocksDB::write( - const ASTPtr & /*query*/, const StorageMetadataPtr & metadata_snapshot, ContextPtr /*context*/) + const ASTPtr & /*query*/, const StorageMetadataPtr & metadata_snapshot, ContextPtr /*context*/, bool /*async_insert*/) { return std::make_shared(*this, metadata_snapshot); } diff --git a/src/Storages/RocksDB/StorageEmbeddedRocksDB.h b/src/Storages/RocksDB/StorageEmbeddedRocksDB.h index 32d7740009e..97fd07626a8 100644 --- a/src/Storages/RocksDB/StorageEmbeddedRocksDB.h +++ b/src/Storages/RocksDB/StorageEmbeddedRocksDB.h @@ -48,7 +48,7 @@ public: size_t max_block_size, size_t num_streams) override; - SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, ContextPtr context) override; + SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, ContextPtr context, bool async_insert) override; void truncate(const ASTPtr &, const StorageMetadataPtr & metadata_snapshot, ContextPtr, TableExclusiveLockHolder &) override; void checkMutationIsPossible(const MutationCommands & commands, const Settings & settings) const override; diff --git a/src/Storages/StorageBuffer.cpp b/src/Storages/StorageBuffer.cpp index a4cb15d5711..d021667f771 100644 --- a/src/Storages/StorageBuffer.cpp +++ b/src/Storages/StorageBuffer.cpp @@ -656,7 +656,7 @@ private: }; -SinkToStoragePtr StorageBuffer::write(const ASTPtr & /*query*/, const StorageMetadataPtr & metadata_snapshot, ContextPtr /*context*/) +SinkToStoragePtr StorageBuffer::write(const ASTPtr & /*query*/, const StorageMetadataPtr & metadata_snapshot, ContextPtr /*context*/, bool /*async_insert*/) { return std::make_shared(*this, metadata_snapshot); } diff --git a/src/Storages/StorageBuffer.h b/src/Storages/StorageBuffer.h index 83d2376216b..8f089a4d580 100644 --- a/src/Storages/StorageBuffer.h +++ b/src/Storages/StorageBuffer.h @@ -88,7 +88,7 @@ public: bool supportsSubcolumns() const override { return true; } - SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, ContextPtr context) override; + SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, ContextPtr context, bool /*async_insert*/) override; void startup() override; /// Flush all buffers into the subordinate table and stop background thread. diff --git a/src/Storages/StorageDistributed.cpp b/src/Storages/StorageDistributed.cpp index bcf6f68d00d..d86e735f4b4 100644 --- a/src/Storages/StorageDistributed.cpp +++ b/src/Storages/StorageDistributed.cpp @@ -897,7 +897,7 @@ QueryTreeNodePtr executeSubqueryNode(const QueryTreeNodePtr & subquery_node, auto temporary_table_expression_node = std::make_shared(external_storage, mutable_context); temporary_table_expression_node->setTemporaryTableName(temporary_table_name); - auto table_out = external_storage->write({}, external_storage->getInMemoryMetadataPtr(), mutable_context); + auto table_out = external_storage->write({}, external_storage->getInMemoryMetadataPtr(), mutable_context, /*async_insert=*/false); auto io = interpreter.execute(); io.pipeline.complete(std::move(table_out)); CompletedPipelineExecutor executor(io.pipeline); @@ -1132,7 +1132,7 @@ void StorageDistributed::read( } -SinkToStoragePtr StorageDistributed::write(const ASTPtr &, const StorageMetadataPtr & metadata_snapshot, ContextPtr local_context) +SinkToStoragePtr StorageDistributed::write(const ASTPtr &, const StorageMetadataPtr & metadata_snapshot, ContextPtr local_context, bool /*async_insert*/) { auto cluster = getCluster(); const auto & settings = local_context->getSettingsRef(); diff --git a/src/Storages/StorageDistributed.h b/src/Storages/StorageDistributed.h index 417fb6447bf..f45286341cf 100644 --- a/src/Storages/StorageDistributed.h +++ b/src/Storages/StorageDistributed.h @@ -118,7 +118,7 @@ public: bool supportsParallelInsert() const override { return true; } std::optional totalBytes(const Settings &) const override; - SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, ContextPtr context) override; + SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, ContextPtr context, bool /*async_insert*/) override; std::optional distributedWrite(const ASTInsertQuery & query, ContextPtr context) override; diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index 052775aefca..72347789790 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -1049,7 +1049,8 @@ private: SinkToStoragePtr StorageFile::write( const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, - ContextPtr context) + ContextPtr context, + bool /*async_insert*/) { if (format_name == "Distributed") throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method write is not implemented for Distributed format"); diff --git a/src/Storages/StorageFile.h b/src/Storages/StorageFile.h index 53ce7eeaaf6..9eb0d4b4383 100644 --- a/src/Storages/StorageFile.h +++ b/src/Storages/StorageFile.h @@ -50,7 +50,8 @@ public: SinkToStoragePtr write( const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, - ContextPtr context) override; + ContextPtr context, + bool async_insert) override; void truncate( const ASTPtr & /*query*/, diff --git a/src/Storages/StorageJoin.cpp b/src/Storages/StorageJoin.cpp index 5113320548d..a238e9ef26c 100644 --- a/src/Storages/StorageJoin.cpp +++ b/src/Storages/StorageJoin.cpp @@ -89,10 +89,10 @@ RWLockImpl::LockHolder StorageJoin::tryLockForCurrentQueryTimedWithContext(const return lock->getLock(type, query_id, acquire_timeout, false); } -SinkToStoragePtr StorageJoin::write(const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, ContextPtr context) +SinkToStoragePtr StorageJoin::write(const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, ContextPtr context, bool /*async_insert*/) { std::lock_guard mutate_lock(mutate_mutex); - return StorageSetOrJoinBase::write(query, metadata_snapshot, context); + return StorageSetOrJoinBase::write(query, metadata_snapshot, context, /*async_insert=*/false); } void StorageJoin::truncate(const ASTPtr &, const StorageMetadataPtr &, ContextPtr context, TableExclusiveLockHolder &) diff --git a/src/Storages/StorageJoin.h b/src/Storages/StorageJoin.h index a5e85d8788a..5559b5d1ec8 100644 --- a/src/Storages/StorageJoin.h +++ b/src/Storages/StorageJoin.h @@ -59,7 +59,7 @@ public: /// (but not during processing whole query, it's safe for joinGet that doesn't involve `used_flags` from HashJoin) ColumnWithTypeAndName joinGet(const Block & block, const Block & block_with_columns_to_add, ContextPtr context) const; - SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, ContextPtr context) override; + SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, ContextPtr context, bool async_insert) override; Pipe read( const Names & column_names, diff --git a/src/Storages/StorageKeeperMap.cpp b/src/Storages/StorageKeeperMap.cpp index deebb9e0096..f2b1b907832 100644 --- a/src/Storages/StorageKeeperMap.cpp +++ b/src/Storages/StorageKeeperMap.cpp @@ -524,7 +524,7 @@ Pipe StorageKeeperMap::read( return process_keys(std::move(filtered_keys)); } -SinkToStoragePtr StorageKeeperMap::write(const ASTPtr & /*query*/, const StorageMetadataPtr & metadata_snapshot, ContextPtr local_context) +SinkToStoragePtr StorageKeeperMap::write(const ASTPtr & /*query*/, const StorageMetadataPtr & metadata_snapshot, ContextPtr local_context, bool /*async_insert*/) { checkTable(); return std::make_shared(*this, metadata_snapshot->getSampleBlock(), local_context); diff --git a/src/Storages/StorageKeeperMap.h b/src/Storages/StorageKeeperMap.h index 552e6b35fe8..ad7b719e972 100644 --- a/src/Storages/StorageKeeperMap.h +++ b/src/Storages/StorageKeeperMap.h @@ -42,7 +42,7 @@ public: size_t max_block_size, size_t num_streams) override; - SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, ContextPtr context) override; + SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, ContextPtr context, bool async_insert) override; void truncate(const ASTPtr &, const StorageMetadataPtr &, ContextPtr, TableExclusiveLockHolder &) override; void drop() override; diff --git a/src/Storages/StorageLog.cpp b/src/Storages/StorageLog.cpp index 02dc4843660..ac68de43332 100644 --- a/src/Storages/StorageLog.cpp +++ b/src/Storages/StorageLog.cpp @@ -855,7 +855,7 @@ Pipe StorageLog::read( return Pipe::unitePipes(std::move(pipes)); } -SinkToStoragePtr StorageLog::write(const ASTPtr & /*query*/, const StorageMetadataPtr & metadata_snapshot, ContextPtr local_context) +SinkToStoragePtr StorageLog::write(const ASTPtr & /*query*/, const StorageMetadataPtr & metadata_snapshot, ContextPtr local_context, bool /*async_insert*/) { WriteLock lock{rwlock, getLockTimeout(local_context)}; if (!lock) diff --git a/src/Storages/StorageLog.h b/src/Storages/StorageLog.h index a2b1356f240..f1d05ed39ac 100644 --- a/src/Storages/StorageLog.h +++ b/src/Storages/StorageLog.h @@ -55,7 +55,7 @@ public: size_t max_block_size, size_t num_streams) override; - SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, ContextPtr local_context) override; + SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, ContextPtr local_context, bool async_insert) override; void rename(const String & new_path_to_table_data, const StorageID & new_table_id) override; diff --git a/src/Storages/StorageMaterializedMySQL.h b/src/Storages/StorageMaterializedMySQL.h index 9896265b576..08fbb61960f 100644 --- a/src/Storages/StorageMaterializedMySQL.h +++ b/src/Storages/StorageMaterializedMySQL.h @@ -32,7 +32,7 @@ public: QueryProcessingStage::Enum processed_stage, size_t max_block_size, size_t num_streams) override; - SinkToStoragePtr write(const ASTPtr &, const StorageMetadataPtr &, ContextPtr) override { throwNotAllowed(); } + SinkToStoragePtr write(const ASTPtr &, const StorageMetadataPtr &, ContextPtr, bool) override { throwNotAllowed(); } NamesAndTypesList getVirtuals() const override; ColumnSizeByName getColumnSizes() const override; diff --git a/src/Storages/StorageMaterializedView.cpp b/src/Storages/StorageMaterializedView.cpp index b96c132d601..16d724d54d8 100644 --- a/src/Storages/StorageMaterializedView.cpp +++ b/src/Storages/StorageMaterializedView.cpp @@ -192,13 +192,13 @@ void StorageMaterializedView::read( } } -SinkToStoragePtr StorageMaterializedView::write(const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, ContextPtr local_context) +SinkToStoragePtr StorageMaterializedView::write(const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, ContextPtr local_context, bool async_insert) { auto storage = getTargetTable(); auto lock = storage->lockForShare(local_context->getCurrentQueryId(), local_context->getSettingsRef().lock_acquire_timeout); auto metadata_snapshot = storage->getInMemoryMetadataPtr(); - auto sink = storage->write(query, metadata_snapshot, local_context); + auto sink = storage->write(query, metadata_snapshot, local_context, async_insert); sink->addTableLock(lock); return sink; diff --git a/src/Storages/StorageMaterializedView.h b/src/Storages/StorageMaterializedView.h index f7876005c49..3ec789aa7e3 100644 --- a/src/Storages/StorageMaterializedView.h +++ b/src/Storages/StorageMaterializedView.h @@ -39,7 +39,7 @@ public: return target_table->mayBenefitFromIndexForIn(left_in_operand, query_context, metadata_snapshot); } - SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, ContextPtr context) override; + SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, ContextPtr context, bool async_insert) override; void drop() override; void dropInnerTableIfAny(bool sync, ContextPtr local_context) override; diff --git a/src/Storages/StorageMemory.cpp b/src/Storages/StorageMemory.cpp index caeefa5d96d..1b45b9ae3f4 100644 --- a/src/Storages/StorageMemory.cpp +++ b/src/Storages/StorageMemory.cpp @@ -159,7 +159,7 @@ void StorageMemory::read( } -SinkToStoragePtr StorageMemory::write(const ASTPtr & /*query*/, const StorageMetadataPtr & metadata_snapshot, ContextPtr context) +SinkToStoragePtr StorageMemory::write(const ASTPtr & /*query*/, const StorageMetadataPtr & metadata_snapshot, ContextPtr context, bool /*async_insert*/) { return std::make_shared(*this, metadata_snapshot, context); } diff --git a/src/Storages/StorageMemory.h b/src/Storages/StorageMemory.h index ce8a59b8bcd..c4f4331ca64 100644 --- a/src/Storages/StorageMemory.h +++ b/src/Storages/StorageMemory.h @@ -64,7 +64,7 @@ public: bool hasEvenlyDistributedRead() const override { return true; } - SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, ContextPtr context) override; + SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, ContextPtr context, bool async_insert) override; void drop() override; diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index 3da4724471d..a2a46229660 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -274,7 +274,7 @@ std::optional StorageMergeTree::totalBytes(const Settings &) const } SinkToStoragePtr -StorageMergeTree::write(const ASTPtr & /*query*/, const StorageMetadataPtr & metadata_snapshot, ContextPtr local_context) +StorageMergeTree::write(const ASTPtr & /*query*/, const StorageMetadataPtr & metadata_snapshot, ContextPtr local_context, bool /*async_insert*/) { const auto & settings = local_context->getSettingsRef(); return std::make_shared( diff --git a/src/Storages/StorageMergeTree.h b/src/Storages/StorageMergeTree.h index be9f5426bbd..8099f9c16aa 100644 --- a/src/Storages/StorageMergeTree.h +++ b/src/Storages/StorageMergeTree.h @@ -71,7 +71,7 @@ public: std::optional totalRowsByPartitionPredicate(const SelectQueryInfo &, ContextPtr) const override; std::optional totalBytes(const Settings &) const override; - SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, ContextPtr context) override; + SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, ContextPtr context, bool async_insert) override; /** Perform the next step in combining the parts. */ diff --git a/src/Storages/StorageMongoDB.cpp b/src/Storages/StorageMongoDB.cpp index 63b8c2d00a1..ab52fde3e3d 100644 --- a/src/Storages/StorageMongoDB.cpp +++ b/src/Storages/StorageMongoDB.cpp @@ -165,7 +165,7 @@ Pipe StorageMongoDB::read( return Pipe(std::make_shared(connection, createCursor(database_name, collection_name, sample_block), sample_block, max_block_size)); } -SinkToStoragePtr StorageMongoDB::write(const ASTPtr & /* query */, const StorageMetadataPtr & metadata_snapshot, ContextPtr /* context */) +SinkToStoragePtr StorageMongoDB::write(const ASTPtr & /* query */, const StorageMetadataPtr & metadata_snapshot, ContextPtr /* context */, bool /*async_insert*/) { connectIfNotConnected(); return std::make_shared(collection_name, database_name, metadata_snapshot, connection); diff --git a/src/Storages/StorageMongoDB.h b/src/Storages/StorageMongoDB.h index 2b77f076e7e..36090d4584e 100644 --- a/src/Storages/StorageMongoDB.h +++ b/src/Storages/StorageMongoDB.h @@ -41,7 +41,8 @@ public: SinkToStoragePtr write( const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, - ContextPtr context) override; + ContextPtr context, + bool async_insert) override; struct Configuration { diff --git a/src/Storages/StorageMySQL.cpp b/src/Storages/StorageMySQL.cpp index 2a96581d591..3e928c3a811 100644 --- a/src/Storages/StorageMySQL.cpp +++ b/src/Storages/StorageMySQL.cpp @@ -252,7 +252,7 @@ private: }; -SinkToStoragePtr StorageMySQL::write(const ASTPtr & /*query*/, const StorageMetadataPtr & metadata_snapshot, ContextPtr local_context) +SinkToStoragePtr StorageMySQL::write(const ASTPtr & /*query*/, const StorageMetadataPtr & metadata_snapshot, ContextPtr local_context, bool /*async_insert*/) { return std::make_shared( *this, diff --git a/src/Storages/StorageMySQL.h b/src/Storages/StorageMySQL.h index 9f47f9925d5..a98afc7ac4d 100644 --- a/src/Storages/StorageMySQL.h +++ b/src/Storages/StorageMySQL.h @@ -49,7 +49,7 @@ public: size_t max_block_size, size_t num_streams) override; - SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, ContextPtr context) override; + SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, ContextPtr context, bool async_insert) override; struct Configuration { diff --git a/src/Storages/StorageNull.h b/src/Storages/StorageNull.h index d35c6a0b8b5..f6dd7064a22 100644 --- a/src/Storages/StorageNull.h +++ b/src/Storages/StorageNull.h @@ -46,7 +46,7 @@ public: bool supportsParallelInsert() const override { return true; } - SinkToStoragePtr write(const ASTPtr &, const StorageMetadataPtr & metadata_snapshot, ContextPtr) override + SinkToStoragePtr write(const ASTPtr &, const StorageMetadataPtr & metadata_snapshot, ContextPtr, bool) override { return std::make_shared(metadata_snapshot->getSampleBlock()); } diff --git a/src/Storages/StoragePostgreSQL.cpp b/src/Storages/StoragePostgreSQL.cpp index 5d7dc285c5d..3551ee36819 100644 --- a/src/Storages/StoragePostgreSQL.cpp +++ b/src/Storages/StoragePostgreSQL.cpp @@ -451,7 +451,7 @@ private: SinkToStoragePtr StoragePostgreSQL::write( - const ASTPtr & /*query*/, const StorageMetadataPtr & metadata_snapshot, ContextPtr /* context */) + const ASTPtr & /*query*/, const StorageMetadataPtr & metadata_snapshot, ContextPtr /* context */, bool /*async_insert*/) { return std::make_shared(metadata_snapshot, pool->get(), remote_table_name, remote_table_schema, on_conflict); } diff --git a/src/Storages/StoragePostgreSQL.h b/src/Storages/StoragePostgreSQL.h index be6bbc5ec63..fb8b5a22df2 100644 --- a/src/Storages/StoragePostgreSQL.h +++ b/src/Storages/StoragePostgreSQL.h @@ -46,7 +46,7 @@ public: size_t max_block_size, size_t num_streams) override; - SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, ContextPtr context) override; + SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, ContextPtr context, bool async_insert) override; struct Configuration { diff --git a/src/Storages/StorageProxy.h b/src/Storages/StorageProxy.h index e8a664a6382..14b7fc15af2 100644 --- a/src/Storages/StorageProxy.h +++ b/src/Storages/StorageProxy.h @@ -68,9 +68,9 @@ public: return getNested()->read(query_plan, column_names, storage_snapshot, query_info, context, processed_stage, max_block_size, num_streams); } - SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, ContextPtr context) override + SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, ContextPtr context, bool async_insert) override { - return getNested()->write(query, metadata_snapshot, context); + return getNested()->write(query, metadata_snapshot, context, async_insert); } void drop() override { getNested()->drop(); } diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index db9c209a5fd..f9c9e958258 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -4822,7 +4822,7 @@ void StorageReplicatedMergeTree::assertNotReadonly() const } -SinkToStoragePtr StorageReplicatedMergeTree::write(const ASTPtr & /*query*/, const StorageMetadataPtr & metadata_snapshot, ContextPtr local_context) +SinkToStoragePtr StorageReplicatedMergeTree::write(const ASTPtr & /*query*/, const StorageMetadataPtr & metadata_snapshot, ContextPtr local_context, bool async_insert) { if (!initialization_done) throw Exception(ErrorCodes::NOT_INITIALIZED, "Table is not initialized yet"); @@ -4839,7 +4839,7 @@ SinkToStoragePtr StorageReplicatedMergeTree::write(const ASTPtr & /*query*/, con const auto storage_settings_ptr = getSettings(); const Settings & query_settings = local_context->getSettingsRef(); bool deduplicate = storage_settings_ptr->replicated_deduplication_window != 0 && query_settings.insert_deduplicate; - bool async_deduplicate = query_settings.async_insert && query_settings.async_insert_deduplicate && storage_settings_ptr->replicated_deduplication_window_for_async_inserts != 0 && query_settings.insert_deduplicate; + bool async_deduplicate = async_insert && query_settings.async_insert_deduplicate && storage_settings_ptr->replicated_deduplication_window_for_async_inserts != 0 && query_settings.insert_deduplicate; if (async_deduplicate) return std::make_shared( *this, metadata_snapshot, query_settings.insert_quorum.valueOr(0), diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h index dd7ea84f76b..c0d9e36a8a7 100644 --- a/src/Storages/StorageReplicatedMergeTree.h +++ b/src/Storages/StorageReplicatedMergeTree.h @@ -138,7 +138,7 @@ public: std::optional totalRowsByPartitionPredicate(const SelectQueryInfo & query_info, ContextPtr context) const override; std::optional totalBytes(const Settings & settings) const override; - SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, ContextPtr context) override; + SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, ContextPtr context, bool async_insert) override; std::optional distributedWrite(const ASTInsertQuery & /*query*/, ContextPtr /*context*/) override; diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index f1a7bcb71a2..c3ed0f1af16 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -1086,7 +1086,7 @@ Pipe StorageS3::read( return Pipe::unitePipes(std::move(pipes)); } -SinkToStoragePtr StorageS3::write(const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, ContextPtr local_context) +SinkToStoragePtr StorageS3::write(const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, ContextPtr local_context, bool /*async_insert*/) { auto query_configuration = updateConfigurationAndGetCopy(local_context); diff --git a/src/Storages/StorageS3.h b/src/Storages/StorageS3.h index a4c120b99a6..9c2728c785d 100644 --- a/src/Storages/StorageS3.h +++ b/src/Storages/StorageS3.h @@ -293,7 +293,7 @@ public: size_t max_block_size, size_t num_streams) override; - SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, ContextPtr context) override; + SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, ContextPtr context, bool async_insert) override; void truncate(const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, ContextPtr local_context, TableExclusiveLockHolder &) override; diff --git a/src/Storages/StorageSQLite.cpp b/src/Storages/StorageSQLite.cpp index bd445217979..d5ae6f2383f 100644 --- a/src/Storages/StorageSQLite.cpp +++ b/src/Storages/StorageSQLite.cpp @@ -169,7 +169,7 @@ private: }; -SinkToStoragePtr StorageSQLite::write(const ASTPtr & /* query */, const StorageMetadataPtr & metadata_snapshot, ContextPtr) +SinkToStoragePtr StorageSQLite::write(const ASTPtr & /* query */, const StorageMetadataPtr & metadata_snapshot, ContextPtr /*context*/, bool /*async_insert*/) { if (!sqlite_db) sqlite_db = openSQLiteDB(database_path, getContext(), /* throw_on_error */true); diff --git a/src/Storages/StorageSQLite.h b/src/Storages/StorageSQLite.h index 323c29ac8bb..9da040cbd5c 100644 --- a/src/Storages/StorageSQLite.h +++ b/src/Storages/StorageSQLite.h @@ -40,7 +40,7 @@ public: size_t max_block_size, size_t num_streams) override; - SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, ContextPtr context) override; + SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, ContextPtr context, bool async_insert) override; static ColumnsDescription getTableStructureFromData( const SQLitePtr & sqlite_db_, diff --git a/src/Storages/StorageSet.cpp b/src/Storages/StorageSet.cpp index 00b5dbfc5e3..f90539689e6 100644 --- a/src/Storages/StorageSet.cpp +++ b/src/Storages/StorageSet.cpp @@ -106,7 +106,7 @@ void SetOrJoinSink::onFinish() } -SinkToStoragePtr StorageSetOrJoinBase::write(const ASTPtr & /*query*/, const StorageMetadataPtr & metadata_snapshot, ContextPtr context) +SinkToStoragePtr StorageSetOrJoinBase::write(const ASTPtr & /*query*/, const StorageMetadataPtr & metadata_snapshot, ContextPtr context, bool /*async_insert*/) { UInt64 id = ++increment; return std::make_shared( diff --git a/src/Storages/StorageSet.h b/src/Storages/StorageSet.h index ccd1eb9912b..b310f817eb9 100644 --- a/src/Storages/StorageSet.h +++ b/src/Storages/StorageSet.h @@ -24,7 +24,7 @@ class StorageSetOrJoinBase : public IStorage public: void rename(const String & new_path_to_table_data, const StorageID & new_table_id) override; - SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, ContextPtr context) override; + SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, ContextPtr context, bool async_insert) override; bool storesDataOnDisk() const override { return true; } Strings getDataPaths() const override { return {path}; } diff --git a/src/Storages/StorageStripeLog.cpp b/src/Storages/StorageStripeLog.cpp index b2e7c202800..5c704d877d1 100644 --- a/src/Storages/StorageStripeLog.cpp +++ b/src/Storages/StorageStripeLog.cpp @@ -394,7 +394,7 @@ Pipe StorageStripeLog::read( } -SinkToStoragePtr StorageStripeLog::write(const ASTPtr & /*query*/, const StorageMetadataPtr & metadata_snapshot, ContextPtr local_context) +SinkToStoragePtr StorageStripeLog::write(const ASTPtr & /*query*/, const StorageMetadataPtr & metadata_snapshot, ContextPtr local_context, bool /*async_insert*/) { WriteLock lock{rwlock, getLockTimeout(local_context)}; if (!lock) diff --git a/src/Storages/StorageStripeLog.h b/src/Storages/StorageStripeLog.h index 3f1b4ed0ad5..f889a1de71b 100644 --- a/src/Storages/StorageStripeLog.h +++ b/src/Storages/StorageStripeLog.h @@ -49,7 +49,7 @@ public: size_t max_block_size, size_t num_streams) override; - SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, ContextPtr local_context) override; + SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, ContextPtr local_context, bool async_insert) override; void rename(const String & new_path_to_table_data, const StorageID & new_table_id) override; diff --git a/src/Storages/StorageTableFunction.h b/src/Storages/StorageTableFunction.h index ccec087a8d9..26cbe1f0233 100644 --- a/src/Storages/StorageTableFunction.h +++ b/src/Storages/StorageTableFunction.h @@ -130,7 +130,8 @@ public: SinkToStoragePtr write( const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, - ContextPtr context) override + ContextPtr context, + bool async_insert) override { auto storage = getNested(); auto cached_structure = metadata_snapshot->getSampleBlock(); @@ -139,7 +140,7 @@ public: { throw Exception(ErrorCodes::INCOMPATIBLE_COLUMNS, "Source storage and table function have different structure"); } - return storage->write(query, metadata_snapshot, context); + return storage->write(query, metadata_snapshot, context, async_insert); } void renameInMemory(const StorageID & new_table_id) override diff --git a/src/Storages/StorageURL.cpp b/src/Storages/StorageURL.cpp index efc44a069dd..520576d3961 100644 --- a/src/Storages/StorageURL.cpp +++ b/src/Storages/StorageURL.cpp @@ -803,7 +803,7 @@ Pipe StorageURLWithFailover::read( } -SinkToStoragePtr IStorageURLBase::write(const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, ContextPtr context) +SinkToStoragePtr IStorageURLBase::write(const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, ContextPtr context, bool /*async_insert*/) { if (http_method.empty()) http_method = Poco::Net::HTTPRequest::HTTP_POST; diff --git a/src/Storages/StorageURL.h b/src/Storages/StorageURL.h index 316b142aec0..e80e19621e8 100644 --- a/src/Storages/StorageURL.h +++ b/src/Storages/StorageURL.h @@ -41,7 +41,7 @@ public: size_t max_block_size, size_t num_streams) override; - SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, ContextPtr context) override; + SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, ContextPtr context, bool async_insert) override; bool supportsPartitionBy() const override { return true; } diff --git a/src/Storages/StorageXDBC.cpp b/src/Storages/StorageXDBC.cpp index 9b3e203e337..b532d1c91f0 100644 --- a/src/Storages/StorageXDBC.cpp +++ b/src/Storages/StorageXDBC.cpp @@ -116,7 +116,7 @@ Pipe StorageXDBC::read( return IStorageURLBase::read(column_names, storage_snapshot, query_info, local_context, processed_stage, max_block_size, num_streams); } -SinkToStoragePtr StorageXDBC::write(const ASTPtr & /* query */, const StorageMetadataPtr & metadata_snapshot, ContextPtr local_context) +SinkToStoragePtr StorageXDBC::write(const ASTPtr & /* query */, const StorageMetadataPtr & metadata_snapshot, ContextPtr local_context, bool /*async_insert*/) { bridge_helper->startBridgeSync(); diff --git a/src/Storages/StorageXDBC.h b/src/Storages/StorageXDBC.h index aa313e024ca..d7a1138c710 100644 --- a/src/Storages/StorageXDBC.h +++ b/src/Storages/StorageXDBC.h @@ -38,7 +38,7 @@ public: ContextPtr context_, BridgeHelperPtr bridge_helper_); - SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, ContextPtr context) override; + SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, ContextPtr context, bool async_insert) override; std::string getName() const override; private: diff --git a/src/Storages/System/StorageSystemZooKeeper.cpp b/src/Storages/System/StorageSystemZooKeeper.cpp index 6ca74406b17..cef2feedcc5 100644 --- a/src/Storages/System/StorageSystemZooKeeper.cpp +++ b/src/Storages/System/StorageSystemZooKeeper.cpp @@ -215,7 +215,7 @@ void StorageSystemZooKeeper::read( query_plan.addStep(std::move(read_step)); } -SinkToStoragePtr StorageSystemZooKeeper::write(const ASTPtr &, const StorageMetadataPtr &, ContextPtr context) +SinkToStoragePtr StorageSystemZooKeeper::write(const ASTPtr &, const StorageMetadataPtr &, ContextPtr context, bool /*async_insert*/) { if (!context->getConfigRef().getBool("allow_zookeeper_write", false)) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Prohibit writing to system.zookeeper, unless config `allow_zookeeper_write` as true"); diff --git a/src/Storages/System/StorageSystemZooKeeper.h b/src/Storages/System/StorageSystemZooKeeper.h index c8988d787a0..a016d3ad74c 100644 --- a/src/Storages/System/StorageSystemZooKeeper.h +++ b/src/Storages/System/StorageSystemZooKeeper.h @@ -20,7 +20,7 @@ public: static NamesAndTypesList getNamesAndTypes(); - SinkToStoragePtr write(const ASTPtr & /*query*/, const StorageMetadataPtr & /*metadata_snapshot*/, ContextPtr /*context*/) override; + SinkToStoragePtr write(const ASTPtr & /*query*/, const StorageMetadataPtr & /*metadata_snapshot*/, ContextPtr /*context*/, bool /*async_insert*/) override; void read( QueryPlan & query_plan, diff --git a/src/Storages/WindowView/StorageWindowView.cpp b/src/Storages/WindowView/StorageWindowView.cpp index 94d5db170a8..7fca9b5f078 100644 --- a/src/Storages/WindowView/StorageWindowView.cpp +++ b/src/Storages/WindowView/StorageWindowView.cpp @@ -1549,7 +1549,7 @@ void StorageWindowView::writeIntoWindowView( auto lock = inner_table->lockForShare( local_context->getCurrentQueryId(), local_context->getSettingsRef().lock_acquire_timeout); auto metadata_snapshot = inner_table->getInMemoryMetadataPtr(); - auto output = inner_table->write(window_view.getMergeableQuery(), metadata_snapshot, local_context); + auto output = inner_table->write(window_view.getMergeableQuery(), metadata_snapshot, local_context, /*async_insert=*/false); output->addTableLock(lock); if (!blocksHaveEqualStructure(builder.getHeader(), output->getHeader())) diff --git a/src/Storages/tests/gtest_storage_log.cpp b/src/Storages/tests/gtest_storage_log.cpp index b63de6a66ef..352c44554bd 100644 --- a/src/Storages/tests/gtest_storage_log.cpp +++ b/src/Storages/tests/gtest_storage_log.cpp @@ -95,7 +95,7 @@ std::string writeData(int rows, DB::StoragePtr & table, const DB::ContextPtr con block.insert(column); } - QueryPipeline pipeline(table->write({}, metadata_snapshot, context)); + QueryPipeline pipeline(table->write({}, metadata_snapshot, context, /*async_insert=*/false)); PushingPipelineExecutor executor(pipeline); executor.push(block); diff --git a/tests/queries/0_stateless/02784_disable_async_with_dedup_correctly.reference b/tests/queries/0_stateless/02784_disable_async_with_dedup_correctly.reference new file mode 100644 index 00000000000..014be4ce1a9 --- /dev/null +++ b/tests/queries/0_stateless/02784_disable_async_with_dedup_correctly.reference @@ -0,0 +1,17 @@ +0 +1 +1 +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 diff --git a/tests/queries/0_stateless/02784_disable_async_with_dedup_correctly.sh b/tests/queries/0_stateless/02784_disable_async_with_dedup_correctly.sh new file mode 100755 index 00000000000..40e7c9feabf --- /dev/null +++ b/tests/queries/0_stateless/02784_disable_async_with_dedup_correctly.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +$CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS 02784_async_table_with_dedup" + +$CLICKHOUSE_CLIENT -q "CREATE TABLE 02784_async_table_with_dedup (a Int64) ENGINE = ReplicatedMergeTree('/clickhouse/tables/{database}/02784_async_table_with_dedup', 'r1') ORDER BY a" + +CLICKHOUSE_CLIENT_WITH_LOG=$(echo ${CLICKHOUSE_CLIENT} | sed 's/'"--send_logs_level=${CLICKHOUSE_CLIENT_SERVER_LOGS_LEVEL}"'/--send_logs_level=trace/g') + +function insert_with_log_check() { + $CLICKHOUSE_CLIENT_WITH_LOG --async-insert=1 --async_insert_deduplicate=1 --wait_for_async_insert=1 -q "$1" 2>&1 | grep -Fc "Setting async_insert=1, but INSERT query will be executed synchronously" +} + +insert_with_log_check "INSERT INTO 02784_async_table_with_dedup VALUES (1), (2)" +insert_with_log_check "INSERT INTO 02784_async_table_with_dedup SELECT number as a FROM system.numbers LIMIT 10 OFFSET 3" + +DATA_FILE=test_02784_async_$CLICKHOUSE_TEST_UNIQUE_NAME.csv +echo -e '13\n14' > $DATA_FILE + +insert_with_log_check "INSERT INTO 02784_async_table_with_dedup FROM INFILE '$DATA_FILE' FORMAT CSV" + +$CLICKHOUSE_CLIENT -q "SELECT a FROM 02784_async_table_with_dedup ORDER BY a" + +$CLICKHOUSE_CLIENT -q "DROP TABLE 02784_async_table_with_dedup" + +rm $DATA_FILE \ No newline at end of file From 129473ae744b7d8516e35ada5293e4b54be6f094 Mon Sep 17 00:00:00 2001 From: kssenii Date: Wed, 7 Jun 2023 20:48:03 +0200 Subject: [PATCH 0632/1072] Fix --- .../IO/CachedOnDiskWriteBufferFromFile.cpp | 30 ++++++++++--------- .../IO/CachedOnDiskWriteBufferFromFile.h | 4 +-- src/Interpreters/Cache/FileSegment.h | 6 ---- 3 files changed, 18 insertions(+), 22 deletions(-) diff --git a/src/Disks/IO/CachedOnDiskWriteBufferFromFile.cpp b/src/Disks/IO/CachedOnDiskWriteBufferFromFile.cpp index 9153af90312..b7727555480 100644 --- a/src/Disks/IO/CachedOnDiskWriteBufferFromFile.cpp +++ b/src/Disks/IO/CachedOnDiskWriteBufferFromFile.cpp @@ -52,18 +52,20 @@ bool FileSegmentRangeWriter::write(const char * data, size_t size, size_t offset FileSegment * file_segment; - if (file_segments.empty() || file_segments.back().isDownloaded()) + if (!file_segments || file_segments->empty() || file_segments->front().isDownloaded()) { file_segment = &allocateFileSegment(expected_write_offset, segment_kind); } else { - file_segment = &file_segments.back(); + file_segment = &file_segments->front(); } SCOPE_EXIT({ - if (file_segments.back().isDownloader()) - file_segments.back().completePartAndResetDownloader(); + if (!file_segments || file_segments->empty()) + return; + if (file_segments->front().isDownloader()) + file_segments->front().completePartAndResetDownloader(); }); while (size > 0) @@ -71,7 +73,7 @@ bool FileSegmentRangeWriter::write(const char * data, size_t size, size_t offset size_t available_size = file_segment->range().size() - file_segment->getDownloadedSize(false); if (available_size == 0) { - completeFileSegment(*file_segment); + completeFileSegment(); file_segment = &allocateFileSegment(expected_write_offset, segment_kind); continue; } @@ -114,10 +116,7 @@ void FileSegmentRangeWriter::finalize() if (finalized) return; - if (file_segments.empty()) - return; - - completeFileSegment(file_segments.back()); + completeFileSegment(); finalized = true; } @@ -145,10 +144,9 @@ FileSegment & FileSegmentRangeWriter::allocateFileSegment(size_t offset, FileSeg /// We set max_file_segment_size to be downloaded, /// if we have less size to write, file segment will be resized in complete() method. - auto holder = cache->set(key, offset, cache->getMaxFileSegmentSize(), create_settings); - chassert(holder->size() == 1); - holder->moveTo(file_segments); - return file_segments.back(); + file_segments = cache->set(key, offset, cache->getMaxFileSegmentSize(), create_settings); + chassert(file_segments->size() == 1); + return file_segments->front(); } void FileSegmentRangeWriter::appendFilesystemCacheLog(const FileSegment & file_segment) @@ -176,8 +174,12 @@ void FileSegmentRangeWriter::appendFilesystemCacheLog(const FileSegment & file_s cache_log->add(elem); } -void FileSegmentRangeWriter::completeFileSegment(FileSegment & file_segment) +void FileSegmentRangeWriter::completeFileSegment() { + if (!file_segments || file_segments->empty()) + return; + + auto & file_segment = file_segments->front(); /// File segment can be detached if space reservation failed. if (file_segment.isDetached() || file_segment.isCompleted()) return; diff --git a/src/Disks/IO/CachedOnDiskWriteBufferFromFile.h b/src/Disks/IO/CachedOnDiskWriteBufferFromFile.h index 194afe88d88..8642886d6de 100644 --- a/src/Disks/IO/CachedOnDiskWriteBufferFromFile.h +++ b/src/Disks/IO/CachedOnDiskWriteBufferFromFile.h @@ -43,7 +43,7 @@ private: void appendFilesystemCacheLog(const FileSegment & file_segment); - void completeFileSegment(FileSegment & file_segment); + void completeFileSegment(); FileCache * cache; FileSegment::Key key; @@ -53,7 +53,7 @@ private: String query_id; String source_path; - FileSegmentsHolder file_segments{}; + FileSegmentsHolderPtr file_segments; size_t expected_write_offset = 0; diff --git a/src/Interpreters/Cache/FileSegment.h b/src/Interpreters/Cache/FileSegment.h index 163a15fcfda..186c65f12d9 100644 --- a/src/Interpreters/Cache/FileSegment.h +++ b/src/Interpreters/Cache/FileSegment.h @@ -360,12 +360,6 @@ struct FileSegmentsHolder : private boost::noncopyable FileSegments::const_iterator begin() const { return file_segments.begin(); } FileSegments::const_iterator end() const { return file_segments.end(); } - void moveTo(FileSegmentsHolder & holder) - { - holder.file_segments.insert(holder.file_segments.end(), file_segments.begin(), file_segments.end()); - file_segments.clear(); - } - private: FileSegments file_segments{}; const bool complete_on_dtor = true; From c2fc0713f2882e3ac09cc1f1c290180a29de180f Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Wed, 7 Jun 2023 21:08:48 +0200 Subject: [PATCH 0633/1072] Update FileCache_fwd.h --- src/Interpreters/Cache/FileCache_fwd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Interpreters/Cache/FileCache_fwd.h b/src/Interpreters/Cache/FileCache_fwd.h index dbb85fa0e7a..01f518d0c4e 100644 --- a/src/Interpreters/Cache/FileCache_fwd.h +++ b/src/Interpreters/Cache/FileCache_fwd.h @@ -5,7 +5,7 @@ namespace DB { static constexpr int FILECACHE_DEFAULT_MAX_FILE_SEGMENT_SIZE = 8 * 1024 * 1024; -static constexpr int FILECACHE_DEFAULT_MAX_ELEMENTS = 1024 * 1024; +static constexpr int FILECACHE_DEFAULT_MAX_ELEMENTS = 10000000; static constexpr int FILECACHE_DEFAULT_HITS_THRESHOLD = 0; static constexpr size_t FILECACHE_BYPASS_THRESHOLD = 256 * 1024 * 1024; static constexpr size_t FILECACHE_DELAYED_CLEANUP_INTERVAL_MS = 1000 * 60; /// 1 min From 1e4bcd0d5d6ed04a02ad27ebb89ebd46f8b46e4c Mon Sep 17 00:00:00 2001 From: Alexander Gololobov <440544+davenger@users.noreply.github.com> Date: Wed, 7 Jun 2023 21:44:57 +0200 Subject: [PATCH 0634/1072] Test to repro asan issue --- ...conditions_to_prewhere_analyzer_asan.reference | 4 ++++ ...e_all_conditions_to_prewhere_analyzer_asan.sql | 15 +++++++++++++++ 2 files changed, 19 insertions(+) create mode 100644 tests/queries/0_stateless/02784_move_all_conditions_to_prewhere_analyzer_asan.reference create mode 100644 tests/queries/0_stateless/02784_move_all_conditions_to_prewhere_analyzer_asan.sql diff --git a/tests/queries/0_stateless/02784_move_all_conditions_to_prewhere_analyzer_asan.reference b/tests/queries/0_stateless/02784_move_all_conditions_to_prewhere_analyzer_asan.reference new file mode 100644 index 00000000000..a08e8a1d440 --- /dev/null +++ b/tests/queries/0_stateless/02784_move_all_conditions_to_prewhere_analyzer_asan.reference @@ -0,0 +1,4 @@ +0 0 +0 0 +0 0 +0 0 diff --git a/tests/queries/0_stateless/02784_move_all_conditions_to_prewhere_analyzer_asan.sql b/tests/queries/0_stateless/02784_move_all_conditions_to_prewhere_analyzer_asan.sql new file mode 100644 index 00000000000..44b9ce4fdc1 --- /dev/null +++ b/tests/queries/0_stateless/02784_move_all_conditions_to_prewhere_analyzer_asan.sql @@ -0,0 +1,15 @@ +DROP TABLE IF EXISTS t_02784; + +CREATE TABLE t_02784 (c1 UInt64, c2 UInt64) ENGINE=MergeTree() ORDER BY c1 SETTINGS min_bytes_for_wide_part=1; + +INSERT INTO t_02784 SELECT number, number FROM numbers(1); + +SET allow_experimental_analyzer=1; +SET move_all_conditions_to_prewhere=1; + +SELECT c1, c2 FROM t_02784 WHERE c1 = 0 AND c2 = 0; +SELECT c1, c2 FROM t_02784 WHERE c2 = 0 AND c1 = 0; +SELECT c2, c1 FROM t_02784 WHERE c1 = 0 AND c2 = 0; +SELECT c2, c1 FROM t_02784 WHERE c2 = 0 AND c1 = 0; + +DROP TABLE t_02784; From 03b031eb401cbb46a5c25c0d66b9a8fae169b447 Mon Sep 17 00:00:00 2001 From: Alexander Gololobov <440544+davenger@users.noreply.github.com> Date: Wed, 7 Jun 2023 21:48:54 +0200 Subject: [PATCH 0635/1072] Fix for using frame reference after stack was updated by previous optimizaton --- .../QueryPlan/Optimizations/optimizeTree.cpp | 45 ++++++++++--------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/src/Processors/QueryPlan/Optimizations/optimizeTree.cpp b/src/Processors/QueryPlan/Optimizations/optimizeTree.cpp index dd17c1b5a30..091eecf99e5 100644 --- a/src/Processors/QueryPlan/Optimizations/optimizeTree.cpp +++ b/src/Processors/QueryPlan/Optimizations/optimizeTree.cpp @@ -114,32 +114,35 @@ void optimizeTreeSecondPass(const QueryPlanOptimizationSettings & optimization_s while (!stack.empty()) { - auto & frame = stack.back(); - - if (frame.next_child == 0) { - has_reading_from_mt |= typeid_cast(frame.node->step.get()) != nullptr; + /// NOTE: frame cannot be safely used after adding new elements to stack + auto & frame = stack.back(); - if (optimization_settings.read_in_order) - optimizeReadInOrder(*frame.node, nodes); + if (frame.next_child == 0) + { + has_reading_from_mt |= typeid_cast(frame.node->step.get()) != nullptr; - if (optimization_settings.optimize_projection) - num_applied_projection += optimizeUseAggregateProjections(*frame.node, nodes); + if (optimization_settings.read_in_order) + optimizeReadInOrder(*frame.node, nodes); - if (optimization_settings.aggregation_in_order) - optimizeAggregationInOrder(*frame.node, nodes); + if (optimization_settings.optimize_projection) + num_applied_projection += optimizeUseAggregateProjections(*frame.node, nodes); - if (optimization_settings.distinct_in_order) - tryDistinctReadInOrder(frame.node); - } + if (optimization_settings.aggregation_in_order) + optimizeAggregationInOrder(*frame.node, nodes); - /// Traverse all children first. - if (frame.next_child < frame.node->children.size()) - { - auto next_frame = Frame{.node = frame.node->children[frame.next_child]}; - ++frame.next_child; - stack.push_back(next_frame); - continue; + if (optimization_settings.distinct_in_order) + tryDistinctReadInOrder(frame.node); + } + + /// Traverse all children first. + if (frame.next_child < frame.node->children.size()) + { + auto next_frame = Frame{.node = frame.node->children[frame.next_child]}; + ++frame.next_child; + stack.push_back(next_frame); + continue; + } } if (optimization_settings.optimize_projection) @@ -162,7 +165,7 @@ void optimizeTreeSecondPass(const QueryPlanOptimizationSettings & optimization_s optimizePrewhere(stack, nodes); optimizePrimaryKeyCondition(stack); - enableMemoryBoundMerging(*frame.node, nodes); + enableMemoryBoundMerging(*stack.back().node, nodes); stack.pop_back(); } From 92c87dedad043a17a1612f24b9fb43f175214b3f Mon Sep 17 00:00:00 2001 From: flynn Date: Thu, 8 Jun 2023 06:41:32 +0800 Subject: [PATCH 0636/1072] Add parallel state merge for some other combinator except If (#50413) * Add parallel state merge for some other combinator except If * add test * update test --- src/AggregateFunctions/AggregateFunctionArray.h | 7 +++++++ src/AggregateFunctions/AggregateFunctionMerge.h | 7 +++++++ src/AggregateFunctions/AggregateFunctionNull.h | 7 +++++++ src/AggregateFunctions/AggregateFunctionState.h | 7 +++++++ tests/performance/uniqExactIf.xml | 6 +++++- 5 files changed, 33 insertions(+), 1 deletion(-) diff --git a/src/AggregateFunctions/AggregateFunctionArray.h b/src/AggregateFunctions/AggregateFunctionArray.h index 21394e3ce05..7f38453f86b 100644 --- a/src/AggregateFunctions/AggregateFunctionArray.h +++ b/src/AggregateFunctions/AggregateFunctionArray.h @@ -141,6 +141,13 @@ public: nested_func->merge(place, rhs, arena); } + bool isAbleToParallelizeMerge() const override { return nested_func->isAbleToParallelizeMerge(); } + + void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, ThreadPool & thread_pool, Arena * arena) const override + { + nested_func->merge(place, rhs, thread_pool, arena); + } + void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional version) const override { nested_func->serialize(place, buf, version); diff --git a/src/AggregateFunctions/AggregateFunctionMerge.h b/src/AggregateFunctions/AggregateFunctionMerge.h index 0cb44259816..5b9e8e606af 100644 --- a/src/AggregateFunctions/AggregateFunctionMerge.h +++ b/src/AggregateFunctions/AggregateFunctionMerge.h @@ -110,6 +110,13 @@ public: nested_func->merge(place, rhs, arena); } + bool isAbleToParallelizeMerge() const override { return nested_func->isAbleToParallelizeMerge(); } + + void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, ThreadPool & thread_pool, Arena * arena) const override + { + nested_func->merge(place, rhs, thread_pool, arena); + } + void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional version) const override { nested_func->serialize(place, buf, version); diff --git a/src/AggregateFunctions/AggregateFunctionNull.h b/src/AggregateFunctions/AggregateFunctionNull.h index b817bad82fa..de7b190c949 100644 --- a/src/AggregateFunctions/AggregateFunctionNull.h +++ b/src/AggregateFunctions/AggregateFunctionNull.h @@ -148,6 +148,13 @@ public: nested_function->merge(nestedPlace(place), nestedPlace(rhs), arena); } + bool isAbleToParallelizeMerge() const override { return nested_function->isAbleToParallelizeMerge(); } + + void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, ThreadPool & thread_pool, Arena * arena) const override + { + nested_function->merge(nestedPlace(place), nestedPlace(rhs), thread_pool, arena); + } + void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional version) const override { bool flag = getFlag(place); diff --git a/src/AggregateFunctions/AggregateFunctionState.h b/src/AggregateFunctions/AggregateFunctionState.h index 625fe1f36bc..8335d21cb1e 100644 --- a/src/AggregateFunctions/AggregateFunctionState.h +++ b/src/AggregateFunctions/AggregateFunctionState.h @@ -91,6 +91,13 @@ public: nested_func->merge(place, rhs, arena); } + bool isAbleToParallelizeMerge() const override { return nested_func->isAbleToParallelizeMerge(); } + + void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, ThreadPool & thread_pool, Arena * arena) const override + { + nested_func->merge(place, rhs, thread_pool, arena); + } + void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional version) const override { nested_func->serialize(place, buf, version); diff --git a/tests/performance/uniqExactIf.xml b/tests/performance/uniqExactIf.xml index 409be257737..263fa75382c 100644 --- a/tests/performance/uniqExactIf.xml +++ b/tests/performance/uniqExactIf.xml @@ -1,3 +1,7 @@ - SELECT uniqExactIf(number, 1) FROM numbers_mt(1e6) + SELECT uniqExactIf(number, 1) FROM numbers_mt(1e7) + SELECT uniqExactState(number) FROM numbers_mt(1e7) Format Null + SELECT uniqExactArray([number]) FROM numbers_mt(1e7) Format Null + with (SELECT uniqExactState(number) FROM numbers_mt(1e7)) as a select uniqExactMerge(a) + SELECT uniqExactOrNull(number) FROM numbers_mt(1e7) From d3eb0805d44564fbbf093c5c1d39fbad532b0f59 Mon Sep 17 00:00:00 2001 From: Alexey Gerasimchuck Date: Wed, 7 Jun 2023 23:28:19 +0000 Subject: [PATCH 0637/1072] clang-tidy run + changes in docs --- docs/en/interfaces/cli.md | 12 ++++++------ docs/ru/interfaces/cli.md | 10 +++++----- src/Client/ConnectionString.cpp | 23 ++++++++++++----------- 3 files changed, 23 insertions(+), 22 deletions(-) diff --git a/docs/en/interfaces/cli.md b/docs/en/interfaces/cli.md index 94f1fbf9e41..ba54694faa9 100644 --- a/docs/en/interfaces/cli.md +++ b/docs/en/interfaces/cli.md @@ -198,9 +198,9 @@ If host is not specified, the default host will be used (localhost). If port is not specified, the default port will be used (9000). If database is not specified, the default database will be used. -User, password, and database can be specified in the connection string either in --user, --password, --database command line options. +User, password, and database can be specified in the connection string either in `--user`, `--password`, `--database` command line options. -The connection string must be specified in the first argument of clickhouse-client. The connection string can be combined with other [command-line-options](#command-line-options) except **--host(h)** and **--port**. +The connection string must be specified in the first argument of clickhouse-client. The connection string can be combined with other [command-line-options](#command-line-options) except `--host(h)` and `--port`. ### Multiple hosts {#connection_string_multiple_hosts} @@ -226,25 +226,25 @@ Connect to localhost using port 9000 in interactive, multiline mode. clickhouse-client "clickhouse://localhost:9000" -m ``` -Connect to localhost using port 9000 in interactive mode with the user specified in --user option. +Connect to localhost using port 9000 in interactive mode with the user specified in `--user` option. ``` bash clickhouse-client "clickhouse://localhost:9000" --user default ``` -Connect to localhost using port 9000 in interactive mode with database 'my_database' specified in command line option +Connect to localhost using port 9000 in interactive mode to `my_database` database specified in command line option ``` bash clickhouse-client "clickhouse://localhost:9000" --database my_database ``` -Connect to localhost using port 9000 in interactive mode with the database specified in the connection string. +Connect to localhost using port 9000 in interactive mode to `my_database` database specified in the connection string. ``` bash clickhouse-client "clickhouse://localhost:9000/my_database" ``` -Connect to localhost using port 9000 in interactive mode with a database specified in the connection string and a secure connection using shorthanded 's' URI parameter. +Connect to localhost using port 9000 in interactive mode to `my_database` database specified in the connection string and a secure connection using shorthanded 's' URI parameter. ```bash clickhouse-client "clickhouse://localhost/my_database?s" diff --git a/docs/ru/interfaces/cli.md b/docs/ru/interfaces/cli.md index 30cd9757ebb..5f119ad9544 100644 --- a/docs/ru/interfaces/cli.md +++ b/docs/ru/interfaces/cli.md @@ -151,9 +151,9 @@ clickhouse://[2001:db8::1234] Если port не указан, будет использоваться порт по умолчанию (9000). Если база данных не указана, будет использоваться база данных по умолчанию (default). -Пользователь, пароль и база данных могут быть указаны в строке подключения либо в опциях командной строки --user, --password, --database. +Пользователь, пароль и база данных могут быть указаны в строке подключения либо в опциях командной строки `--user`, `--password`, `--database`. -Строка подключения должна быть указана в первом аргументе clickhouse-client. Строка подключения может комбинироваться с другими [параметрами командной строки] (#command-line-options) кроме **--host(h)** и **--port**. +Строка подключения должна быть указана в первом аргументе clickhouse-client. Строка подключения может комбинироваться с другими [параметрами командной строки] (#command-line-options) кроме `--host (h)` и `--port`. ### Несколько хостов {#connection_string_multiple_hosts} @@ -185,19 +185,19 @@ clickhouse-client "clickhouse://localhost:9000" -m clickhouse-client "clickhouse://localhost:9000" --user default ``` -Подключиться к localhost, используя порт 9000 в интерактивном режиме с базой данных 'my_database', указанной в опции командной строки. +Подключиться к localhost, используя порт 9000 в интерактивном режиме с базой данных `my_database`, указанной в опции командной строки. ``` bash clickhouse-client "clickhouse://localhost:9000" --database my_database ``` -Подключиться к localhost через порт 9000 в интерактивном режиме с базой данных my_database, указанной в строке подключения. +Подключиться к localhost через порт 9000 в интерактивном режиме с базой данных `my_database`, указанной в строке подключения. ``` bash clickhouse-client "clickhouse://localhost:9000/my_database" ``` -Подключиться к localhost через порт 9000 в интерактивном режиме с базой данных, указанной в строке подключения, и безопасным соединением с использованием сокращенного параметра URI 's'. +Подключиться к localhost через порт 9000 в интерактивном режиме с базой данных `my_database`, указанной в строке подключения, и безопасным соединением, используя короткий вариант команды URI 's'. ``` bash clickhouse-client "clickhouse://localhost/my_database?s" diff --git a/src/Client/ConnectionString.cpp b/src/Client/ConnectionString.cpp index aeb1c1dca02..95fec5b52ee 100644 --- a/src/Client/ConnectionString.cpp +++ b/src/Client/ConnectionString.cpp @@ -128,20 +128,21 @@ bool tryParseConnectionString( else hosts_end_pos = hosts_or_user_info_end_pos; - auto hosts_end = hosts_end_pos != std::string_view::npos ? connection_string.begin() + hosts_end_pos - : connection_string.end(); + const auto * hosts_end = hosts_end_pos != std::string_view::npos ? connection_string.begin() + hosts_end_pos + : connection_string.end(); try { - // Poco::URI doesn't support several hosts in URI. - // Split string clickhouse:[user_info]host1:port1, ... , hostN:portN[database]?[query_parameters] - // into multiple string for each host: - // clickhouse:[user_info]host1:port1[database]?[query_parameters] - // ... - // clickhouse:[user_info]hostN:portN[database]?[query_parameters] + /** Poco::URI doesn't support several hosts in URI. + * Split string clickhouse:[user_info]host1:port1, ... , hostN:portN[database]?[query_parameters] + * into multiple string for each host: + * clickhouse:[user_info]host1:port1[database]?[query_parameters] + * ... + * clickhouse:[user_info]hostN:portN[database]?[query_parameters] + */ Poco::URI uri; - auto last_host_begin = connection_string.begin() + offset; - for (auto it = last_host_begin; it != hosts_end; ++it) + const auto * last_host_begin = connection_string.begin() + offset; + for (const auto * it = last_host_begin; it != hosts_end; ++it) { if (*it == ',') { @@ -198,7 +199,7 @@ bool tryParseConnectionString( } const auto & database_name = uri.getPath(); - size_t start_symbol = database_name.size() > 0u && database_name[0] == '/' ? 1u : 0u; + size_t start_symbol = !database_name.empty() && database_name[0] == '/' ? 1u : 0u; if (database_name.size() > start_symbol) { common_arguments.push_back("--database"); From cf24d70bfd54dd45606ad1c53a136495c317bb9f Mon Sep 17 00:00:00 2001 From: Alexey Gerasimchuck Date: Thu, 8 Jun 2023 02:20:50 +0000 Subject: [PATCH 0638/1072] minor documentation changes --- docs/en/interfaces/cli.md | 18 +++++++++--------- docs/ru/interfaces/cli.md | 10 +++++----- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/docs/en/interfaces/cli.md b/docs/en/interfaces/cli.md index ba54694faa9..c36887672c7 100644 --- a/docs/en/interfaces/cli.md +++ b/docs/en/interfaces/cli.md @@ -166,13 +166,13 @@ The connection string for clickhouse-client is presented in URI format: clickhouse://[user_info@][hosts_and_ports][/dbname][?query_parameters] ``` -where user_info is: ```user[:password]``` -and hosts_and_ports is a list of values: ```[host][:port],[host][:port]``` Port is not mandatory. -and query_parameters is a list of parameter[=value]: ```param_name[=value]¶m_name[=value]...``` value may not be required for some of parameters. Parameter names are case sensitive. +where user_info is: `user[:password]` +and hosts_and_ports is a list of values: `[host][:port],[host][:port]` Port is not mandatory. +and query_parameters is a list of parameter[=value]: `param_name[=value]¶m_name[=value]...` value may not be required for some of the parameters. Parameter names are case sensitive. Allowed query_parameters keys: -- **secure** or shorthanded **s** - no value. If specified, client will connect to the server over a secure connection (TLS). See **secure** in [command-line-options](#command-line-options) +- `secure` or shorthanded `s` - no value. If specified, client will connect to the server over a secure connection (TLS). See `secure` in [command-line-options](#command-line-options) These examples illustrate valid connection strings for clickhouse-client: @@ -210,11 +210,11 @@ If more than one host is supplied, or if a single host name is translated to mor ### Percent encoding {#connection_string_uri_percent_encoding} -Hosts, user name, password, database and query parameters should be [Percent-Encoded](https://en.wikipedia.org/wiki/URL_encoding) if values contain URI invalid characters. +Hosts, user name, password, database, and query parameters should be [Percent-Encoded](https://en.wikipedia.org/wiki/URL_encoding) if values contain invalid URI characters. ### Examples {#connection_string_examples} -Connect to localhost using port 9000 and executes the query "SELECT 1". +Connect to localhost using port 9000 and execute the query "SELECT 1". ``` bash clickhouse-client "clickhouse://localhost:9000" --query "SELECT 1" @@ -232,7 +232,7 @@ Connect to localhost using port 9000 in interactive mode with the user specified clickhouse-client "clickhouse://localhost:9000" --user default ``` -Connect to localhost using port 9000 in interactive mode to `my_database` database specified in command line option +Connect to localhost using port 9000 in interactive mode to `my_database` database specified in the command line option. ``` bash clickhouse-client "clickhouse://localhost:9000" --database my_database @@ -250,7 +250,7 @@ Connect to localhost using port 9000 in interactive mode to `my_database` databa clickhouse-client "clickhouse://localhost/my_database?s" ``` -Connect to default host using the default port, default user, and default database. +Connect to default host using default port, default user, and default database. ``` bash clickhouse-client "clickhouse:" @@ -262,7 +262,7 @@ Connect to the default host using the default port, using user user_name and no clickhouse-client "clickhouse://user_name@" ``` -Connect to localhost using email user name. Symbol '@' is percent encoded to '%40'. +Connect to localhost using email as the user name. `@` symbol is percent encoded to `%40`. ``` bash clickhouse-client "clickhouse://some_user%40some_mail.com@localhost:9000" diff --git a/docs/ru/interfaces/cli.md b/docs/ru/interfaces/cli.md index 5f119ad9544..801a72e48ec 100644 --- a/docs/ru/interfaces/cli.md +++ b/docs/ru/interfaces/cli.md @@ -118,14 +118,14 @@ $ clickhouse-client --param_tbl="numbers" --param_db="system" --param_col="numbe clickhouse://[user_info@][hosts_and_ports][/dbname][?query_parameters] ``` -где user_info - это: ```user[:password]``` -hosts_and_ports - это список значений: ```[host][:port],[host][:port]```. Port может быть не задан. -query_parameters - это список пар ключ[=значение]: ```param_name[=value]¶m_name[=value]...```. Значение может быть пустым. +где user_info - это: `user[:password]` +hosts_and_ports - это список значений: `[host][:port],[host][:port]`. Port может быть не задан. +query_parameters - это список пар ключ[=значение]: `param_name[=value]¶m_name[=value]...`. Значение может быть пустым. Имена параметров чувствительны к регистру. Допустимые ключи query_parameters: -- **secure** или сокращенно **s** - без значение. Если параметр указан, то соединение с сервером будет осуществляться по защищенному каналу (TLS). См. **secure** в [command-line-options](#command-line-options). +- `secure` или сокращенно `s` - без значение. Если параметр указан, то соединение с сервером будет осуществляться по защищенному каналу (TLS). См. `secure` в [command-line-options](#command-line-options). Эти примеры иллюстрируют допустимые строки подключения для clickhouse-client: @@ -215,7 +215,7 @@ clickhouse-client "clickhouse:" clickhouse-client "clickhouse://user_name@" ``` -Подключиться к localhost, используя электронную почту, как имя пользователя. Символ '@' закодирован как '%40'. +Подключиться к localhost, используя электронную почту, как имя пользователя. Символ `@` закодирован как `%40`. ``` bash clickhouse-client "clickhouse://some_user%40some_mail.com@localhost:9000" From 286f3b247b33b08b17bd76320604d9b2b1c282b4 Mon Sep 17 00:00:00 2001 From: Derek Chia Date: Thu, 8 Jun 2023 11:28:44 +0800 Subject: [PATCH 0639/1072] Update settings.md --- docs/en/operations/settings/settings.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 8e2cd8d6027..3968751d5ee 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -3155,7 +3155,7 @@ Possible values: - Positive integer. - 0 or 1 — Disabled. `SELECT` queries are executed in a single thread. -Default value: the number of physical CPU cores. +Default value: `max_threads`. ## opentelemetry_start_trace_probability {#opentelemetry-start-trace-probability} From 7263769d20a30747f7a80a45a6def3abf41cccfa Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Thu, 8 Jun 2023 06:12:54 +0000 Subject: [PATCH 0640/1072] Add constexpr / fix date check --- src/Functions/DateTimeTransforms.h | 8 ++------ .../0_stateless/01746_convert_type_with_default.reference | 1 - .../0_stateless/01746_convert_type_with_default.sql | 1 - 3 files changed, 2 insertions(+), 8 deletions(-) diff --git a/src/Functions/DateTimeTransforms.h b/src/Functions/DateTimeTransforms.h index 823272e0324..09b0d71daf8 100644 --- a/src/Functions/DateTimeTransforms.h +++ b/src/Functions/DateTimeTransforms.h @@ -1449,18 +1449,14 @@ struct Transformer { bool check_range_result = true; - if constexpr (std::is_same_v) - { - check_range_result = vec_from[i] >= 0 && vec_from[i] <= DATE_LUT_MAX_DAY_NUM; - } - else if constexpr (std::is_same_v) + if constexpr (std::is_same_v || std::is_same_v) { check_range_result = vec_from[i] >= 0 && vec_from[i] <= 0xFFFFFFFFL; } if (!check_range_result) { - if (std::is_same_v) + if constexpr (std::is_same_v) { vec_to[i] = 0; if (vec_null_map_to) diff --git a/tests/queries/0_stateless/01746_convert_type_with_default.reference b/tests/queries/0_stateless/01746_convert_type_with_default.reference index 85bf2064fdc..959ee29b5e7 100644 --- a/tests/queries/0_stateless/01746_convert_type_with_default.reference +++ b/tests/queries/0_stateless/01746_convert_type_with_default.reference @@ -45,7 +45,6 @@ 2023-05-30 14:38:20 2023-05-30 14:38:20 2023-05-30 14:38:20 -2023-05-30 14:38:20 1970-01-01 00:00:19 1970-01-01 00:00:19 1970-01-01 00:00:19 diff --git a/tests/queries/0_stateless/01746_convert_type_with_default.sql b/tests/queries/0_stateless/01746_convert_type_with_default.sql index 1065eefa94d..099652a8a39 100644 --- a/tests/queries/0_stateless/01746_convert_type_with_default.sql +++ b/tests/queries/0_stateless/01746_convert_type_with_default.sql @@ -56,7 +56,6 @@ select toDateOrDefault(cast(19 as UInt256)); select toDateOrDefault(19507, '2000-01-01'::Date); select toDateOrDefault(-1, '2023-05-30'::Date); -select toDateTimeOrDefault('2023-05-30 14:38:20'); select toDateTimeOrDefault('2023-05-30 14:38:20', 'UTC'); select toDateTimeOrDefault('1xxx', 'UTC', '2023-05-30 14:38:20'::DateTime('UTC')); select toDateTimeOrDefault(1685457500, 'UTC'); From f5816c27aa3c675c3cd02ce292675d2214e5e56f Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Thu, 8 Jun 2023 06:20:14 +0000 Subject: [PATCH 0641/1072] Use hex value in tests --- tests/queries/0_stateless/01601_accurate_cast.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/01601_accurate_cast.sql b/tests/queries/0_stateless/01601_accurate_cast.sql index f7f4d588ccc..5555129f0ad 100644 --- a/tests/queries/0_stateless/01601_accurate_cast.sql +++ b/tests/queries/0_stateless/01601_accurate_cast.sql @@ -24,13 +24,13 @@ SELECT accurateCast('123', 'FixedString(2)'); -- { serverError 131 } SELECT accurateCast('12', 'FixedString(2)'); SELECT accurateCast(-1, 'DateTime'); -- { serverError 70 } -SELECT accurateCast(5000000000, 'DateTime'); -- { serverError 70 } +SELECT accurateCast(0xFFFFFFFF + 1, 'DateTime'); -- { serverError 70 } SELECT accurateCast('1xxx', 'DateTime'); -- { serverError 41 } SELECT accurateCast('2023-05-30 14:38:20', 'DateTime'); SELECT toString(accurateCast(19, 'DateTime'), 'UTC'); SELECT accurateCast(-1, 'Date'); -- { serverError 70 } -SELECT accurateCast(999999, 'Date'); -- { serverError 70 } +SELECT accurateCast(0xFFFFFFFF + 1, 'Date'); -- { serverError 70 } SELECT accurateCast('1xxx', 'Date'); -- { serverError 38 } SELECT accurateCast('2023-05-30', 'Date'); SELECT accurateCast(19, 'Date'); From 2297995c1a46240145df4b04ba7fff727944f5e7 Mon Sep 17 00:00:00 2001 From: vdimir Date: Thu, 8 Jun 2023 07:15:11 +0000 Subject: [PATCH 0642/1072] Disable grace_hash join in stress tests Until https://github.com/ClickHouse/ClickHouse/issues/50220 is fixed --- tests/ci/stress.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/ci/stress.py b/tests/ci/stress.py index b9044874071..e370ddbdd21 100755 --- a/tests/ci/stress.py +++ b/tests/ci/stress.py @@ -37,9 +37,6 @@ def get_options(i, upgrade_check): client_options.append("join_algorithm='partial_merge'") if join_alg_num % 5 == 2: client_options.append("join_algorithm='full_sorting_merge'") - if join_alg_num % 5 == 3 and not upgrade_check: - # Some crashes are not fixed in 23.2 yet, so ignore the setting in Upgrade check - client_options.append("join_algorithm='grace_hash'") if join_alg_num % 5 == 4: client_options.append("join_algorithm='auto'") client_options.append("max_rows_in_join=1000") From b878ddb35fce2247d657e3d2c2156023b4c67bdd Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Thu, 8 Jun 2023 11:02:09 +0200 Subject: [PATCH 0643/1072] Fix test --- tests/queries/0_stateless/02344_describe_cache.reference | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/02344_describe_cache.reference b/tests/queries/0_stateless/02344_describe_cache.reference index a803ca1fab1..d8a2ffab1fa 100644 --- a/tests/queries/0_stateless/02344_describe_cache.reference +++ b/tests/queries/0_stateless/02344_describe_cache.reference @@ -1,2 +1,2 @@ -134217728 1048576 8388608 1 0 0 0 /var/lib/clickhouse/caches/s3_cache/ 0 -134217728 1048576 104857600 0 0 0 0 /var/lib/clickhouse/caches/s3_cache_2/ 0 +134217728 10000000 8388608 1 0 0 0 /var/lib/clickhouse/caches/s3_cache/ 0 +134217728 10000000 104857600 0 0 0 0 /var/lib/clickhouse/caches/s3_cache_2/ 0 From 17a560cca7cd5bb71bf673fafea76695855b3c6a Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Thu, 8 Jun 2023 09:11:02 +0000 Subject: [PATCH 0644/1072] Add datetime test --- .../0_stateless/01746_convert_type_with_default.reference | 2 ++ tests/queries/0_stateless/01746_convert_type_with_default.sql | 3 +++ 2 files changed, 5 insertions(+) diff --git a/tests/queries/0_stateless/01746_convert_type_with_default.reference b/tests/queries/0_stateless/01746_convert_type_with_default.reference index 959ee29b5e7..541580d67f5 100644 --- a/tests/queries/0_stateless/01746_convert_type_with_default.reference +++ b/tests/queries/0_stateless/01746_convert_type_with_default.reference @@ -39,6 +39,8 @@ 1970-01-20 1970-01-20 1970-01-20 +2149-06-06 +1970-01-01 2023-05-30 2023-05-30 2023-05-30 14:38:20 diff --git a/tests/queries/0_stateless/01746_convert_type_with_default.sql b/tests/queries/0_stateless/01746_convert_type_with_default.sql index 099652a8a39..2620780cfb9 100644 --- a/tests/queries/0_stateless/01746_convert_type_with_default.sql +++ b/tests/queries/0_stateless/01746_convert_type_with_default.sql @@ -53,6 +53,9 @@ select toDateOrDefault(cast(19 as UInt128)); select toDateOrDefault(cast(19 as Int256)); select toDateOrDefault(cast(19 as UInt256)); +select toDateOrDefault(65535); +select toDateOrDefault(65536); + select toDateOrDefault(19507, '2000-01-01'::Date); select toDateOrDefault(-1, '2023-05-30'::Date); From 15b6651df6f10e7f623ec74dd3aaa42b82df90a5 Mon Sep 17 00:00:00 2001 From: vdimir Date: Thu, 8 Jun 2023 09:40:11 +0000 Subject: [PATCH 0645/1072] Revert "Merge pull request #49816 from bigo-sg/grace_hash_reserve_hash_table" This reverts commit 3f892ceb12b868b1b9deb0607a7df2dde5f1a139, reversing changes made to 32ffa2ae0b781af5c7ae3e5cbf975d3e9b74d86f. --- src/Interpreters/GraceHashJoin.cpp | 17 +++++------------ src/Interpreters/GraceHashJoin.h | 3 +-- src/Interpreters/HashJoin.cpp | 8 +++----- src/Interpreters/HashJoin.h | 30 ++---------------------------- 4 files changed, 11 insertions(+), 47 deletions(-) diff --git a/src/Interpreters/GraceHashJoin.cpp b/src/Interpreters/GraceHashJoin.cpp index f54ee9d85c7..4a4c69ff473 100644 --- a/src/Interpreters/GraceHashJoin.cpp +++ b/src/Interpreters/GraceHashJoin.cpp @@ -571,13 +571,7 @@ IBlocksStreamPtr GraceHashJoin::getDelayedBlocks() size_t bucket_idx = current_bucket->idx; - size_t prev_keys_num = 0; - // If there is only one bucket, don't take this check. - if (hash_join && buckets.size() > 1) - { - // Use previous hash_join's keys number to estimate next hash_join's size is reasonable. - prev_keys_num = hash_join->getTotalRowCount(); - } + hash_join = makeInMemoryJoin(); for (bucket_idx = bucket_idx + 1; bucket_idx < buckets.size(); ++bucket_idx) { @@ -591,7 +585,6 @@ IBlocksStreamPtr GraceHashJoin::getDelayedBlocks() continue; } - hash_join = makeInMemoryJoin(prev_keys_num); auto right_reader = current_bucket->startJoining(); size_t num_rows = 0; /// count rows that were written and rehashed while (Block block = right_reader.read()) @@ -611,9 +604,9 @@ IBlocksStreamPtr GraceHashJoin::getDelayedBlocks() return nullptr; } -GraceHashJoin::InMemoryJoinPtr GraceHashJoin::makeInMemoryJoin(size_t reserve_num) +GraceHashJoin::InMemoryJoinPtr GraceHashJoin::makeInMemoryJoin() { - return std::make_unique(table_join, right_sample_block, any_take_last_row, reserve_num); + return std::make_unique(table_join, right_sample_block, any_take_last_row); } Block GraceHashJoin::prepareRightBlock(const Block & block) @@ -653,7 +646,6 @@ void GraceHashJoin::addJoinedBlockImpl(Block block) if (!current_block.rows()) return; } - auto prev_keys_num = hash_join->getTotalRowCount(); hash_join->addJoinedBlock(current_block, /* check_limits = */ false); if (!hasMemoryOverflow(hash_join)) @@ -662,6 +654,7 @@ void GraceHashJoin::addJoinedBlockImpl(Block block) current_block = {}; auto right_blocks = hash_join->releaseJoinedBlocks(/* restructure */ false); + hash_join = nullptr; buckets_snapshot = rehashBuckets(buckets_snapshot.size() * 2); @@ -681,7 +674,7 @@ void GraceHashJoin::addJoinedBlockImpl(Block block) current_block = concatenateBlocks(current_blocks); } - hash_join = makeInMemoryJoin(prev_keys_num); + hash_join = makeInMemoryJoin(); if (current_block.rows() > 0) hash_join->addJoinedBlock(current_block, /* check_limits = */ false); diff --git a/src/Interpreters/GraceHashJoin.h b/src/Interpreters/GraceHashJoin.h index ec611f373ed..eb39ee09208 100644 --- a/src/Interpreters/GraceHashJoin.h +++ b/src/Interpreters/GraceHashJoin.h @@ -90,8 +90,7 @@ public: private: void initBuckets(); /// Create empty join for in-memory processing. - /// reserve_num for reserving space in hash table. - InMemoryJoinPtr makeInMemoryJoin(size_t reserve_num = 0); + InMemoryJoinPtr makeInMemoryJoin(); /// Add right table block to the @join. Calls @rehash on overflow. void addJoinedBlockImpl(Block block); diff --git a/src/Interpreters/HashJoin.cpp b/src/Interpreters/HashJoin.cpp index 146b57049a6..0af33a8bd20 100644 --- a/src/Interpreters/HashJoin.cpp +++ b/src/Interpreters/HashJoin.cpp @@ -217,7 +217,7 @@ static void correctNullabilityInplace(ColumnWithTypeAndName & column, bool nulla JoinCommon::removeColumnNullability(column); } -HashJoin::HashJoin(std::shared_ptr table_join_, const Block & right_sample_block_, bool any_take_last_row_, size_t reserve_num) +HashJoin::HashJoin(std::shared_ptr table_join_, const Block & right_sample_block_, bool any_take_last_row_) : table_join(table_join_) , kind(table_join->kind()) , strictness(table_join->strictness()) @@ -302,7 +302,7 @@ HashJoin::HashJoin(std::shared_ptr table_join_, const Block & right_s } for (auto & maps : data->maps) - dataMapInit(maps, reserve_num); + dataMapInit(maps); } HashJoin::Type HashJoin::chooseMethod(JoinKind kind, const ColumnRawPtrs & key_columns, Sizes & key_sizes) @@ -454,15 +454,13 @@ struct KeyGetterForType using Type = typename KeyGetterForTypeImpl::Type; }; -void HashJoin::dataMapInit(MapsVariant & map, size_t reserve_num) +void HashJoin::dataMapInit(MapsVariant & map) { if (kind == JoinKind::Cross) return; joinDispatchInit(kind, strictness, map); joinDispatch(kind, strictness, map, [&](auto, auto, auto & map_) { map_.create(data->type); }); - if (reserve_num) - joinDispatch(kind, strictness, map, [&](auto, auto, auto & map_) { map_.reserve(data->type, reserve_num); }); } bool HashJoin::empty() const diff --git a/src/Interpreters/HashJoin.h b/src/Interpreters/HashJoin.h index 58e47432d41..50eda4482bd 100644 --- a/src/Interpreters/HashJoin.h +++ b/src/Interpreters/HashJoin.h @@ -146,7 +146,7 @@ public: class HashJoin : public IJoin { public: - HashJoin(std::shared_ptr table_join_, const Block & right_sample_block, bool any_take_last_row_ = false, size_t reserve_num = 0); + HashJoin(std::shared_ptr table_join_, const Block & right_sample_block, bool any_take_last_row_ = false); ~HashJoin() override; @@ -217,16 +217,6 @@ public: M(keys256) \ M(hashed) - /// Only for maps using hash table. - #define APPLY_FOR_HASH_JOIN_VARIANTS(M) \ - M(key32) \ - M(key64) \ - M(key_string) \ - M(key_fixed_string) \ - M(keys128) \ - M(keys256) \ - M(hashed) - /// Used for reading from StorageJoin and applying joinGet function #define APPLY_FOR_JOIN_VARIANTS_LIMITED(M) \ @@ -276,22 +266,6 @@ public: } } - void reserve(Type which, size_t num) - { - switch (which) - { - case Type::EMPTY: break; - case Type::CROSS: break; - case Type::key8: break; - case Type::key16: break; - - #define M(NAME) \ - case Type::NAME: NAME->reserve(num); break; - APPLY_FOR_HASH_JOIN_VARIANTS(M) - #undef M - } - } - size_t getTotalRowCount(Type which) const { switch (which) @@ -435,7 +409,7 @@ private: /// If set HashJoin instance is not available for modification (addJoinedBlock) TableLockHolder storage_join_lock = nullptr; - void dataMapInit(MapsVariant &, size_t); + void dataMapInit(MapsVariant &); void initRightBlockStructure(Block & saved_block_sample); From f0a1c8afa208cbc0ec7a0b056a7d2036d62e10e0 Mon Sep 17 00:00:00 2001 From: vdimir Date: Thu, 8 Jun 2023 09:40:41 +0000 Subject: [PATCH 0646/1072] Revert "Merge pull request #49483 from bigo-sg/grace_hash_full_join" This reverts commit fa93c388b1d17f3561c63e9265c8aefde1b2d5d0, reversing changes made to a2c0a65344f4197c6c2849f800423309fd4b22b4. --- docs/en/operations/settings/settings.md | 2 - src/Interpreters/GraceHashJoin.cpp | 55 +++++--------- src/Interpreters/GraceHashJoin.h | 3 +- .../Transforms/JoiningTransform.cpp | 73 ++----------------- src/Processors/Transforms/JoiningTransform.h | 25 +------ src/QueryPipeline/QueryPipelineBuilder.cpp | 2 +- ...01721_join_implicit_cast_long.reference.j2 | 40 ++++++++++ .../01721_join_implicit_cast_long.sql.j2 | 1 + .../02273_full_sort_join.reference.j2 | 18 ++++- .../0_stateless/02273_full_sort_join.sql.j2 | 4 +- ...274_full_sort_join_nodistinct.reference.j2 | 34 ++++++++- .../02274_full_sort_join_nodistinct.sql.j2 | 6 +- .../02275_full_sort_join_long.reference | 24 +----- .../02275_full_sort_join_long.sql.j2 | 9 ++- 14 files changed, 138 insertions(+), 158 deletions(-) diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 4f3b4e43358..f674fe1781e 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -452,8 +452,6 @@ Possible values: The first phase of a grace join reads the right table and splits it into N buckets depending on the hash value of key columns (initially, N is `grace_hash_join_initial_buckets`). This is done in a way to ensure that each bucket can be processed independently. Rows from the first bucket are added to an in-memory hash table while the others are saved to disk. If the hash table grows beyond the memory limit (e.g., as set by [`max_bytes_in_join`](/docs/en/operations/settings/query-complexity.md/#settings-max_bytes_in_join)), the number of buckets is increased and the assigned bucket for each row. Any rows which don’t belong to the current bucket are flushed and reassigned. - Supports `INNER/LEFT/RIGHT/FULL ALL/ANY JOIN`. - - hash [Hash join algorithm](https://en.wikipedia.org/wiki/Hash_join) is used. The most generic implementation that supports all combinations of kind and strictness and multiple join keys that are combined with `OR` in the `JOIN ON` section. diff --git a/src/Interpreters/GraceHashJoin.cpp b/src/Interpreters/GraceHashJoin.cpp index 4a4c69ff473..7795061072c 100644 --- a/src/Interpreters/GraceHashJoin.cpp +++ b/src/Interpreters/GraceHashJoin.cpp @@ -304,10 +304,8 @@ void GraceHashJoin::initBuckets() bool GraceHashJoin::isSupported(const std::shared_ptr & table_join) { - bool is_asof = (table_join->strictness() == JoinStrictness::Asof); - auto kind = table_join->kind(); - return !is_asof && (isInner(kind) || isLeft(kind) || isRight(kind) || isFull(kind)) && table_join->oneDisjunct(); + return !is_asof && isInnerOrLeft(table_join->kind()) && table_join->oneDisjunct(); } GraceHashJoin::~GraceHashJoin() = default; @@ -327,6 +325,7 @@ bool GraceHashJoin::hasMemoryOverflow(size_t total_rows, size_t total_bytes) con /// One row can't be split, avoid loop if (total_rows < 2) return false; + bool has_overflow = !table_join->sizeLimits().softCheck(total_rows, total_bytes); if (has_overflow) @@ -471,30 +470,18 @@ bool GraceHashJoin::alwaysReturnsEmptySet() const return hash_join_is_empty; } -/// Each bucket are handled by the following steps -/// 1. build hash_join by the right side blocks. -/// 2. join left side with the hash_join, -/// 3. read right non-joined blocks from hash_join. -/// buckets are handled one by one, each hash_join will not be release before the right non-joined blocks are emitted. -/// -/// There is a finished counter in JoiningTransform/DelayedJoinedBlocksWorkerTransform, -/// only one processor could take the non-joined blocks from right stream, and ensure all rows from -/// left stream have been emitted before this. -IBlocksStreamPtr -GraceHashJoin::getNonJoinedBlocks(const Block & left_sample_block_, const Block & result_sample_block_, UInt64 max_block_size_) const + +IBlocksStreamPtr GraceHashJoin::getNonJoinedBlocks(const Block &, const Block &, UInt64) const { - return hash_join->getNonJoinedBlocks(left_sample_block_, result_sample_block_, max_block_size_); + /// We do no support returning non joined blocks here. + /// TODO: They _should_ be reported by getDelayedBlocks instead + return nullptr; } class GraceHashJoin::DelayedBlocks : public IBlocksStream { public: - explicit DelayedBlocks( - size_t current_bucket_, - Buckets buckets_, - InMemoryJoinPtr hash_join_, - const Names & left_key_names_, - const Names & right_key_names_) + explicit DelayedBlocks(size_t current_bucket_, Buckets buckets_, InMemoryJoinPtr hash_join_, const Names & left_key_names_, const Names & right_key_names_) : current_bucket(current_bucket_) , buckets(std::move(buckets_)) , hash_join(std::move(hash_join_)) @@ -512,15 +499,12 @@ public: do { - // One DelayedBlocks is shared among multiple DelayedJoinedBlocksWorkerTransform. - // There is a lock inside left_reader.read(). block = left_reader.read(); if (!block) { return {}; } - // block comes from left_reader, need to join with right table to get the result. Blocks blocks = JoinCommon::scatterBlockByHash(left_key_names, block, num_buckets); block = std::move(blocks[current_idx]); @@ -571,6 +555,16 @@ IBlocksStreamPtr GraceHashJoin::getDelayedBlocks() size_t bucket_idx = current_bucket->idx; + if (hash_join) + { + auto right_blocks = hash_join->releaseJoinedBlocks(/* restructure */ false); + for (auto & block : right_blocks) + { + Blocks blocks = JoinCommon::scatterBlockByHash(right_key_names, block, buckets.size()); + flushBlocksToBuckets(blocks, buckets, bucket_idx); + } + } + hash_join = makeInMemoryJoin(); for (bucket_idx = bucket_idx + 1; bucket_idx < buckets.size(); ++bucket_idx) @@ -595,6 +589,7 @@ IBlocksStreamPtr GraceHashJoin::getDelayedBlocks() LOG_TRACE(log, "Loaded bucket {} with {}(/{}) rows", bucket_idx, hash_join->getTotalRowCount(), num_rows); + return std::make_unique(current_bucket->idx, buckets, hash_join, left_key_names, right_key_names); } @@ -634,18 +629,6 @@ void GraceHashJoin::addJoinedBlockImpl(Block block) if (!hash_join) hash_join = makeInMemoryJoin(); - // buckets size has been changed in other threads. Need to scatter current_block again. - // rehash could only happen under hash_join_mutex's scope. - auto current_buckets = getCurrentBuckets(); - if (buckets_snapshot.size() != current_buckets.size()) - { - LOG_TRACE(log, "mismatch buckets size. previous:{}, current:{}", buckets_snapshot.size(), getCurrentBuckets().size()); - Blocks blocks = JoinCommon::scatterBlockByHash(right_key_names, current_block, current_buckets.size()); - flushBlocksToBuckets(blocks, current_buckets, bucket_index); - current_block = std::move(blocks[bucket_index]); - if (!current_block.rows()) - return; - } hash_join->addJoinedBlock(current_block, /* check_limits = */ false); if (!hasMemoryOverflow(hash_join)) diff --git a/src/Interpreters/GraceHashJoin.h b/src/Interpreters/GraceHashJoin.h index eb39ee09208..b8d83f4cad0 100644 --- a/src/Interpreters/GraceHashJoin.h +++ b/src/Interpreters/GraceHashJoin.h @@ -13,6 +13,7 @@ namespace DB { + class TableJoin; class HashJoin; @@ -78,7 +79,7 @@ public: bool supportTotals() const override { return false; } IBlocksStreamPtr - getNonJoinedBlocks(const Block & left_sample_block_, const Block & result_sample_block_, UInt64 max_block_size) const override; + getNonJoinedBlocks(const Block & left_sample_block, const Block & result_sample_block, UInt64 max_block_size) const override; /// Open iterator over joined blocks. /// Must be called after all @joinBlock calls. diff --git a/src/Processors/Transforms/JoiningTransform.cpp b/src/Processors/Transforms/JoiningTransform.cpp index 256ef66a27d..bba8ec6fa16 100644 --- a/src/Processors/Transforms/JoiningTransform.cpp +++ b/src/Processors/Transforms/JoiningTransform.cpp @@ -189,6 +189,7 @@ void JoiningTransform::transform(Chunk & chunk) } else block = readExecute(chunk); + auto num_rows = block.rows(); chunk.setColumns(block.getColumns(), num_rows); } @@ -310,16 +311,8 @@ void FillingRightJoinSideTransform::work() } -DelayedJoinedBlocksWorkerTransform::DelayedJoinedBlocksWorkerTransform( - Block left_header_, - Block output_header_, - size_t max_block_size_, - JoinPtr join_) - : IProcessor(InputPorts{Block()}, OutputPorts{output_header_}) - , left_header(left_header_) - , output_header(output_header_) - , max_block_size(max_block_size_) - , join(join_) +DelayedJoinedBlocksWorkerTransform::DelayedJoinedBlocksWorkerTransform(Block output_header) + : IProcessor(InputPorts{Block()}, OutputPorts{output_header}) { } @@ -373,7 +366,6 @@ IProcessor::Status DelayedJoinedBlocksWorkerTransform::prepare() if (!data.chunk.hasChunkInfo()) throw Exception(ErrorCodes::LOGICAL_ERROR, "DelayedJoinedBlocksWorkerTransform must have chunk info"); task = std::dynamic_pointer_cast(data.chunk.getChunkInfo()); - } else { @@ -394,24 +386,12 @@ void DelayedJoinedBlocksWorkerTransform::work() { if (!task) return; - Block block; - if (!left_delayed_stream_finished) - { - block = task->delayed_blocks->next(); - if (!block) - { - left_delayed_stream_finished = true; - block = nextNonJoinedBlock(); - } - } - else - { - block = nextNonJoinedBlock(); - } + Block block = task->delayed_blocks->next(); + if (!block) { - resetTask(); + task.reset(); return; } @@ -420,38 +400,6 @@ void DelayedJoinedBlocksWorkerTransform::work() output_chunk.setColumns(block.getColumns(), rows); } -void DelayedJoinedBlocksWorkerTransform::resetTask() -{ - task.reset(); - left_delayed_stream_finished = false; - setup_non_joined_stream = false; - non_joined_delayed_stream = nullptr; -} - -Block DelayedJoinedBlocksWorkerTransform::nextNonJoinedBlock() -{ - if (!setup_non_joined_stream) - { - setup_non_joined_stream = true; - // Before read from non-joined stream, all blocks in left file reader must have been joined. - // For example, in HashJoin, it may return invalid mismatch rows from non-joined stream before - // the all blocks in left file reader have been finished, since the used flags are incomplete. - // To make only one processor could read from non-joined stream seems be a easy way. - if (task && task->left_delayed_stream_finish_counter->isLast()) - { - if (!non_joined_delayed_stream) - { - non_joined_delayed_stream = join->getNonJoinedBlocks(left_header, output_header, max_block_size); - } - } - } - if (non_joined_delayed_stream) - { - return non_joined_delayed_stream->next(); - } - return {}; -} - DelayedJoinedBlocksTransform::DelayedJoinedBlocksTransform(size_t num_streams, JoinPtr join_) : IProcessor(InputPorts{}, OutputPorts(num_streams, Block())) , join(std::move(join_)) @@ -485,9 +433,6 @@ IProcessor::Status DelayedJoinedBlocksTransform::prepare() if (finished) { - // Since have memory limit, cannot handle all buckets parallelly by different - // DelayedJoinedBlocksWorkerTransform. So send the same task to all outputs. - // Wait for all DelayedJoinedBlocksWorkerTransform be idle before getting next bucket. for (auto & output : outputs) { if (output.isFinished()) @@ -503,14 +448,10 @@ IProcessor::Status DelayedJoinedBlocksTransform::prepare() if (delayed_blocks) { - // This counter is used to ensure that only the last DelayedJoinedBlocksWorkerTransform - // could read right non-joined blocks from the join. - auto left_delayed_stream_finished_counter = std::make_shared(outputs.size()); for (auto & output : outputs) { Chunk chunk; - auto task = std::make_shared(delayed_blocks, left_delayed_stream_finished_counter); - chunk.setChunkInfo(task); + chunk.setChunkInfo(std::make_shared(delayed_blocks)); output.push(std::move(chunk)); } delayed_blocks = nullptr; diff --git a/src/Processors/Transforms/JoiningTransform.h b/src/Processors/Transforms/JoiningTransform.h index 3577906b26a..e7edff40c56 100644 --- a/src/Processors/Transforms/JoiningTransform.h +++ b/src/Processors/Transforms/JoiningTransform.h @@ -116,14 +116,9 @@ class DelayedBlocksTask : public ChunkInfo public: explicit DelayedBlocksTask() : finished(true) {} - explicit DelayedBlocksTask(IBlocksStreamPtr delayed_blocks_, JoiningTransform::FinishCounterPtr left_delayed_stream_finish_counter_) - : delayed_blocks(std::move(delayed_blocks_)) - , left_delayed_stream_finish_counter(left_delayed_stream_finish_counter_) - { - } + explicit DelayedBlocksTask(IBlocksStreamPtr delayed_blocks_) : delayed_blocks(std::move(delayed_blocks_)) {} IBlocksStreamPtr delayed_blocks = nullptr; - JoiningTransform::FinishCounterPtr left_delayed_stream_finish_counter = nullptr; bool finished = false; }; @@ -152,11 +147,7 @@ private: class DelayedJoinedBlocksWorkerTransform : public IProcessor { public: - explicit DelayedJoinedBlocksWorkerTransform( - Block left_header_, - Block output_header_, - size_t max_block_size_, - JoinPtr join_); + explicit DelayedJoinedBlocksWorkerTransform(Block output_header); String getName() const override { return "DelayedJoinedBlocksWorkerTransform"; } @@ -164,20 +155,10 @@ public: void work() override; private: - Block left_header; - Block output_header; - size_t max_block_size; - JoinPtr join; DelayedBlocksTaskPtr task; Chunk output_chunk; - /// All joined and non-joined rows from left stream are emitted, only right non-joined rows are left - bool left_delayed_stream_finished = false; - bool setup_non_joined_stream = false; - IBlocksStreamPtr non_joined_delayed_stream = nullptr; - - void resetTask(); - Block nextNonJoinedBlock(); + bool finished = false; }; } diff --git a/src/QueryPipeline/QueryPipelineBuilder.cpp b/src/QueryPipeline/QueryPipelineBuilder.cpp index 764997e7b7e..a4edf107b2f 100644 --- a/src/QueryPipeline/QueryPipelineBuilder.cpp +++ b/src/QueryPipeline/QueryPipelineBuilder.cpp @@ -491,7 +491,7 @@ std::unique_ptr QueryPipelineBuilder::joinPipelinesRightLe if (delayed_root) { // Process delayed joined blocks when all JoiningTransform are finished. - auto delayed = std::make_shared(left_header, joined_header, max_block_size, join); + auto delayed = std::make_shared(joined_header); if (delayed->getInputs().size() != 1 || delayed->getOutputs().size() != 1) throw Exception(ErrorCodes::LOGICAL_ERROR, "DelayedJoinedBlocksWorkerTransform should have one input and one output"); diff --git a/tests/queries/0_stateless/01721_join_implicit_cast_long.reference.j2 b/tests/queries/0_stateless/01721_join_implicit_cast_long.reference.j2 index ae43aa7195c..e9f32087439 100644 --- a/tests/queries/0_stateless/01721_join_implicit_cast_long.reference.j2 +++ b/tests/queries/0_stateless/01721_join_implicit_cast_long.reference.j2 @@ -1,6 +1,7 @@ {% for join_algorithm in ['hash', 'partial_merge', 'auto', 'full_sorting_merge', 'grace_hash'] -%} === {{ join_algorithm }} === = full = +{% if join_algorithm not in ['grace_hash'] -%} -4 0 196 -3 0 197 -2 0 198 @@ -16,6 +17,7 @@ 8 108 \N 9 109 \N 10 110 \N +{% endif -%} = left = 1 101 201 2 102 202 @@ -28,6 +30,7 @@ 9 109 \N 10 110 \N = right = +{% if join_algorithm not in ['grace_hash'] -%} -4 0 196 -3 0 197 -2 0 198 @@ -38,6 +41,7 @@ 3 103 203 4 104 204 5 105 205 +{% endif -%} = inner = 1 101 201 2 102 202 @@ -45,6 +49,7 @@ 4 104 204 5 105 205 = full = +{% if join_algorithm not in ['grace_hash'] -%} 0 0 -4 0 0 -3 0 0 -2 @@ -60,6 +65,7 @@ 8 8 0 9 9 0 10 10 0 +{% endif -%} = left = 1 1 1 2 2 2 @@ -72,6 +78,7 @@ 9 9 0 10 10 0 = right = +{% if join_algorithm not in ['grace_hash'] -%} 0 0 -4 0 0 -3 0 0 -2 @@ -82,6 +89,7 @@ 3 3 3 4 4 4 5 5 5 +{% endif -%} = inner = 1 1 1 2 2 2 @@ -90,6 +98,7 @@ 5 5 5 = join on = = full = +{% if join_algorithm not in ['grace_hash'] -%} 0 0 -4 196 0 0 -3 197 0 0 -2 198 @@ -105,6 +114,7 @@ 8 108 0 \N 9 109 0 \N 10 110 0 \N +{% endif -%} = left = 1 101 1 201 2 102 2 202 @@ -117,6 +127,7 @@ 9 109 0 \N 10 110 0 \N = right = +{% if join_algorithm not in ['grace_hash'] -%} 0 0 -4 196 0 0 -3 197 0 0 -2 198 @@ -127,6 +138,7 @@ 3 103 3 203 4 104 4 204 5 105 5 205 +{% endif -%} = inner = 1 101 1 201 2 102 2 202 @@ -134,6 +146,7 @@ 4 104 4 204 5 105 5 205 = full = +{% if join_algorithm not in ['grace_hash'] -%} 0 0 -4 196 0 0 -3 197 0 0 -2 198 @@ -149,6 +162,7 @@ 8 108 0 \N 9 109 0 \N 10 110 0 \N +{% endif -%} = left = 1 101 1 201 2 102 2 202 @@ -161,6 +175,7 @@ 9 109 0 \N 10 110 0 \N = right = +{% if join_algorithm not in ['grace_hash'] -%} 0 0 -4 196 0 0 -3 197 0 0 -2 198 @@ -171,6 +186,7 @@ 3 103 3 203 4 104 4 204 5 105 5 205 +{% endif -%} = inner = 1 101 1 201 2 102 2 202 @@ -180,6 +196,7 @@ = agg = 1 1 +{% if join_algorithm not in ['grace_hash'] -%} 1 1 1 @@ -188,11 +205,13 @@ 1 55 1055 0 0 -10 0 990 1 55 15 1055 1015 +{% endif -%} = types = 1 1 1 1 +{% if join_algorithm not in ['grace_hash'] -%} 1 1 1 @@ -200,9 +219,11 @@ 1 1 1 +{% endif -%} {% if join_algorithm not in ['full_sorting_merge'] -%} === join use nulls === = full = +{% if join_algorithm not in ['grace_hash'] -%} -4 \N 196 -3 \N 197 -2 \N 198 @@ -218,6 +239,7 @@ 8 108 \N 9 109 \N 10 110 \N +{% endif -%} = left = 1 101 201 2 102 202 @@ -230,6 +252,7 @@ 9 109 \N 10 110 \N = right = +{% if join_algorithm not in ['grace_hash'] -%} -4 \N 196 -3 \N 197 -2 \N 198 @@ -240,6 +263,7 @@ 3 103 203 4 104 204 5 105 205 +{% endif -%} = inner = 1 101 201 2 102 202 @@ -247,6 +271,7 @@ 4 104 204 5 105 205 = full = +{% if join_algorithm not in ['grace_hash'] -%} 1 1 1 2 2 2 3 3 3 @@ -262,6 +287,7 @@ \N \N -2 \N \N -1 \N \N 0 +{% endif -%} = left = 1 1 1 2 2 2 @@ -274,6 +300,7 @@ 9 9 \N 10 10 \N = right = +{% if join_algorithm not in ['grace_hash'] -%} 1 1 1 2 2 2 3 3 3 @@ -284,6 +311,7 @@ \N \N -2 \N \N -1 \N \N 0 +{% endif -%} = inner = 1 1 1 2 2 2 @@ -292,6 +320,7 @@ 5 5 5 = join on = = full = +{% if join_algorithm not in ['grace_hash'] -%} 1 101 1 201 2 102 2 202 3 103 3 203 @@ -307,6 +336,7 @@ \N \N -2 198 \N \N -1 199 \N \N 0 200 +{% endif -%} = left = 1 101 1 201 2 102 2 202 @@ -319,6 +349,7 @@ 9 109 \N \N 10 110 \N \N = right = +{% if join_algorithm not in ['grace_hash'] -%} 1 101 1 201 2 102 2 202 3 103 3 203 @@ -329,6 +360,7 @@ \N \N -2 198 \N \N -1 199 \N \N 0 200 +{% endif -%} = inner = 1 101 1 201 2 102 2 202 @@ -336,6 +368,7 @@ 4 104 4 204 5 105 5 205 = full = +{% if join_algorithm not in ['grace_hash'] -%} 1 101 1 201 2 102 2 202 3 103 3 203 @@ -351,6 +384,7 @@ \N \N -2 198 \N \N -1 199 \N \N 0 200 +{% endif -%} = left = 1 101 1 201 2 102 2 202 @@ -363,6 +397,7 @@ 9 109 \N \N 10 110 \N \N = right = +{% if join_algorithm not in ['grace_hash'] -%} 1 101 1 201 2 102 2 202 3 103 3 203 @@ -373,6 +408,7 @@ \N \N -2 198 \N \N -1 199 \N \N 0 200 +{% endif -%} = inner = 1 101 1 201 2 102 2 202 @@ -382,6 +418,7 @@ = agg = 1 1 +{% if join_algorithm not in ['grace_hash'] -%} 1 1 1 @@ -390,11 +427,13 @@ 1 55 1055 1 55 15 1055 1015 \N \N -10 \N 990 +{% endif -%} = types = 1 1 1 1 +{% if join_algorithm not in ['grace_hash'] -%} 1 1 1 @@ -403,4 +442,5 @@ 1 1 {% endif -%} +{% endif -%} {% endfor -%} diff --git a/tests/queries/0_stateless/01721_join_implicit_cast_long.sql.j2 b/tests/queries/0_stateless/01721_join_implicit_cast_long.sql.j2 index 38f71f4c5ec..f5321939f28 100644 --- a/tests/queries/0_stateless/01721_join_implicit_cast_long.sql.j2 +++ b/tests/queries/0_stateless/01721_join_implicit_cast_long.sql.j2 @@ -10,6 +10,7 @@ INSERT INTO t1 SELECT number as a, 100 + number as b FROM system.numbers LIMIT 1 INSERT INTO t2 SELECT number - 5 as a, 200 + number - 5 as b FROM system.numbers LIMIT 1, 10; {% macro is_implemented(join_algorithm) -%} +{% if join_algorithm == 'grace_hash' %} -- { serverError NOT_IMPLEMENTED } {% endif %} {% endmacro -%} {% for join_algorithm in ['hash', 'partial_merge', 'auto', 'full_sorting_merge', 'grace_hash'] -%} diff --git a/tests/queries/0_stateless/02273_full_sort_join.reference.j2 b/tests/queries/0_stateless/02273_full_sort_join.reference.j2 index 0af4158e971..98bfd9d9b2b 100644 --- a/tests/queries/0_stateless/02273_full_sort_join.reference.j2 +++ b/tests/queries/0_stateless/02273_full_sort_join.reference.j2 @@ -1,7 +1,7 @@ {% set table_size = 15 -%} {% for join_algorithm in ['default', 'full_sorting_merge', 'grace_hash'] -%} -- {{ join_algorithm }} -- -{% for block_size in range(1, table_size + 1, 4) -%} +{% for block_size in range(1, table_size + 1) -%} ALL INNER USING | bs = {{ block_size }} 4 0 0 5 0 0 @@ -50,6 +50,7 @@ ALL LEFT | bs = {{ block_size }} 14 14 val9 0 14 14 val9 0 ALL RIGHT | bs = {{ block_size }} +{% if join_algorithm != 'grace_hash' -%} 4 4 0 val10 5 5 0 val6 6 6 0 val8 @@ -63,6 +64,7 @@ ALL RIGHT | bs = {{ block_size }} 13 13 0 val9 14 14 0 val3 14 14 0 val7 +{% endif -%} ALL INNER | bs = {{ block_size }} | copmosite key 2 2 2 2 2 2 0 0 2 2 2 2 2 2 0 0 @@ -83,6 +85,7 @@ ALL LEFT | bs = {{ block_size }} | copmosite key 2 2 2 2 2 2 val12 0 2 2 2 2 2 2 val9 0 ALL RIGHT | bs = {{ block_size }} | copmosite key +{% if join_algorithm != 'grace_hash' -%} 0 \N 0 1 1 1 1 val2 0 \N 0 1 1 1 1 val7 0 \N 0 1 1 2 1 val5 @@ -96,6 +99,7 @@ ALL RIGHT | bs = {{ block_size }} | copmosite key 0 \N 0 2 2 \N 1 val9 2 2 2 2 2 2 0 val4 2 2 2 2 2 2 0 val4 +{% endif -%} ANY INNER USING | bs = {{ block_size }} 4 0 0 5 0 0 @@ -133,6 +137,7 @@ ANY LEFT | bs = {{ block_size }} 13 13 val13 0 14 14 val9 0 ANY RIGHT | bs = {{ block_size }} +{% if join_algorithm != 'grace_hash' -%} 4 4 0 val10 5 5 0 val6 6 6 0 val8 @@ -145,6 +150,7 @@ ANY RIGHT | bs = {{ block_size }} 13 13 0 val9 14 14 0 val3 14 14 0 val7 +{% endif -%} ANY INNER | bs = {{ block_size }} | copmosite key 2 2 2 2 2 2 0 0 ANY LEFT | bs = {{ block_size }} | copmosite key @@ -164,6 +170,7 @@ ANY LEFT | bs = {{ block_size }} | copmosite key 2 2 2 2 2 2 val12 0 2 2 2 2 2 2 val9 0 ANY RIGHT | bs = {{ block_size }} | copmosite key +{% if join_algorithm != 'grace_hash' -%} 0 \N 0 1 1 1 1 val2 0 \N 0 1 1 1 1 val7 0 \N 0 1 1 2 1 val5 @@ -176,6 +183,7 @@ ANY RIGHT | bs = {{ block_size }} | copmosite key 0 \N 0 2 1 \N 1 val3 0 \N 0 2 2 \N 1 val9 2 2 2 2 2 2 0 val4 +{% endif -%} {% endfor -%} ALL INNER | join_use_nulls = 1 4 4 0 0 @@ -211,6 +219,7 @@ ALL LEFT | join_use_nulls = 1 14 14 val9 0 14 14 val9 0 ALL RIGHT | join_use_nulls = 1 +{% if join_algorithm != 'grace_hash' -%} 4 4 0 val10 5 5 0 val6 6 6 0 val8 @@ -224,6 +233,7 @@ ALL RIGHT | join_use_nulls = 1 13 13 0 val9 14 14 0 val3 14 14 0 val7 +{% endif -%} ALL INNER | join_use_nulls = 1 | copmosite key 2 2 2 2 2 2 0 0 2 2 2 2 2 2 0 0 @@ -244,6 +254,7 @@ ALL LEFT | join_use_nulls = 1 | copmosite key 2 2 2 2 2 2 val12 0 2 2 2 2 2 2 val9 0 ALL RIGHT | join_use_nulls = 1 | copmosite key +{% if join_algorithm != 'grace_hash' -%} 2 2 2 2 2 2 0 val4 2 2 2 2 2 2 0 val4 \N \N \N 1 1 1 \N val2 @@ -257,6 +268,7 @@ ALL RIGHT | join_use_nulls = 1 | copmosite key \N \N \N 2 1 2 \N val8 \N \N \N 2 1 \N \N val3 \N \N \N 2 2 \N \N val9 +{% endif -%} ANY INNER | join_use_nulls = 1 4 4 0 0 5 5 0 0 @@ -284,6 +296,7 @@ ANY LEFT | join_use_nulls = 1 13 13 val13 0 14 14 val9 0 ANY RIGHT | join_use_nulls = 1 +{% if join_algorithm != 'grace_hash' -%} 4 4 0 val10 5 5 0 val6 6 6 0 val8 @@ -296,6 +309,7 @@ ANY RIGHT | join_use_nulls = 1 13 13 0 val9 14 14 0 val3 14 14 0 val7 +{% endif -%} ANY INNER | join_use_nulls = 1 | copmosite key 2 2 2 2 2 2 0 0 ANY LEFT | join_use_nulls = 1 | copmosite key @@ -315,6 +329,7 @@ ANY LEFT | join_use_nulls = 1 | copmosite key 2 2 2 2 2 2 val12 0 2 2 2 2 2 2 val9 0 ANY RIGHT | join_use_nulls = 1 | copmosite key +{% if join_algorithm != 'grace_hash' -%} 2 2 2 2 2 2 0 val4 \N \N \N 1 1 1 \N val2 \N \N \N 1 1 1 \N val7 @@ -327,4 +342,5 @@ ANY RIGHT | join_use_nulls = 1 | copmosite key \N \N \N 2 1 2 \N val8 \N \N \N 2 1 \N \N val3 \N \N \N 2 2 \N \N val9 +{% endif -%} {% endfor -%} diff --git a/tests/queries/0_stateless/02273_full_sort_join.sql.j2 b/tests/queries/0_stateless/02273_full_sort_join.sql.j2 index 6500306356c..8b739330364 100644 --- a/tests/queries/0_stateless/02273_full_sort_join.sql.j2 +++ b/tests/queries/0_stateless/02273_full_sort_join.sql.j2 @@ -26,7 +26,9 @@ INSERT INTO t2 'val' || toString(number) as s FROM numbers_mt({{ table_size - 3 }}); + {% macro is_implemented(join_algorithm) -%} +{% if join_algorithm == 'grace_hash' %} -- { serverError NOT_IMPLEMENTED } {% endif %} {% endmacro -%} {% for join_algorithm in ['default', 'full_sorting_merge', 'grace_hash'] -%} @@ -36,7 +38,7 @@ SET max_bytes_in_join = '{% if join_algorithm == 'grace_hash' %}10K{% else %}0{% SELECT '-- {{ join_algorithm }} --'; SET join_algorithm = '{{ join_algorithm }}'; -{% for block_size in range(1, table_size + 1, 4) -%} +{% for block_size in range(1, table_size + 1) -%} {% for kind in ['ALL', 'ANY'] -%} SET max_block_size = {{ block_size }}; diff --git a/tests/queries/0_stateless/02274_full_sort_join_nodistinct.reference.j2 b/tests/queries/0_stateless/02274_full_sort_join_nodistinct.reference.j2 index df968e86e8d..2cc6c6e85d6 100644 --- a/tests/queries/0_stateless/02274_full_sort_join_nodistinct.reference.j2 +++ b/tests/queries/0_stateless/02274_full_sort_join_nodistinct.reference.j2 @@ -1,6 +1,6 @@ {% for join_algorithm in ['full_sorting_merge', 'grace_hash'] -%} --- {{ join_algorithm }} --- -{% for block_size in range(1, 11, 4) -%} +{% for block_size in range(1, 11) -%} t1 ALL INNER JOIN t2 | bs = {{ block_size }} 1 1 4 5 1 1 4 5 @@ -108,6 +108,7 @@ t1 ALL LEFT JOIN t2 | bs = {{ block_size }} 2 2 val27 5 3 3 val3 4 t1 ALL RIGHT JOIN t2 | bs = {{ block_size }} +{% if join_algorithm != 'grace_hash' -%} 1 1 4 val11 1 1 4 val12 2 2 5 val22 @@ -160,6 +161,7 @@ t1 ALL RIGHT JOIN t2 | bs = {{ block_size }} 2 2 5 val28 2 2 5 val28 3 3 4 val3 +{% endif -%} t1 ANY INNER JOIN t2 | bs = {{ block_size }} 1 1 4 5 2 2 5 5 @@ -175,6 +177,7 @@ t1 ANY LEFT JOIN t2 | bs = {{ block_size }} 2 2 val27 5 3 3 val3 4 t1 ANY RIGHT JOIN t2 | bs = {{ block_size }} +{% if join_algorithm != 'grace_hash' -%} 1 1 4 val11 1 1 4 val12 2 2 5 val22 @@ -185,7 +188,9 @@ t1 ANY RIGHT JOIN t2 | bs = {{ block_size }} 2 2 5 val27 2 2 5 val28 3 3 4 val3 +{% endif -%} t1 ALL FULL JOIN t2 | bs = {{ block_size }} +{% if join_algorithm != 'grace_hash' -%} 1 1 4 5 1 1 4 5 2 2 5 5 @@ -238,7 +243,9 @@ t1 ALL FULL JOIN t2 | bs = {{ block_size }} 2 2 5 5 2 2 5 5 3 3 4 4 +{% endif -%} t1 ALL FULL JOIN USING t2 | bs = {{ block_size }} +{% if join_algorithm != 'grace_hash' -%} 1 4 5 1 4 5 2 5 5 @@ -291,6 +298,7 @@ t1 ALL FULL JOIN USING t2 | bs = {{ block_size }} 2 5 5 2 5 5 3 4 4 +{% endif -%} t1 ALL INNER JOIN tn2 | bs = {{ block_size }} 1 1 4 5 1 1 4 5 @@ -307,6 +315,7 @@ t1 ALL LEFT JOIN tn2 | bs = {{ block_size }} 2 \N val27 0 3 3 val3 4 t1 ALL RIGHT JOIN tn2 | bs = {{ block_size }} +{% if join_algorithm != 'grace_hash' -%} 0 \N 0 val22 0 \N 0 val23 0 \N 0 val24 @@ -317,6 +326,7 @@ t1 ALL RIGHT JOIN tn2 | bs = {{ block_size }} 1 1 4 val11 1 1 4 val12 3 3 4 val3 +{% endif -%} t1 ANY INNER JOIN tn2 | bs = {{ block_size }} 1 1 4 5 3 3 4 4 @@ -331,6 +341,7 @@ t1 ANY LEFT JOIN tn2 | bs = {{ block_size }} 2 \N val27 0 3 3 val3 4 t1 ANY RIGHT JOIN tn2 | bs = {{ block_size }} +{% if join_algorithm != 'grace_hash' -%} 0 \N 0 val22 0 \N 0 val23 0 \N 0 val24 @@ -341,7 +352,9 @@ t1 ANY RIGHT JOIN tn2 | bs = {{ block_size }} 1 1 4 val11 1 1 4 val12 3 3 4 val3 +{% endif -%} t1 ALL FULL JOIN tn2 | bs = {{ block_size }} +{% if join_algorithm != 'grace_hash' -%} 0 \N 0 5 0 \N 0 5 0 \N 0 5 @@ -359,8 +372,9 @@ t1 ALL FULL JOIN tn2 | bs = {{ block_size }} 2 \N 5 0 2 \N 5 0 3 3 4 4 -{% if join_algorithm != 'grace_hash' -%} +{% endif -%} t1 ALL FULL JOIN USING tn2 | bs = {{ block_size }} +{% if join_algorithm != 'grace_hash' -%} 1 4 5 1 4 5 2 5 0 @@ -395,6 +409,7 @@ tn1 ALL LEFT JOIN t2 | bs = {{ block_size }} \N 0 val26 0 \N 0 val27 0 tn1 ALL RIGHT JOIN t2 | bs = {{ block_size }} +{% if join_algorithm != 'grace_hash' -%} 1 1 4 val11 1 1 4 val12 3 3 4 val3 @@ -405,6 +420,7 @@ tn1 ALL RIGHT JOIN t2 | bs = {{ block_size }} \N 2 0 val26 \N 2 0 val27 \N 2 0 val28 +{% endif -%} tn1 ANY INNER JOIN t2 | bs = {{ block_size }} 1 1 4 5 3 3 4 4 @@ -419,6 +435,7 @@ tn1 ANY LEFT JOIN t2 | bs = {{ block_size }} \N 0 val26 0 \N 0 val27 0 tn1 ANY RIGHT JOIN t2 | bs = {{ block_size }} +{% if join_algorithm != 'grace_hash' -%} 1 1 4 val11 1 1 4 val12 3 3 4 val3 @@ -429,7 +446,9 @@ tn1 ANY RIGHT JOIN t2 | bs = {{ block_size }} \N 2 0 val26 \N 2 0 val27 \N 2 0 val28 +{% endif -%} tn1 ALL FULL JOIN t2 | bs = {{ block_size }} +{% if join_algorithm != 'grace_hash' -%} 1 1 4 5 1 1 4 5 3 3 4 4 @@ -447,7 +466,9 @@ tn1 ALL FULL JOIN t2 | bs = {{ block_size }} \N 2 0 5 \N 2 0 5 \N 2 0 5 +{% endif -%} tn1 ALL FULL JOIN USING t2 | bs = {{ block_size }} +{% if join_algorithm != 'grace_hash' -%} 1 4 5 1 4 5 2 0 5 @@ -465,6 +486,7 @@ tn1 ALL FULL JOIN USING t2 | bs = {{ block_size }} \N 5 0 \N 5 0 \N 5 0 +{% endif -%} tn1 ALL INNER JOIN tn2 | bs = {{ block_size }} 1 1 4 5 1 1 4 5 @@ -481,6 +503,7 @@ tn1 ALL LEFT JOIN tn2 | bs = {{ block_size }} \N \N val26 0 \N \N val27 0 tn1 ALL RIGHT JOIN tn2 | bs = {{ block_size }} +{% if join_algorithm != 'grace_hash' -%} 1 1 4 val11 1 1 4 val12 3 3 4 val3 @@ -491,6 +514,7 @@ tn1 ALL RIGHT JOIN tn2 | bs = {{ block_size }} \N \N 0 val26 \N \N 0 val27 \N \N 0 val28 +{% endif -%} tn1 ANY INNER JOIN tn2 | bs = {{ block_size }} 1 1 4 5 3 3 4 4 @@ -505,6 +529,7 @@ tn1 ANY LEFT JOIN tn2 | bs = {{ block_size }} \N \N val26 0 \N \N val27 0 tn1 ANY RIGHT JOIN tn2 | bs = {{ block_size }} +{% if join_algorithm != 'grace_hash' -%} 1 1 4 val11 1 1 4 val12 3 3 4 val3 @@ -515,7 +540,9 @@ tn1 ANY RIGHT JOIN tn2 | bs = {{ block_size }} \N \N 0 val26 \N \N 0 val27 \N \N 0 val28 +{% endif -%} tn1 ALL FULL JOIN tn2 | bs = {{ block_size }} +{% if join_algorithm != 'grace_hash' -%} 1 1 4 5 1 1 4 5 3 3 4 4 @@ -533,8 +560,9 @@ tn1 ALL FULL JOIN tn2 | bs = {{ block_size }} \N \N 5 0 \N \N 5 0 \N \N 5 0 -{% if join_algorithm != 'grace_hash' -%} +{% endif -%} tn1 ALL FULL JOIN USING tn2 | bs = {{ block_size }} +{% if join_algorithm != 'grace_hash' -%} 1 4 5 1 4 5 3 4 4 diff --git a/tests/queries/0_stateless/02274_full_sort_join_nodistinct.sql.j2 b/tests/queries/0_stateless/02274_full_sort_join_nodistinct.sql.j2 index f8eb4b1a53e..613da65421e 100644 --- a/tests/queries/0_stateless/02274_full_sort_join_nodistinct.sql.j2 +++ b/tests/queries/0_stateless/02274_full_sort_join_nodistinct.sql.j2 @@ -16,6 +16,7 @@ INSERT INTO t2 VALUES (1, 'val11'), (1, 'val12'), (2, 'val22'), (2, 'val23'), (2 INSERT INTO tn2 VALUES (1, 'val11'), (1, 'val12'), (NULL, 'val22'), (NULL, 'val23'), (NULL, 'val24'), (NULL, 'val25'), (NULL, 'val26'), (NULL, 'val27'), (NULL, 'val28'), (3, 'val3'); {% macro is_implemented(join_algorithm) -%} +{% if join_algorithm == 'grace_hash' %} -- { serverError NOT_IMPLEMENTED } {% endif %} {% endmacro -%} {% for join_algorithm in ['full_sorting_merge', 'grace_hash'] -%} @@ -26,7 +27,7 @@ SET join_algorithm = '{{ join_algorithm }}'; SELECT '--- {{ join_algorithm }} ---'; -{% for block_size in range(1, 11, 4) -%} +{% for block_size in range(1, 11) -%} SET max_block_size = {{ block_size }}; {% for t1, t2 in [('t1', 't2'), ('t1', 'tn2'), ('tn1', 't2'), ('tn1', 'tn2')] -%} @@ -46,10 +47,9 @@ SELECT t1.key, t2.key, length(t1.s), t2.s FROM {{ t1 }} AS t1 {{ kind }} RIGHT J SELECT '{{ t1 }} ALL FULL JOIN {{ t2 }} | bs = {{ block_size }}'; SELECT t1.key, t2.key, length(t1.s), length(t2.s) FROM {{ t1 }} AS t1 {{ kind }} FULL JOIN {{ t2 }} AS t2 ON t1.key == t2.key ORDER BY t1.key, t2.key, length(t1.s), length(t2.s); {{ is_implemented(join_algorithm) }} -{% if join_algorithm == 'full_sorting_merge' or t2 != 'tn2' -%} SELECT '{{ t1 }} ALL FULL JOIN USING {{ t2 }} | bs = {{ block_size }}'; SELECT key, length(t1.s), length(t2.s) FROM {{ t1 }} AS t1 ALL FULL JOIN {{ t2 }} AS t2 USING (key) ORDER BY key, length(t1.s), length(t2.s); {{ is_implemented(join_algorithm) }} -{% endif -%} + {% endfor -%} {% endfor -%} SET max_bytes_in_join = 0; diff --git a/tests/queries/0_stateless/02275_full_sort_join_long.reference b/tests/queries/0_stateless/02275_full_sort_join_long.reference index 73482358d12..9ec06aea3e6 100644 --- a/tests/queries/0_stateless/02275_full_sort_join_long.reference +++ b/tests/queries/0_stateless/02275_full_sort_join_long.reference @@ -41,34 +41,16 @@ ALL INNER ALL LEFT 50195752660639 500353531835 10369589 10369589 1000342 ALL RIGHT -500353531835 684008812186 1367170 1000342 1367170 +skipped ALL INNER 500353531835 500353531835 1000342 1000342 1000342 ALL LEFT 50195752660639 500353531835 10369589 10369589 1000342 ALL RIGHT -500353531835 684008812186 1367170 1000342 1367170 +skipped ALL INNER 500353531835 500353531835 1000342 1000342 1000342 ALL LEFT 50195752660639 500353531835 10369589 10369589 1000342 ALL RIGHT -500353531835 684008812186 1367170 1000342 1367170 -ANY INNER -199622811843 199622811843 399458 399458 399458 -ANY LEFT -50010619420459 315220291655 10000000 10000000 630753 -ANY RIGHT -316611844056 500267124407 1000000 633172 1000000 -ANY INNER -199622811843 199622811843 399458 399458 399458 -ANY LEFT -50010619420459 315220291655 10000000 10000000 630753 -ANY RIGHT -316611844056 500267124407 1000000 633172 1000000 -ANY INNER -199622811843 199622811843 399458 399458 399458 -ANY LEFT -50010619420459 315220291655 10000000 10000000 630753 -ANY RIGHT -316611844056 500267124407 1000000 633172 1000000 +skipped diff --git a/tests/queries/0_stateless/02275_full_sort_join_long.sql.j2 b/tests/queries/0_stateless/02275_full_sort_join_long.sql.j2 index 0b28fd67050..98cc46c9cb4 100644 --- a/tests/queries/0_stateless/02275_full_sort_join_long.sql.j2 +++ b/tests/queries/0_stateless/02275_full_sort_join_long.sql.j2 @@ -22,6 +22,11 @@ INSERT INTO t2 FROM numbers_mt({{ rtable_size }}) ; +{% macro is_implemented(join_algorithm) -%} +{% if join_algorithm == 'grace_hash' %} -- { serverError NOT_IMPLEMENTED } +SELECT 'skipped'; +{% endif -%} +{% endmacro -%} {% for join_algorithm in ['full_sorting_merge', 'grace_hash'] -%} @@ -35,6 +40,7 @@ SET join_algorithm = '{{ join_algorithm }}'; SET max_block_size = {{ block_size }}; +{% if not (kind == 'ANY' and join_algorithm == 'grace_hash') -%} SELECT '{{ kind }} INNER'; SELECT sum(t1.key), sum(t2.key), count(), countIf(t1.key != 0), countIf(t2.key != 0) FROM t1 @@ -52,8 +58,9 @@ SELECT '{{ kind }} RIGHT'; SELECT sum(t1.key), sum(t2.key), count(), countIf(t1.key != 0), countIf(t2.key != 0) FROM t1 {{ kind }} RIGHT JOIN t2 ON t1.key == t2.key -; +; {{ is_implemented(join_algorithm) }} +{% endif -%} {% endfor -%} {% endfor -%} From 44c68ffdab34642192f16fb1f9ecb9bf96bdd73b Mon Sep 17 00:00:00 2001 From: vdimir Date: Thu, 8 Jun 2023 10:17:02 +0000 Subject: [PATCH 0647/1072] Add config param allow_remove_stale_moving_parts --- src/Storages/MergeTree/MergeTreeData.cpp | 15 +++++++++++++-- src/Storages/MergeTree/MergeTreeData.h | 3 +++ src/Storages/MergeTree/MergeTreePartsMover.cpp | 7 ++++--- .../configs/config.d/storage_conf.xml | 2 ++ .../configs/remote_servers.xml | 1 + .../test_encrypted_disk/configs/storage.xml | 1 + .../configs/config.xml | 1 + .../configs/config.d/storage_conf.xml | 1 + .../test_merge_tree_s3/configs/config.xml | 1 + .../configs/config.d/storage_configuration.xml | 1 + .../configs/config.d/storage_configuration.xml | 1 + .../configs/config.d/storage_configuration.xml | 1 + .../configs/config.d/storage_conf.xml | 1 + .../configs/config.d/s3.xml | 1 + .../configs/config.d/storage_configuration.xml | 1 + .../test_zero_copy_fetch/configs/storage_conf.xml | 1 + 16 files changed, 34 insertions(+), 5 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 7fe3efaf6d5..9ce4e55e341 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -1998,14 +1999,19 @@ static bool isOldPartDirectory(const DiskPtr & disk, const String & directory_pa return true; } + size_t MergeTreeData::clearOldTemporaryDirectories(size_t custom_directories_lifetime_seconds, const NameSet & valid_prefixes) { size_t cleared_count = 0; cleared_count += clearOldTemporaryDirectories(relative_data_path, custom_directories_lifetime_seconds, valid_prefixes); - /// Clear _all_ parts from the `moving` directory - cleared_count += clearOldTemporaryDirectories(fs::path(relative_data_path) / "moving", custom_directories_lifetime_seconds, {""}); + if (allowRemoveStaleMovingParts()) + { + /// Clear _all_ parts from the `moving` directory + cleared_count += clearOldTemporaryDirectories(fs::path(relative_data_path) / "moving", custom_directories_lifetime_seconds, {""}); + } + return cleared_count; } @@ -8412,6 +8418,11 @@ MergeTreeData::MutableDataPartPtr MergeTreeData::createEmptyPart( return new_data_part; } +bool MergeTreeData::allowRemoveStaleMovingParts() const +{ + return ConfigHelper::getBool(getContext()->getConfigRef(), "allow_remove_stale_moving_parts"); +} + CurrentlySubmergingEmergingTagger::~CurrentlySubmergingEmergingTagger() { std::lock_guard lock(storage.currently_submerging_emerging_mutex); diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index 444bd8f47ac..8755ffebd7e 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -1060,6 +1060,9 @@ public: void waitForOutdatedPartsToBeLoaded() const; bool canUsePolymorphicParts() const; + /// TODO: make enabled by default in the next release if no problems found. + bool allowRemoveStaleMovingParts() const; + protected: friend class IMergeTreeDataPart; friend class MergeTreeDataMergerMutator; diff --git a/src/Storages/MergeTree/MergeTreePartsMover.cpp b/src/Storages/MergeTree/MergeTreePartsMover.cpp index 656167de986..8fa4ac6c78a 100644 --- a/src/Storages/MergeTree/MergeTreePartsMover.cpp +++ b/src/Storages/MergeTree/MergeTreePartsMover.cpp @@ -250,7 +250,7 @@ MergeTreePartsMover::TemporaryClonedPart MergeTreePartsMover::clonePart(const Me cloned_part.part = std::move(builder).withPartFormatFromDisk().build(); LOG_TRACE(log, "Part {} was cloned to {}", part->name, cloned_part.part->getDataPartStorage().getFullPath()); - cloned_part.part->is_temp = true; + cloned_part.part->is_temp = data->allowRemoveStaleMovingParts(); cloned_part.part->loadColumnsChecksumsIndexes(true, true); cloned_part.part->loadVersionMetadata(); cloned_part.part->modification_time = cloned_part.part->getDataPartStorage().getLastModified().epochTime(); @@ -270,10 +270,11 @@ void MergeTreePartsMover::swapClonedPart(TemporaryClonedPart & cloned_part) cons { LOG_INFO(log, "Failed to swap {}. Active part doesn't exist (containing part {}). " - "Possible it was merged or mutated. Will remove copy on path '{}'", + "Possible it was merged or mutated. Part on path '{}' {}", cloned_part.part->name, active_part ? active_part->name : "doesn't exist", - cloned_part.part->getDataPartStorage().getFullPath()); + cloned_part.part->getDataPartStorage().getFullPath(), + data->allowRemoveStaleMovingParts() ? "will be removed" : "will remain intact (set in config.xml, exercise caution when using)"); return; } diff --git a/tests/integration/test_alter_moving_garbage/configs/config.d/storage_conf.xml b/tests/integration/test_alter_moving_garbage/configs/config.d/storage_conf.xml index f6898ed1d7e..1450a459257 100644 --- a/tests/integration/test_alter_moving_garbage/configs/config.d/storage_conf.xml +++ b/tests/integration/test_alter_moving_garbage/configs/config.d/storage_conf.xml @@ -21,4 +21,6 @@ + + true diff --git a/tests/integration/test_consistant_parts_after_move_partition/configs/remote_servers.xml b/tests/integration/test_consistant_parts_after_move_partition/configs/remote_servers.xml index 3d4ccd584b1..f22b7dc4032 100644 --- a/tests/integration/test_consistant_parts_after_move_partition/configs/remote_servers.xml +++ b/tests/integration/test_consistant_parts_after_move_partition/configs/remote_servers.xml @@ -16,4 +16,5 @@ + true diff --git a/tests/integration/test_encrypted_disk/configs/storage.xml b/tests/integration/test_encrypted_disk/configs/storage.xml index 1e48c80d50f..2b84e0d6daa 100644 --- a/tests/integration/test_encrypted_disk/configs/storage.xml +++ b/tests/integration/test_encrypted_disk/configs/storage.xml @@ -105,4 +105,5 @@ + true diff --git a/tests/integration/test_merge_tree_azure_blob_storage/configs/config.xml b/tests/integration/test_merge_tree_azure_blob_storage/configs/config.xml index feb537ebbce..a6e0d26f695 100644 --- a/tests/integration/test_merge_tree_azure_blob_storage/configs/config.xml +++ b/tests/integration/test_merge_tree_azure_blob_storage/configs/config.xml @@ -15,4 +15,5 @@ 500 ./clickhouse/ users.xml + true diff --git a/tests/integration/test_merge_tree_hdfs/configs/config.d/storage_conf.xml b/tests/integration/test_merge_tree_hdfs/configs/config.d/storage_conf.xml index 890c396ed95..858d77e9ea0 100644 --- a/tests/integration/test_merge_tree_hdfs/configs/config.d/storage_conf.xml +++ b/tests/integration/test_merge_tree_hdfs/configs/config.d/storage_conf.xml @@ -29,4 +29,5 @@ 0 + true diff --git a/tests/integration/test_merge_tree_s3/configs/config.xml b/tests/integration/test_merge_tree_s3/configs/config.xml index 314f23f5788..a25da96215e 100644 --- a/tests/integration/test_merge_tree_s3/configs/config.xml +++ b/tests/integration/test_merge_tree_s3/configs/config.xml @@ -8,4 +8,5 @@ true + true diff --git a/tests/integration/test_move_partition_to_disk_on_cluster/configs/config.d/storage_configuration.xml b/tests/integration/test_move_partition_to_disk_on_cluster/configs/config.d/storage_configuration.xml index 3289186c175..cd2f0867c61 100644 --- a/tests/integration/test_move_partition_to_disk_on_cluster/configs/config.d/storage_configuration.xml +++ b/tests/integration/test_move_partition_to_disk_on_cluster/configs/config.d/storage_configuration.xml @@ -24,5 +24,6 @@ +true diff --git a/tests/integration/test_multiple_disks/configs/config.d/storage_configuration.xml b/tests/integration/test_multiple_disks/configs/config.d/storage_configuration.xml index e7a87fb77b1..033699f4634 100644 --- a/tests/integration/test_multiple_disks/configs/config.d/storage_configuration.xml +++ b/tests/integration/test_multiple_disks/configs/config.d/storage_configuration.xml @@ -122,6 +122,7 @@ +true 1 diff --git a/tests/integration/test_rename_column/configs/config.d/storage_configuration.xml b/tests/integration/test_rename_column/configs/config.d/storage_configuration.xml index 12a598c64b5..65cac905e9a 100644 --- a/tests/integration/test_rename_column/configs/config.d/storage_configuration.xml +++ b/tests/integration/test_rename_column/configs/config.d/storage_configuration.xml @@ -29,5 +29,6 @@ 0 1 +true diff --git a/tests/integration/test_replicated_merge_tree_hdfs_zero_copy/configs/config.d/storage_conf.xml b/tests/integration/test_replicated_merge_tree_hdfs_zero_copy/configs/config.d/storage_conf.xml index cb444c728c9..bb4aba94e0b 100644 --- a/tests/integration/test_replicated_merge_tree_hdfs_zero_copy/configs/config.d/storage_conf.xml +++ b/tests/integration/test_replicated_merge_tree_hdfs_zero_copy/configs/config.d/storage_conf.xml @@ -89,4 +89,5 @@ test_cluster 1
+ true diff --git a/tests/integration/test_s3_zero_copy_replication/configs/config.d/s3.xml b/tests/integration/test_s3_zero_copy_replication/configs/config.d/s3.xml index f7d9efc2cae..63162c3c19b 100644 --- a/tests/integration/test_s3_zero_copy_replication/configs/config.d/s3.xml +++ b/tests/integration/test_s3_zero_copy_replication/configs/config.d/s3.xml @@ -93,4 +93,5 @@ test_cluster
+ true diff --git a/tests/integration/test_ttl_move/configs/config.d/storage_configuration.xml b/tests/integration/test_ttl_move/configs/config.d/storage_configuration.xml index ae1dc9dd038..09e6fc99411 100644 --- a/tests/integration/test_ttl_move/configs/config.d/storage_configuration.xml +++ b/tests/integration/test_ttl_move/configs/config.d/storage_configuration.xml @@ -107,4 +107,5 @@ +true diff --git a/tests/integration/test_zero_copy_fetch/configs/storage_conf.xml b/tests/integration/test_zero_copy_fetch/configs/storage_conf.xml index b3ce0735a3c..9e9dab6a972 100644 --- a/tests/integration/test_zero_copy_fetch/configs/storage_conf.xml +++ b/tests/integration/test_zero_copy_fetch/configs/storage_conf.xml @@ -38,4 +38,5 @@ true + true From ac638615aeace304b6a196e68561d73a03e86344 Mon Sep 17 00:00:00 2001 From: vdimir Date: Thu, 8 Jun 2023 10:29:01 +0000 Subject: [PATCH 0648/1072] Upd test_alter_moving_garbage --- .../test_alter_moving_garbage/test.py | 39 ++++++++++++------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/tests/integration/test_alter_moving_garbage/test.py b/tests/integration/test_alter_moving_garbage/test.py index dc3f6c35ead..330df3ac490 100644 --- a/tests/integration/test_alter_moving_garbage/test.py +++ b/tests/integration/test_alter_moving_garbage/test.py @@ -11,6 +11,7 @@ from helpers.cluster import ClickHouseCluster # two replicas in remote_servers.xml REPLICA_COUNT = 2 + @pytest.fixture(scope="module") def cluster(): try: @@ -82,7 +83,7 @@ def test_create_table( additional_settings = {} - # different names for logs readability + # Different names for logs readability table_name = "test_table" if allow_remote_fs_zero_copy_replication: table_name = "test_table_zero_copy" @@ -99,17 +100,7 @@ def test_create_table( f"INSERT INTO {table_name} SELECT toDate('{partition}'), number as id, toString(sipHash64(number, {i})) FROM numbers(10_000)" ) - def check_count(): - if replicated_engine: - return random.choice(nodes).query_with_retry( - f"SELECT countDistinct(dt, data) FROM clusterAllReplicas(test_cluster, default.{table_name}) WHERE id % 100 = 0" - ) - else: - return random.choice(nodes).query( - f"SELECT countDistinct(dt, data) FROM {table_name} WHERE id % 100 = 0" - ) - - assert check_count() == "1000\n" + # Run ALTER in parallel with moving parts stop_alter = False @@ -118,9 +109,14 @@ def test_create_table( for d in range(1, 100): if stop_alter: break - # I managed to reproduce issue with DELETE, but it can be any other lightweight mutation - # Do not delete rows with id % 100 = 0, because they are used in check_count to check that data is not corrupted + + # Some lightweight mutation should change moving part before it is swapped, then we will have to cleanup it. + # Messages `Failed to swap {}. Active part doesn't exist` should appear in logs. + # + # I managed to reproduce issue with DELETE (`ALTER TABLE {table_name} ADD/DROP COLUMN` also works on real s3 instead of minio) + # Note: do not delete rows with id % 100 = 0, because they are used in `check_count` to use them in check that data is not corrupted random.choice(nodes).query(f"DELETE FROM {table_name} WHERE id % 100 = {d}") + time.sleep(0.1) alter_thread = threading.Thread(target=alter) @@ -143,4 +139,17 @@ def test_create_table( stop_alter = True alter_thread.join() - assert check_count() == "1000\n" + # Check that no data was lost + + data_digest = None + if replicated_engine: + # We don't know what data was replicated, so we need to check all replicas and take unique values + data_digest = random.choice(nodes).query_with_retry( + f"SELECT countDistinct(dt, data) FROM clusterAllReplicas(test_cluster, default.{table_name}) WHERE id % 100 == 0" + ) + else: + data_digest = random.choice(nodes).query( + f"SELECT countDistinct(dt, data) FROM {table_name} WHERE id % 100 == 0" + ) + + assert data_digest == "1000\n" From 32d1acb3a57b0c1e942c64b7c434701f37fa1910 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 8 Jun 2023 12:29:26 +0200 Subject: [PATCH 0649/1072] Add changelog for 23.5 --- CHANGELOG.md | 253 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 253 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1ccd4f9846d..f2ffdad9a7c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,5 @@ ### Table of Contents +**[ClickHouse release v23.5, 2023-06-08](#235)**
**[ClickHouse release v23.4, 2023-04-26](#234)**
**[ClickHouse release v23.3 LTS, 2023-03-30](#233)**
**[ClickHouse release v23.2, 2023-02-23](#232)**
@@ -7,6 +8,258 @@ # 2023 Changelog +### ClickHouse release 23.5, 2023-06-08 + +#### Upgrade Notes +* Compress marks and primary key by default. It significantly reduces the cold query time. Upgrade notes: the support for compressed marks and primary key has been added in version 22.9. If you turned on compressed marks or primary key or installed version 23.5 or newer, which has compressed marks or primary key on by default, you will not be able to downgrade to version 22.8 or earlier. You can also explicitly disable compressed marks or primary keys by specifying the `compress_marks` and `compress_primary_key` settings in the `` section of the server configuration file. **Upgrade notes:** If you upgrade from versions prior to 22.9, you should either upgrade all replicas at once or disable the compression before upgrade, or upgrade through an intermediate version, where the compressed marks are supported but not enabled by default, such as 23.3. [#42587](https://github.com/ClickHouse/ClickHouse/pull/42587) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Make local object storage work consistently with s3 object storage, fix problem with append (closes [#48465](https://github.com/ClickHouse/ClickHouse/issues/48465)), make it configurable as independent storage. The change is backward incompatible because the cache on top of local object storage is not incompatible to previous versions. [#48791](https://github.com/ClickHouse/ClickHouse/pull/48791) ([Kseniia Sumarokova](https://github.com/kssenii)). +* The experimental feature "in-memory data parts" is removed. The data format is still supported, but the settings are no-op, and compact or wide parts will be used instead. This closes [#45409](https://github.com/ClickHouse/ClickHouse/issues/45409). [#49429](https://github.com/ClickHouse/ClickHouse/pull/49429) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Changed default values of settings `parallelize_output_from_storages` and `input_format_parquet_preserve_order`. This allows ClickHouse to reorder rows when reading from files (e.g. CSV or Parquet), greatly improving performance in many cases. To restore the old behavior of preserving order, use `parallelize_output_from_storages = 0`, `input_format_parquet_preserve_order = 1`. [#49479](https://github.com/ClickHouse/ClickHouse/pull/49479) ([Michael Kolupaev](https://github.com/al13n321)). +* Make projections production-ready. Add the `optimize_use_projections` setting to control whether the projections will be selected for SELECT queries. The setting `allow_experimental_projection_optimization` is obsolete and does nothing. [#49719](https://github.com/ClickHouse/ClickHouse/pull/49719) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Mark `joinGet` as non deterministic (so as `dictGet`). It allows using them in mutations without an extra setting. [#49843](https://github.com/ClickHouse/ClickHouse/pull/49843) ([Azat Khuzhin](https://github.com/azat)). +* Revert the "`groupArray` returns cannot be nullable" change (due to binary compatibility breakage for `groupArray`/`groupArrayLast`/`groupArraySample` over `Nullable` types, which likely will lead to `TOO_LARGE_ARRAY_SIZE` or `CANNOT_READ_ALL_DATA`). [#49971](https://github.com/ClickHouse/ClickHouse/pull/49971) ([Azat Khuzhin](https://github.com/azat)). +* Setting `enable_memory_bound_merging_of_aggregation_results` is enabled by default. If you update from version prior to 22.12, we recommend to set this flag to `false` until update is finished. [#50319](https://github.com/ClickHouse/ClickHouse/pull/50319) ([Nikita Taranov](https://github.com/nickitat)). + +#### New Feature +* Added native ClickHouse Keeper CLI Client, it is available as `clickhouse keeper-client` [#47414](https://github.com/ClickHouse/ClickHouse/pull/47414) ([pufit](https://github.com/pufit)). +* Add `urlCluster` table function. Refactor all *Cluster table functions to reduce code duplication. Make schema inference work for all possible *Cluster function signatures and for named collections. Closes [#38499](https://github.com/ClickHouse/ClickHouse/issues/38499). [#45427](https://github.com/ClickHouse/ClickHouse/pull/45427) ([attack204](https://github.com/attack204)), Pavel Kruglov. +* The query cache can now be used for production workloads. [#47977](https://github.com/ClickHouse/ClickHouse/pull/47977) ([Robert Schulze](https://github.com/rschu1ze)). The query cache can now support queries with totals and extremes modifier. [#48853](https://github.com/ClickHouse/ClickHouse/pull/48853) ([Robert Schulze](https://github.com/rschu1ze)). Make `allow_experimental_query_cache` setting as obsolete for backward-compatibility. It was removed in https://github.com/ClickHouse/ClickHouse/pull/47977. [#49934](https://github.com/ClickHouse/ClickHouse/pull/49934) ([Timur Solodovnikov](https://github.com/tsolodov)). +* Geographical data types (`Point`, `Ring`, `Polygon`, and `MultiPolygon`) are production-ready. [#50022](https://github.com/ClickHouse/ClickHouse/pull/50022) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Add schema inference to PostgreSQL, MySQL, MeiliSearch, and SQLite table engines. Closes [#49972](https://github.com/ClickHouse/ClickHouse/issues/49972). [#50000](https://github.com/ClickHouse/ClickHouse/pull/50000) ([Nikolay Degterinsky](https://github.com/evillique)). +* Password type in queries like `CREATE USER u IDENTIFIED BY 'p'` will be automatically set according to the setting `default_password_type` in the `config.xml` on the server. Closes [#42915](https://github.com/ClickHouse/ClickHouse/issues/42915). [#44674](https://github.com/ClickHouse/ClickHouse/pull/44674) ([Nikolay Degterinsky](https://github.com/evillique)). +* Add bcrypt password authentication type. Closes [#34599](https://github.com/ClickHouse/ClickHouse/issues/34599). [#44905](https://github.com/ClickHouse/ClickHouse/pull/44905) ([Nikolay Degterinsky](https://github.com/evillique)). +* Introduces new keyword `INTO OUTFILE 'file.txt' APPEND`. [#48880](https://github.com/ClickHouse/ClickHouse/pull/48880) ([alekar](https://github.com/alekar)). +* Added `system.zookeeper_connection` table that shows information about Keeper connections. [#45245](https://github.com/ClickHouse/ClickHouse/pull/45245) ([mateng915](https://github.com/mateng0915)). +* Add new function `generateRandomStructure` that generates random table structure. It can be used in combination with table function `generateRandom`. [#47409](https://github.com/ClickHouse/ClickHouse/pull/47409) ([Kruglov Pavel](https://github.com/Avogar)). +* Allow the use of `CASE` without an `ELSE` branch and extended `transform` to deal with more types. Also fix some issues that made transform() return incorrect results when decimal types were mixed with other numeric types. [#48300](https://github.com/ClickHouse/ClickHouse/pull/48300) ([Salvatore Mesoraca](https://github.com/aiven-sal)). +* Added [server-side encryption using KMS keys](https://docs.aws.amazon.com/AmazonS3/latest/userguide/UsingKMSEncryption.html) with S3 tables, and the `header` setting with S3 disks. Closes [#48723](https://github.com/ClickHouse/ClickHouse/issues/48723). [#48724](https://github.com/ClickHouse/ClickHouse/pull/48724) ([Johann Gan](https://github.com/johanngan)). +* Add MemoryTracker for the background tasks (merges and mutation). Introduces `merges_mutations_memory_usage_soft_limit` and `merges_mutations_memory_usage_to_ram_ratio` settings that represent the soft memory limit for merges and mutations. If this limit is reached ClickHouse won't schedule new merge or mutation tasks. Also `MergesMutationsMemoryTracking` metric is introduced to allow observing current memory usage of background tasks. Resubmit [#46089](https://github.com/ClickHouse/ClickHouse/issues/46089). Closes [#48774](https://github.com/ClickHouse/ClickHouse/issues/48774). [#48787](https://github.com/ClickHouse/ClickHouse/pull/48787) ([Dmitry Novik](https://github.com/novikd)). +* Function `dotProduct` work for array. [#49050](https://github.com/ClickHouse/ClickHouse/pull/49050) ([FFFFFFFHHHHHHH](https://github.com/FFFFFFFHHHHHHH)). +* Support statement `SHOW INDEX` to improve compatibility with MySQL. [#49158](https://github.com/ClickHouse/ClickHouse/pull/49158) ([Robert Schulze](https://github.com/rschu1ze)). +* Add virtual column `_file` and `_path` support to table function `url`. - Impove error message for table function `url`. - resolves [#49231](https://github.com/ClickHouse/ClickHouse/issues/49231) - resolves [#49232](https://github.com/ClickHouse/ClickHouse/issues/49232). [#49356](https://github.com/ClickHouse/ClickHouse/pull/49356) ([Ziyi Tan](https://github.com/Ziy1-Tan)). +* Adding the `grants` field in the users.xml file, which allows specifying grants for users. [#49381](https://github.com/ClickHouse/ClickHouse/pull/49381) ([pufit](https://github.com/pufit)). +* Support full/right join by using grace hash join algorithm. [#49483](https://github.com/ClickHouse/ClickHouse/pull/49483) ([lgbo](https://github.com/lgbo-ustc)). +* `WITH FILL` modifier groups filling by sorting prefix. Controlled by `use_with_fill_by_sorting_prefix` setting (enabled by default). Related to [#33203](https://github.com/ClickHouse/ClickHouse/issues/33203)#issuecomment-1418736794. [#49503](https://github.com/ClickHouse/ClickHouse/pull/49503) ([Igor Nikonov](https://github.com/devcrafter)). +* Clickhouse-client now accepts queries after "--multiquery" when "--query" (or "-q") is absent. example: clickhouse-client --multiquery "select 1; select 2;". [#49870](https://github.com/ClickHouse/ClickHouse/pull/49870) ([Alexey Gerasimchuk](https://github.com/Demilivor)). +* Add separate `handshake_timeout` for receiving Hello packet from replica. Closes [#48854](https://github.com/ClickHouse/ClickHouse/issues/48854). [#49948](https://github.com/ClickHouse/ClickHouse/pull/49948) ([Kruglov Pavel](https://github.com/Avogar)). +* Added a function "space" which repeats a space as many times as specified. [#50103](https://github.com/ClickHouse/ClickHouse/pull/50103) ([Robert Schulze](https://github.com/rschu1ze)). +* Added --input_format_csv_trim_whitespaces option. [#50215](https://github.com/ClickHouse/ClickHouse/pull/50215) ([Alexey Gerasimchuk](https://github.com/Demilivor)). +* Allow the `dictGetAll` function for regexp tree dictionaries to return values from multiple matches as arrays. Closes [#50254](https://github.com/ClickHouse/ClickHouse/issues/50254). [#50255](https://github.com/ClickHouse/ClickHouse/pull/50255) ([Johann Gan](https://github.com/johanngan)). +* Added `toLastDayOfWeek` function to round a date or a date with time up to the nearest Saturday or Sunday. [#50315](https://github.com/ClickHouse/ClickHouse/pull/50315) ([Victor Krasnov](https://github.com/sirvickr)). +* Ability to ignore a skip index by specifying `ignore_data_skipping_indices`. [#50329](https://github.com/ClickHouse/ClickHouse/pull/50329) ([Boris Kuschel](https://github.com/bkuschel)). +* Add `system.user_processes` table and `SHOW USER PROCESSES` query to show memory info and ProfileEvents on user level. [#50492](https://github.com/ClickHouse/ClickHouse/pull/50492) ([János Benjamin Antal](https://github.com/antaljanosbenjamin)). +* Add server and format settings `display_secrets_in_show_and_select` for displaying secrets of tables, databases, table functions, and dictionaries. Add privilege `displaySecretsInShowAndSelect` controlling which users can view secrets. [#46528](https://github.com/ClickHouse/ClickHouse/pull/46528) ([Mike Kot](https://github.com/myrrc)). +* Allow to set up a ROW POLICY for all tables that belong to a DATABASE. [#47640](https://github.com/ClickHouse/ClickHouse/pull/47640) ([Ilya Golshtein](https://github.com/ilejn)). + +#### Performance Improvement +* Compress marks and primary key by default. It significantly reduces the cold query time. Upgrade notes: the support for compressed marks and primary key has been added in version 22.9. If you turned on compressed marks or primary key or installed version 23.5 or newer, which has compressed marks or primary key on by default, you will not be able to downgrade to version 22.8 or earlier. You can also explicitly disable compressed marks or primary keys by specifying the `compress_marks` and `compress_primary_key` settings in the `` section of the server configuration file. [#42587](https://github.com/ClickHouse/ClickHouse/pull/42587) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* New setting s3_max_inflight_parts_for_one_file sets the limit of concurrently loaded parts with multipart upload request in scope of one file. [#49961](https://github.com/ClickHouse/ClickHouse/pull/49961) ([Sema Checherinda](https://github.com/CheSema)). +* When reading from multiple files reduce parallel parsing threads for each file. Resolves [#42192](https://github.com/ClickHouse/ClickHouse/issues/42192). [#46661](https://github.com/ClickHouse/ClickHouse/pull/46661) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). +* Use aggregate projection only if it reads fewer granules than normal reading. It should help in case if query hits the PK of the table, but not the projection. Fixes [#49150](https://github.com/ClickHouse/ClickHouse/issues/49150). [#49417](https://github.com/ClickHouse/ClickHouse/pull/49417) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Do not store blocks in `ANY` hash join if nothing is inserted. [#48633](https://github.com/ClickHouse/ClickHouse/pull/48633) ([vdimir](https://github.com/vdimir)). +* Fixes aggregate combinator `-If` when JIT compiled, and enable JIT compilation for aggregate functions. Closes [#48120](https://github.com/ClickHouse/ClickHouse/issues/48120). [#49083](https://github.com/ClickHouse/ClickHouse/pull/49083) ([Igor Nikonov](https://github.com/devcrafter)). +* For reading from remote tables we use smaller tasks (instead of reading the whole part) to make tasks stealing work * task size is determined by size of columns to read * always use 1mb buffers for reading from s3 * boundaries of cache segments aligned to 1mb so they have decent size even with small tasks. it also should prevent fragmentation. [#49287](https://github.com/ClickHouse/ClickHouse/pull/49287) ([Nikita Taranov](https://github.com/nickitat)). +* Introduced settings: - `merge_max_block_size_bytes` to limit the amount of memory used for background operations. - `vertical_merge_algorithm_min_bytes_to_activate` to add another condition to activate vertical merges. [#49313](https://github.com/ClickHouse/ClickHouse/pull/49313) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). +* Default size of a read buffer for reading from local filesystem changed to a slightly better value. Also two new settings are introduced: `max_read_buffer_size_local_fs` and `max_read_buffer_size_remote_fs`. [#49321](https://github.com/ClickHouse/ClickHouse/pull/49321) ([Nikita Taranov](https://github.com/nickitat)). +* Improve memory usage and speed of `SPARSE_HASHED`/`HASHED` dictionaries (e.g. `SPARSE_HASHED` now eats 2.6x less memory, and is ~2x faster). [#49380](https://github.com/ClickHouse/ClickHouse/pull/49380) ([Azat Khuzhin](https://github.com/azat)). +* Optimize the `system.query_log` and `system.query_thread_log` tables by applying `LowCardinality` when appropriate. The queries over these tables will be faster. [#49530](https://github.com/ClickHouse/ClickHouse/pull/49530) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Better performance when reading local `Parquet` files (through parallel reading). [#49539](https://github.com/ClickHouse/ClickHouse/pull/49539) ([Michael Kolupaev](https://github.com/al13n321)). +* Improve the performance of `RIGHT/FULL JOIN` by up to 2 times in certain scenarios, especially when joining a small left table with a large right table. [#49585](https://github.com/ClickHouse/ClickHouse/pull/49585) ([lgbo](https://github.com/lgbo-ustc)). +* Improve performance of BLAKE3 by 11% by enabling LTO for Rust. [#49600](https://github.com/ClickHouse/ClickHouse/pull/49600) ([Azat Khuzhin](https://github.com/azat)). Now it is on par with C++. +* Optimize the structure of the `system.opentelemetry_span_log`. Use `LowCardinality` where appropriate. Although this table is generally stupid (it is using the Map data type even for common attributes), it will be slightly better. [#49647](https://github.com/ClickHouse/ClickHouse/pull/49647) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Try to reserve hash table's size in `grace_hash` join. [#49816](https://github.com/ClickHouse/ClickHouse/pull/49816) ([lgbo](https://github.com/lgbo-ustc)). +* As is addresed in issue [#49748](https://github.com/ClickHouse/ClickHouse/issues/49748), the predicates with date converters, such as `toYear`, `toYYYYMM`, could be rewritten with the equivalent date (YYYY-MM-DD) comparisons at the AST level. And this transformation could bring performance improvement as it is free from the expensive date converter and the comparison between dates (or integers in the low level representation) is quite low-cost. The [prototype](https://github.com/ZhiguoZh/ClickHouse/commit/c7f1753f0c9363a19d95fa46f1cfed1d9f505ee0) shows that, with all identified date converters optimized, the overall QPS of the 13 queries is enhanced by **~11%** on the ICX server (Intel Xeon Platinum 8380 CPU, 80 cores, 160 threads). [#50062](https://github.com/ClickHouse/ClickHouse/pull/50062) [#50307](https://github.com/ClickHouse/ClickHouse/pull/50307) ([Zhiguo Zhou](https://github.com/ZhiguoZh)). +* Parallel merge of `uniqExactIf` states. Closes [#49885](https://github.com/ClickHouse/ClickHouse/issues/49885). [#50285](https://github.com/ClickHouse/ClickHouse/pull/50285) ([flynn](https://github.com/ucasfl)). +* Keeper improvement: add `CheckNotExists` request to Keeper, which allows to improve the performance of Replicated tables. [#48897](https://github.com/ClickHouse/ClickHouse/pull/48897) ([Antonio Andelic](https://github.com/antonio2368)). +* Keeper performance improvements: avoid serializing same request twice while processing. Cache deserialization results of large requests. Controlled by new coordination setting `min_request_size_for_cache`. [#49004](https://github.com/ClickHouse/ClickHouse/pull/49004) ([Antonio Andelic](https://github.com/antonio2368)). +* Reduced number of `List` ZooKeeper requests when selecting parts to merge and a lot of partitions do not have anything to merge. [#49637](https://github.com/ClickHouse/ClickHouse/pull/49637) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Rework locking in the FS cache [#44985](https://github.com/ClickHouse/ClickHouse/pull/44985) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Disable pure parallel replicas if trivial count optimization is possible. [#50594](https://github.com/ClickHouse/ClickHouse/pull/50594) ([Raúl Marín](https://github.com/Algunenano)). +* Don't send head request for all keys in Iceberg schema inference, only for keys that are used for reaing data. [#50203](https://github.com/ClickHouse/ClickHouse/pull/50203) ([Kruglov Pavel](https://github.com/Avogar)). +* Setting `enable_memory_bound_merging_of_aggregation_results` is enabled by default. [#50319](https://github.com/ClickHouse/ClickHouse/pull/50319) ([Nikita Taranov](https://github.com/nickitat)). + +#### Experimental Feature +* `DEFLATE_QPL` codec lower the minimum simd version to SSE 4.2. [doc change in qpl](https://github.com/intel/qpl/commit/3f8f5cea27739f5261e8fd577dc233ffe88bf679) - Intel® QPL relies on a run-time kernels dispatcher and cpuid check to choose the best available implementation(sse/avx2/avx512) - restructured cmakefile for qpl build in clickhouse to align with latest upstream qpl. [#49811](https://github.com/ClickHouse/ClickHouse/pull/49811) ([jasperzhu](https://github.com/jinjunzh)). +* Add initial support to do JOINs with pure parallel replicas. [#49544](https://github.com/ClickHouse/ClickHouse/pull/49544) ([Raúl Marín](https://github.com/Algunenano)). +* More parallelism on `Outdated` parts removal with "zero-copy replication". [#49630](https://github.com/ClickHouse/ClickHouse/pull/49630) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Parallel Replicas: 1) Fixed an error `NOT_FOUND_COLUMN_IN_BLOCK` in case of using parallel replicas with non-replicated storage with disabled setting `parallel_replicas_for_non_replicated_merge_tree` 2) Now `allow_experimental_parallel_reading_from_replicas` have 3 possible values - 0, 1 and 2. 0 - disabled, 1 - enabled, silently disable them in case of failure (in case of FINAL or JOIN), 2 - enabled, throw an expection in case of failure. 3) If FINAL modifier is used in SELECT query and parallel replicas are enabled, ClickHouse will try to disable them if `allow_experimental_parallel_reading_from_replicas` is set to 1 and throw an exception otherwise. [#50195](https://github.com/ClickHouse/ClickHouse/pull/50195) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). +* When parallel replicas are enabled they will always skip unavailable servers (the behavior is controlled by the setting `skip_unavailable_shards`, enabled by default and can be only disabled). This closes: [#48565](https://github.com/ClickHouse/ClickHouse/issues/48565). [#50293](https://github.com/ClickHouse/ClickHouse/pull/50293) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). + +#### Improvement +* The `BACKUP` command will not decrypt data from encrypted disks while making a backup. Instead the data will be stored in a backup in encrypted form. Such backups can be restored only to an encrypted disk with the same (or extended) list of encryption keys. [#48896](https://github.com/ClickHouse/ClickHouse/pull/48896) ([Vitaly Baranov](https://github.com/vitlibar)). +* Added possibility to use temporary tables in FROM part of ATTACH PARTITION FROM and REPLACE PARTITION FROM. [#49436](https://github.com/ClickHouse/ClickHouse/pull/49436) ([Roman Vasin](https://github.com/rvasin)). +* Added setting `async_insert` for `MergeTree` tables. It has the same meaning as query-level setting `async_insert` and enables asynchronous inserts for specific table. Note: it doesn't take effect for insert queries from `clickhouse-client`, use query-level setting in that case. [#49122](https://github.com/ClickHouse/ClickHouse/pull/49122) ([Anton Popov](https://github.com/CurtizJ)). +* Add support for size suffixes in quota creation statement parameters. [#49087](https://github.com/ClickHouse/ClickHouse/pull/49087) ([Eridanus](https://github.com/Eridanus117)). +* Extend `first_value` and `last_value` to accept NULL. [#46467](https://github.com/ClickHouse/ClickHouse/pull/46467) ([lgbo](https://github.com/lgbo-ustc)). +* Add alias `str_to_map` and `mapFromString` for `extractKeyValuePairs`. closes https://github.com/clickhouse/clickhouse/issues/47185. [#49466](https://github.com/ClickHouse/ClickHouse/pull/49466) ([flynn](https://github.com/ucasfl)). +* Add support for CGroup version 2 for asynchronous metrics about the memory usage and availability. This closes [#37983](https://github.com/ClickHouse/ClickHouse/issues/37983). [#45999](https://github.com/ClickHouse/ClickHouse/pull/45999) ([sichenzhao](https://github.com/sichenzhao)). +* Cluster table functions should always skip unavailable shards. close [#46314](https://github.com/ClickHouse/ClickHouse/issues/46314). [#46765](https://github.com/ClickHouse/ClickHouse/pull/46765) ([zk_kiger](https://github.com/zk-kiger)). +* Allow CSV file to contain empty columns in its header. [#47496](https://github.com/ClickHouse/ClickHouse/pull/47496) ([你不要过来啊](https://github.com/iiiuwioajdks)). +* Add Google Cloud Storage S3 compatible table function `gcs`. Like the `oss` and `cosn` functions, it is just an alias over the `s3` table function, and it does not bring any new features. [#47815](https://github.com/ClickHouse/ClickHouse/pull/47815) ([Kuba Kaflik](https://github.com/jkaflik)). +* Add ability to use strict parts size for S3 (compatibility with CloudFlare R2 S3 Storage). [#48492](https://github.com/ClickHouse/ClickHouse/pull/48492) ([Azat Khuzhin](https://github.com/azat)). +* Added new columns with info about `Replicated` database replicas to `system.clusters`: `database_shard_name`, `database_replica_name`, `is_active`. Added an optional `FROM SHARD` clause to `SYSTEM DROP DATABASE REPLICA` query. [#48548](https://github.com/ClickHouse/ClickHouse/pull/48548) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Add a new column `zookeeper_name` in system.replicas, to indicate on which (auxiliary) zookeeper cluster the replicated table's metadata is stored. [#48549](https://github.com/ClickHouse/ClickHouse/pull/48549) ([cangyin](https://github.com/cangyin)). +* `IN` operator support the comparison of `Date` and `Date32`. Closes [#48736](https://github.com/ClickHouse/ClickHouse/issues/48736). [#48806](https://github.com/ClickHouse/ClickHouse/pull/48806) ([flynn](https://github.com/ucasfl)). +* Support for erasure codes in `HDFS`, author: @M1eyu2018, @tomscut. [#48833](https://github.com/ClickHouse/ClickHouse/pull/48833) ([M1eyu](https://github.com/M1eyu2018)). +* Implement SYSTEM DROP REPLICA from auxillary ZooKeeper clusters, may be close [#48931](https://github.com/ClickHouse/ClickHouse/issues/48931). [#48932](https://github.com/ClickHouse/ClickHouse/pull/48932) ([wangxiaobo](https://github.com/wzb5212)). +* Add Array data type to MongoDB. Closes [#48598](https://github.com/ClickHouse/ClickHouse/issues/48598). [#48983](https://github.com/ClickHouse/ClickHouse/pull/48983) ([Nikolay Degterinsky](https://github.com/evillique)). +* Support storing `Interval` data types in tables. [#49085](https://github.com/ClickHouse/ClickHouse/pull/49085) ([larryluogit](https://github.com/larryluogit)). +* Allow using `ntile` window function without explicit window frame definition: `ntile(3) OVER (ORDER BY a)`, close [#46763](https://github.com/ClickHouse/ClickHouse/issues/46763). [#49093](https://github.com/ClickHouse/ClickHouse/pull/49093) ([vdimir](https://github.com/vdimir)). +* Added settings (`number_of_mutations_to_delay`, `number_of_mutations_to_throw`) to delay or throw `ALTER` queries that create mutations (`ALTER UPDATE`, `ALTER DELETE`, `ALTER MODIFY COLUMN`, ...) in case when table already has a lot of unfinished mutations. [#49117](https://github.com/ClickHouse/ClickHouse/pull/49117) ([Anton Popov](https://github.com/CurtizJ)). +* Catch exception from `create_directories` in filesystem cache. [#49203](https://github.com/ClickHouse/ClickHouse/pull/49203) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Copies embedded examples to a new field `example` in `system.functions` to supplement the field `description`. [#49222](https://github.com/ClickHouse/ClickHouse/pull/49222) ([Dan Roscigno](https://github.com/DanRoscigno)). +* Enable connection options for the MongoDB dictionary. Example: ``` xml localhost 27017 test dictionary_source ssl=true ``` ### Documentation entry for user-facing changes. [#49225](https://github.com/ClickHouse/ClickHouse/pull/49225) ([MikhailBurdukov](https://github.com/MikhailBurdukov)). +* Added an alias `asymptotic` for `asymp` computational method for `kolmogorovSmirnovTest`. Improved documentation. [#49286](https://github.com/ClickHouse/ClickHouse/pull/49286) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). +* Aggregation function groupBitAnd/Or/Xor now work on signed integer data. This makes them consistent with the behavior of scalar functions bitAnd/Or/Xor. [#49292](https://github.com/ClickHouse/ClickHouse/pull/49292) ([exmy](https://github.com/exmy)). +* Split function-documentation into more fine-granular fields. [#49300](https://github.com/ClickHouse/ClickHouse/pull/49300) ([Robert Schulze](https://github.com/rschu1ze)). +* Use multiple threads shared between all tables within a server to load outdated data parts. The the size of the pool and its queue is controlled by `max_outdated_parts_loading_thread_pool_size` and `outdated_part_loading_thread_pool_queue_size` settings. [#49317](https://github.com/ClickHouse/ClickHouse/pull/49317) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). +* Don't overestimate the size of processed data for `LowCardinality` columns when they share dictionaries between blocks. This closes [#49322](https://github.com/ClickHouse/ClickHouse/issues/49322). See also [#48745](https://github.com/ClickHouse/ClickHouse/issues/48745). [#49323](https://github.com/ClickHouse/ClickHouse/pull/49323) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Parquet writer now uses reasonable row group size when invoked through `OUTFILE`. [#49325](https://github.com/ClickHouse/ClickHouse/pull/49325) ([Michael Kolupaev](https://github.com/al13n321)). +* Allow restricted keywords like `ARRAY` as an alias if the alias is quoted. Closes [#49324](https://github.com/ClickHouse/ClickHouse/issues/49324). [#49360](https://github.com/ClickHouse/ClickHouse/pull/49360) ([Nikolay Degterinsky](https://github.com/evillique)). +* Data parts loading and deletion jobs were moved to shared server-wide pools instead of per-table pools. Pools sizes are controlled via settings `max_active_parts_loading_thread_pool_size`, `max_outdated_parts_loading_thread_pool_size` and `max_parts_cleaning_thread_pool_size` in top-level config. Table-level settings `max_part_loading_threads` and `max_part_removal_threads` became obsolete. [#49474](https://github.com/ClickHouse/ClickHouse/pull/49474) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). +* Allow `?password=pass` in URL of the Play UI. Password is replaced in browser history. [#49505](https://github.com/ClickHouse/ClickHouse/pull/49505) ([Mike Kot](https://github.com/myrrc)). +* Allow reading zero-size objects from remote filesystems. (because empty files are not backup'd, so we might end up with zero blobs in metadata file). Closes [#49480](https://github.com/ClickHouse/ClickHouse/issues/49480). [#49519](https://github.com/ClickHouse/ClickHouse/pull/49519) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Attach thread MemoryTracker to `total_memory_tracker` after `ThreadGroup` detached. [#49527](https://github.com/ClickHouse/ClickHouse/pull/49527) ([Dmitry Novik](https://github.com/novikd)). +* Fix parameterized views when a query parameter is used multiple times in the query. [#49556](https://github.com/ClickHouse/ClickHouse/pull/49556) ([Azat Khuzhin](https://github.com/azat)). +* Release memory allocated for the last sent ProfileEvents snapshot in the context of a query. Followup [#47564](https://github.com/ClickHouse/ClickHouse/issues/47564). [#49561](https://github.com/ClickHouse/ClickHouse/pull/49561) ([Dmitry Novik](https://github.com/novikd)). +* Function "makeDate" now provides a MySQL-compatible overload (year & day of the year argument). [#49603](https://github.com/ClickHouse/ClickHouse/pull/49603) ([Robert Schulze](https://github.com/rschu1ze)). +* Support `dictionary` table function for `RegExpTreeDictionary`. [#49666](https://github.com/ClickHouse/ClickHouse/pull/49666) ([Han Fei](https://github.com/hanfei1991)). +* Added weighted fair IO scheduling policy. Added dynamic resource manager, which allows IO scheduling hierarchy to be updated in runtime w/o server restarts. [#49671](https://github.com/ClickHouse/ClickHouse/pull/49671) ([Sergei Trifonov](https://github.com/serxa)). +* Add compose request after multipart upload to GCS. This enables the usage of copy operation on objects uploaded with the multipart upload. It's recommended to set `s3_strict_upload_part_size` to some value because compose request can fail on objects created with parts of different sizes. [#49693](https://github.com/ClickHouse/ClickHouse/pull/49693) ([Antonio Andelic](https://github.com/antonio2368)). +* For the `extractKeyValuePairs` function: improve the "best-effort" parsing logic to accept `key_value_delimiter` as a valid part of the value. This also simplifies branching and might even speed up things a bit. [#49760](https://github.com/ClickHouse/ClickHouse/pull/49760) ([Arthur Passos](https://github.com/arthurpassos)). +* Add `initial_query_id` field for system.processors_profile_log [#49777](https://github.com/ClickHouse/ClickHouse/pull/49777) ([helifu](https://github.com/helifu)). +* System log tables can now have custom sorting keys. [#49778](https://github.com/ClickHouse/ClickHouse/pull/49778) ([helifu](https://github.com/helifu)). +* A new field `partitions` to `system.query_log` is used to indicate which partitions are participating in the calculation. [#49779](https://github.com/ClickHouse/ClickHouse/pull/49779) ([helifu](https://github.com/helifu)). +* Added `enable_the_endpoint_id_with_zookeeper_name_prefix` setting for `ReplicatedMergeTree` (disabled by default). When enabled, it adds ZooKeeper cluster name to table's interserver communication endpoint. It avoids `Duplicate interserver IO endpoint` errors when having replicated tables with the same path, but different auxiliary ZooKeepers. [#49780](https://github.com/ClickHouse/ClickHouse/pull/49780) ([helifu](https://github.com/helifu)). +* Add query parameters to `clickhouse-local`. Closes [#46561](https://github.com/ClickHouse/ClickHouse/issues/46561). [#49785](https://github.com/ClickHouse/ClickHouse/pull/49785) ([Nikolay Degterinsky](https://github.com/evillique)). +* Allow loading dictionaries and functions from YAML by default. In previous versions, it required editing the `dictionaries_config` or `user_defined_executable_functions_config` in the configuration file, as they expected `*.xml` files. [#49812](https://github.com/ClickHouse/ClickHouse/pull/49812) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* The Kafka table engine now allows to use alias columns. [#49824](https://github.com/ClickHouse/ClickHouse/pull/49824) ([Aleksandr Musorin](https://github.com/AVMusorin)). +* Add setting to limit the max number of pairs produced by `extractKeyValuePairs`, a safeguard to avoid using way too much memory. [#49836](https://github.com/ClickHouse/ClickHouse/pull/49836) ([Arthur Passos](https://github.com/arthurpassos)). +* Add support for (an unusual) case where the arguments in the `IN` operator are single-element tuples. [#49844](https://github.com/ClickHouse/ClickHouse/pull/49844) ([MikhailBurdukov](https://github.com/MikhailBurdukov)). +* `bitHammingDistance` function support `String` and `FixedString` data type. Closes [#48827](https://github.com/ClickHouse/ClickHouse/issues/48827). [#49858](https://github.com/ClickHouse/ClickHouse/pull/49858) ([flynn](https://github.com/ucasfl)). +* Fix timeout resetting errors in the client on OS X. [#49863](https://github.com/ClickHouse/ClickHouse/pull/49863) ([alekar](https://github.com/alekar)). +* Add support for big integers, such as UInt128, Int128, UInt256, and Int256 in the function `bitCount`. This enables Hamming distance over large bit masks for AI applications. [#49867](https://github.com/ClickHouse/ClickHouse/pull/49867) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fingerprints to be used instead of key IDs in encrypted disks. This simplifies the configuration of encrypted disks. [#49882](https://github.com/ClickHouse/ClickHouse/pull/49882) ([Vitaly Baranov](https://github.com/vitlibar)). +* Add UUID data type to PostgreSQL. Closes [#49739](https://github.com/ClickHouse/ClickHouse/issues/49739). [#49894](https://github.com/ClickHouse/ClickHouse/pull/49894) ([Nikolay Degterinsky](https://github.com/evillique)). +* Function `toUnixTimestamp` now accepts `Date` and `Date32` arguments. [#49989](https://github.com/ClickHouse/ClickHouse/pull/49989) ([Victor Krasnov](https://github.com/sirvickr)). +* Charge only server memory for dictionaries. [#49995](https://github.com/ClickHouse/ClickHouse/pull/49995) ([Azat Khuzhin](https://github.com/azat)). +* The server will allow using the `SQL_*` settings such as `SQL_AUTO_IS_NULL` as no-ops for MySQL compatibility. This closes [#49927](https://github.com/ClickHouse/ClickHouse/issues/49927). [#50013](https://github.com/ClickHouse/ClickHouse/pull/50013) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Preserve initial_query_id for ON CLUSTER queries, which is useful for introspection (under `distributed_ddl_entry_format_version=5`). [#50015](https://github.com/ClickHouse/ClickHouse/pull/50015) ([Azat Khuzhin](https://github.com/azat)). +* Preserve backward incompatibility for renamed settings by using aliases (`allow_experimental_projection_optimization` for `optimize_use_projections`, `allow_experimental_lightweight_delete` for `enable_lightweight_delete`). [#50044](https://github.com/ClickHouse/ClickHouse/pull/50044) ([Azat Khuzhin](https://github.com/azat)). +* Support passing FQDN through setting my_hostname to register cluster node in keeper. Add setting of invisible to support multi compute groups. A compute group as a cluster, is invisible to other compute groups. [#50186](https://github.com/ClickHouse/ClickHouse/pull/50186) ([Yangkuan Liu](https://github.com/LiuYangkuan)). +* Fix PostgreSQL reading all the data even though `LIMIT n` could be specified. [#50187](https://github.com/ClickHouse/ClickHouse/pull/50187) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Add new profile events for queries with subqueries (`QueriesWithSubqueries`/`SelectQueriesWithSubqueries`/`InsertQueriesWithSubqueries`). [#50204](https://github.com/ClickHouse/ClickHouse/pull/50204) ([Azat Khuzhin](https://github.com/azat)). +* Adding the roles field in the users.xml file, which allows specifying roles with grants via a config file. [#50278](https://github.com/ClickHouse/ClickHouse/pull/50278) ([pufit](https://github.com/pufit)). +* Report `CGroupCpuCfsPeriod` and `CGroupCpuCfsQuota` in AsynchronousMetrics. - Respect cgroup v2 memory limits during server startup. [#50379](https://github.com/ClickHouse/ClickHouse/pull/50379) ([alekar](https://github.com/alekar)). +* Add a signal handler for SIGQUIT to work the same way as SIGINT. Closes [#50298](https://github.com/ClickHouse/ClickHouse/issues/50298). [#50435](https://github.com/ClickHouse/ClickHouse/pull/50435) ([Nikolay Degterinsky](https://github.com/evillique)). +* In case JSON parse fails due to the large size of the object output the last position to allow debugging. [#50474](https://github.com/ClickHouse/ClickHouse/pull/50474) ([Valentin Alexeev](https://github.com/valentinalexeev)). +* Support decimals with not fixed size. Closes [#49130](https://github.com/ClickHouse/ClickHouse/issues/49130). [#50586](https://github.com/ClickHouse/ClickHouse/pull/50586) ([Kruglov Pavel](https://github.com/Avogar)). + +#### Build/Testing/Packaging Improvement +* New and improved `keeper-bench`. Everything can be customized from YAML/XML file: - request generator - each type of request generator can have a specific set of fields - multi requests can be generated just by doing the same under `multi` key - for each request or subrequest in multi a `weight` field can be defined to control distribution - define trees that need to be setup for a test run - hosts can be defined with all timeouts customizable and it's possible to control how many sessions to generate for each host - integers defined with `min_value` and `max_value` fields are random number generators. [#48547](https://github.com/ClickHouse/ClickHouse/pull/48547) ([Antonio Andelic](https://github.com/antonio2368)). +* Io_uring is not supported on macos, don't choose it when running tests on local to avoid occassional failures. [#49250](https://github.com/ClickHouse/ClickHouse/pull/49250) ([Frank Chen](https://github.com/FrankChen021)). +* Support named fault injection for testing. [#49361](https://github.com/ClickHouse/ClickHouse/pull/49361) ([Han Fei](https://github.com/hanfei1991)). +* Allow running ClickHouse in the OS where the `prctl` (process control) syscall is not available, such as AWS Lambda. [#49538](https://github.com/ClickHouse/ClickHouse/pull/49538) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fixed the issue of build conflict between contrib/isa-l and isa-l in qpl [49296](https://github.com/ClickHouse/ClickHouse/issues/49296). [#49584](https://github.com/ClickHouse/ClickHouse/pull/49584) ([jasperzhu](https://github.com/jinjunzh)). +* Utilities are now only build if explicitly requested ("-DENABLE_UTILS=1") instead of by default, this reduces link times in typical development builds. [#49620](https://github.com/ClickHouse/ClickHouse/pull/49620) ([Robert Schulze](https://github.com/rschu1ze)). +* Pull build description of idxd-config into a separate CMake file to avoid accidental removal in future. [#49651](https://github.com/ClickHouse/ClickHouse/pull/49651) ([jasperzhu](https://github.com/jinjunzh)). +* Add CI check with an enabled analyzer in the master. Follow-up [#49562](https://github.com/ClickHouse/ClickHouse/issues/49562). [#49668](https://github.com/ClickHouse/ClickHouse/pull/49668) ([Dmitry Novik](https://github.com/novikd)). +* Switch to LLVM/clang 16. [#49678](https://github.com/ClickHouse/ClickHouse/pull/49678) ([Azat Khuzhin](https://github.com/azat)). +* Allow building ClickHouse with clang-17. [#49851](https://github.com/ClickHouse/ClickHouse/pull/49851) ([Alexey Milovidov](https://github.com/alexey-milovidov)). [#50410](https://github.com/ClickHouse/ClickHouse/pull/50410) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* ClickHouse is now easier to be integrated into other cmake projects. [#49991](https://github.com/ClickHouse/ClickHouse/pull/49991) ([Amos Bird](https://github.com/amosbird)). (Which is strongly discouraged - Alexey Milovidov). +* Fix strange additional QEMU logging after [#47151](https://github.com/ClickHouse/ClickHouse/issues/47151), see https://s3.amazonaws.com/clickhouse-test-reports/50078/a4743996ee4f3583884d07bcd6501df0cfdaa346/stateless_tests__release__databasereplicated__[3_4].html. [#50442](https://github.com/ClickHouse/ClickHouse/pull/50442) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* ClickHouse can work on Linux RISC-V 6.1.22. This closes [#50456](https://github.com/ClickHouse/ClickHouse/issues/50456). [#50457](https://github.com/ClickHouse/ClickHouse/pull/50457) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Bump internal protobuf to v3.18 (fixes bogus CVE-2022-1941). [#50400](https://github.com/ClickHouse/ClickHouse/pull/50400) ([Robert Schulze](https://github.com/rschu1ze)). +* Bump internal libxml2 to v2.10.4 (fixes bogus CVE-2023-28484 and bogus CVE-2023-29469). [#50402](https://github.com/ClickHouse/ClickHouse/pull/50402) ([Robert Schulze](https://github.com/rschu1ze)). +* Bump c-ares to v1.19.1 (bogus CVE-2023-32067, bogus CVE-2023-31130, bogus CVE-2023-31147). [#50403](https://github.com/ClickHouse/ClickHouse/pull/50403) ([Robert Schulze](https://github.com/rschu1ze)). +* Fix bogus CVE-2022-2469 in libgsasl. [#50404](https://github.com/ClickHouse/ClickHouse/pull/50404) ([Robert Schulze](https://github.com/rschu1ze)). + +#### Bug Fix (user-visible misbehavior in an official stable release) + +* ActionsDAG: fix wrong optimization [#47584](https://github.com/ClickHouse/ClickHouse/pull/47584) ([Salvatore Mesoraca](https://github.com/aiven-sal)). +* Correctly handle concurrent snapshots in Keeper [#48466](https://github.com/ClickHouse/ClickHouse/pull/48466) ([Antonio Andelic](https://github.com/antonio2368)). +* MergeTreeMarksLoader holds DataPart instead of DataPartStorage [#48515](https://github.com/ClickHouse/ClickHouse/pull/48515) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). +* Sequence state fix [#48603](https://github.com/ClickHouse/ClickHouse/pull/48603) ([Ilya Golshtein](https://github.com/ilejn)). +* Back/Restore concurrency check on previous fails [#48726](https://github.com/ClickHouse/ClickHouse/pull/48726) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). +* Fix Attaching a table with non-existent ZK path does not increase the ReadonlyReplica metric [#48954](https://github.com/ClickHouse/ClickHouse/pull/48954) ([wangxiaobo](https://github.com/wzb5212)). +* Fix possible terminate called for uncaught exception in some places [#49112](https://github.com/ClickHouse/ClickHouse/pull/49112) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix key not found error for queries with multiple StorageJoin [#49137](https://github.com/ClickHouse/ClickHouse/pull/49137) ([vdimir](https://github.com/vdimir)). +* Fix wrong query result when using nullable primary key [#49172](https://github.com/ClickHouse/ClickHouse/pull/49172) ([Duc Canh Le](https://github.com/canhld94)). +* Fix reinterpretAs*() on big endian machines [#49198](https://github.com/ClickHouse/ClickHouse/pull/49198) ([Suzy Wang](https://github.com/SuzyWangIBMer)). +* (Experimental zero-copy replication) Lock zero copy parts more atomically [#49211](https://github.com/ClickHouse/ClickHouse/pull/49211) ([alesapin](https://github.com/alesapin)). +* Fix race on Outdated parts loading [#49223](https://github.com/ClickHouse/ClickHouse/pull/49223) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Fix all key value is null and group use rollup return wrong answer [#49282](https://github.com/ClickHouse/ClickHouse/pull/49282) ([Shuai li](https://github.com/loneylee)). +* Fix calculating load_factor for HASHED dictionaries with SHARDS [#49319](https://github.com/ClickHouse/ClickHouse/pull/49319) ([Azat Khuzhin](https://github.com/azat)). +* Disallow configuring compression CODECs for alias columns [#49363](https://github.com/ClickHouse/ClickHouse/pull/49363) ([Timur Solodovnikov](https://github.com/tsolodov)). +* Fix bug in removal of existing part directory [#49365](https://github.com/ClickHouse/ClickHouse/pull/49365) ([alesapin](https://github.com/alesapin)). +* Properly fix GCS when HMAC is used [#49390](https://github.com/ClickHouse/ClickHouse/pull/49390) ([Antonio Andelic](https://github.com/antonio2368)). +* Fix fuzz bug when subquery set is not built when reading from remote() [#49425](https://github.com/ClickHouse/ClickHouse/pull/49425) ([Alexander Gololobov](https://github.com/davenger)). +* Invert `shutdown_wait_unfinished_queries` [#49427](https://github.com/ClickHouse/ClickHouse/pull/49427) ([Konstantin Bogdanov](https://github.com/thevar1able)). +* (Experimental zero-copy replication) Fix another zero copy bug [#49473](https://github.com/ClickHouse/ClickHouse/pull/49473) ([alesapin](https://github.com/alesapin)). +* Fix postgres database setting [#49481](https://github.com/ClickHouse/ClickHouse/pull/49481) ([Mal Curtis](https://github.com/snikch)). +* Correctly handle `s3Cluster` arguments [#49490](https://github.com/ClickHouse/ClickHouse/pull/49490) ([Antonio Andelic](https://github.com/antonio2368)). +* Fix bug in TraceCollector destructor. [#49508](https://github.com/ClickHouse/ClickHouse/pull/49508) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). +* Fix AsynchronousReadIndirectBufferFromRemoteFS breaking on short seeks [#49525](https://github.com/ClickHouse/ClickHouse/pull/49525) ([Michael Kolupaev](https://github.com/al13n321)). +* Fix dictionaries loading order [#49560](https://github.com/ClickHouse/ClickHouse/pull/49560) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Forbid the change of data type of Object('json') column [#49563](https://github.com/ClickHouse/ClickHouse/pull/49563) ([Nikolay Degterinsky](https://github.com/evillique)). +* Fix stress test (Logical error: Expected 7134 >= 11030) [#49623](https://github.com/ClickHouse/ClickHouse/pull/49623) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix bug in DISTINCT [#49628](https://github.com/ClickHouse/ClickHouse/pull/49628) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fix: DISTINCT in order with zero values in non-sorted columns [#49636](https://github.com/ClickHouse/ClickHouse/pull/49636) ([Igor Nikonov](https://github.com/devcrafter)). +* Fix one-off error in big integers found by UBSan with fuzzer [#49645](https://github.com/ClickHouse/ClickHouse/pull/49645) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fix reading from sparse columns after restart [#49660](https://github.com/ClickHouse/ClickHouse/pull/49660) ([Anton Popov](https://github.com/CurtizJ)). +* Fix assert in SpanHolder::finish() with fibers [#49673](https://github.com/ClickHouse/ClickHouse/pull/49673) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix short circuit functions and mutations with sparse arguments [#49716](https://github.com/ClickHouse/ClickHouse/pull/49716) ([Anton Popov](https://github.com/CurtizJ)). +* Fix writing appended files to incremental backups [#49725](https://github.com/ClickHouse/ClickHouse/pull/49725) ([Vitaly Baranov](https://github.com/vitlibar)). +* Ignore LWD column in checkPartDynamicColumns [#49737](https://github.com/ClickHouse/ClickHouse/pull/49737) ([Alexander Gololobov](https://github.com/davenger)). +* Fix msan issue in randomStringUTF8(uneven number) [#49750](https://github.com/ClickHouse/ClickHouse/pull/49750) ([Robert Schulze](https://github.com/rschu1ze)). +* Fix aggregate function kolmogorovSmirnovTest [#49768](https://github.com/ClickHouse/ClickHouse/pull/49768) ([FFFFFFFHHHHHHH](https://github.com/FFFFFFFHHHHHHH)). +* Fix settings aliases in native protocol [#49776](https://github.com/ClickHouse/ClickHouse/pull/49776) ([Azat Khuzhin](https://github.com/azat)). +* Fix `arrayMap` with array of tuples with single argument [#49789](https://github.com/ClickHouse/ClickHouse/pull/49789) ([Anton Popov](https://github.com/CurtizJ)). +* Fix per-query IO/BACKUPs throttling settings [#49797](https://github.com/ClickHouse/ClickHouse/pull/49797) ([Azat Khuzhin](https://github.com/azat)). +* Fix setting NULL in profile definition [#49831](https://github.com/ClickHouse/ClickHouse/pull/49831) ([Vitaly Baranov](https://github.com/vitlibar)). +* Fix a bug with projections and the aggregate_functions_null_for_empty setting (for query_plan_optimize_projection) [#49873](https://github.com/ClickHouse/ClickHouse/pull/49873) ([Amos Bird](https://github.com/amosbird)). +* Fix processing pending batch for Distributed async INSERT after restart [#49884](https://github.com/ClickHouse/ClickHouse/pull/49884) ([Azat Khuzhin](https://github.com/azat)). +* Fix assertion in CacheMetadata::doCleanup [#49914](https://github.com/ClickHouse/ClickHouse/pull/49914) ([Kseniia Sumarokova](https://github.com/kssenii)). +* fix `is_prefix` in OptimizeRegularExpression [#49919](https://github.com/ClickHouse/ClickHouse/pull/49919) ([Han Fei](https://github.com/hanfei1991)). +* Fix metrics `WriteBufferFromS3Bytes`, `WriteBufferFromS3Microseconds` and `WriteBufferFromS3RequestsErrors` [#49930](https://github.com/ClickHouse/ClickHouse/pull/49930) ([Aleksandr Musorin](https://github.com/AVMusorin)). +* Fix IPv6 encoding in protobuf [#49933](https://github.com/ClickHouse/ClickHouse/pull/49933) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). +* Fix possible Logical error on bad Nullable parsing for text formats [#49960](https://github.com/ClickHouse/ClickHouse/pull/49960) ([Kruglov Pavel](https://github.com/Avogar)). +* Add setting output_format_parquet_compliant_nested_types to produce more compatible Parquet files [#50001](https://github.com/ClickHouse/ClickHouse/pull/50001) ([Michael Kolupaev](https://github.com/al13n321)). +* Fix logical error in stress test "Not enough space to add ..." [#50021](https://github.com/ClickHouse/ClickHouse/pull/50021) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Avoid deadlock when starting table in attach thread of `ReplicatedMergeTree` [#50026](https://github.com/ClickHouse/ClickHouse/pull/50026) ([Antonio Andelic](https://github.com/antonio2368)). +* Fix assert in SpanHolder::finish() with fibers attempt 2 [#50034](https://github.com/ClickHouse/ClickHouse/pull/50034) ([Kruglov Pavel](https://github.com/Avogar)). +* Add proper escaping for DDL OpenTelemetry context serialization [#50045](https://github.com/ClickHouse/ClickHouse/pull/50045) ([Azat Khuzhin](https://github.com/azat)). +* Fix reporting broken projection parts [#50052](https://github.com/ClickHouse/ClickHouse/pull/50052) ([Amos Bird](https://github.com/amosbird)). +* JIT compilation not equals NaN fix [#50056](https://github.com/ClickHouse/ClickHouse/pull/50056) ([Maksim Kita](https://github.com/kitaisreal)). +* Fix crashing in case of Replicated database without arguments [#50058](https://github.com/ClickHouse/ClickHouse/pull/50058) ([Azat Khuzhin](https://github.com/azat)). +* Fix crash with `multiIf` and constant condition and nullable arguments [#50123](https://github.com/ClickHouse/ClickHouse/pull/50123) ([Anton Popov](https://github.com/CurtizJ)). +* Fix invalid index analysis for date related keys [#50153](https://github.com/ClickHouse/ClickHouse/pull/50153) ([Amos Bird](https://github.com/amosbird)). +* do not allow modify order by when there are no order by cols [#50154](https://github.com/ClickHouse/ClickHouse/pull/50154) ([Han Fei](https://github.com/hanfei1991)). +* Fix broken index analysis when binary operator contains a null constant argument [#50177](https://github.com/ClickHouse/ClickHouse/pull/50177) ([Amos Bird](https://github.com/amosbird)). +* clickhouse-client: disallow usage of `--query` and `--queries-file` at the same time [#50210](https://github.com/ClickHouse/ClickHouse/pull/50210) ([Alexey Gerasimchuk](https://github.com/Demilivor)). +* Fix UB for INTO OUTFILE extensions (APPEND / AND STDOUT) and WATCH EVENTS [#50216](https://github.com/ClickHouse/ClickHouse/pull/50216) ([Azat Khuzhin](https://github.com/azat)). +* Fix skipping spaces at end of row in CustomSeparatedIgnoreSpaces format [#50224](https://github.com/ClickHouse/ClickHouse/pull/50224) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix iceberg metadata parsing [#50232](https://github.com/ClickHouse/ClickHouse/pull/50232) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix nested distributed SELECT in WITH clause [#50234](https://github.com/ClickHouse/ClickHouse/pull/50234) ([Azat Khuzhin](https://github.com/azat)). +* Fix msan issue in keyed siphash [#50245](https://github.com/ClickHouse/ClickHouse/pull/50245) ([Robert Schulze](https://github.com/rschu1ze)). +* Fix bugs in Poco sockets in non-blocking mode, use true non-blocking sockets [#50252](https://github.com/ClickHouse/ClickHouse/pull/50252) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix checksum calculation for backup entries [#50264](https://github.com/ClickHouse/ClickHouse/pull/50264) ([Vitaly Baranov](https://github.com/vitlibar)). +* Comparison functions NaN fix [#50287](https://github.com/ClickHouse/ClickHouse/pull/50287) ([Maksim Kita](https://github.com/kitaisreal)). +* JIT aggregation nullable key fix [#50291](https://github.com/ClickHouse/ClickHouse/pull/50291) ([Maksim Kita](https://github.com/kitaisreal)). +* Fix clickhouse-local crashing when writing empty Arrow or Parquet output [#50328](https://github.com/ClickHouse/ClickHouse/pull/50328) ([Michael Kolupaev](https://github.com/al13n321)). +* Fix crash when Pool::Entry::disconnect() is called [#50334](https://github.com/ClickHouse/ClickHouse/pull/50334) ([Val Doroshchuk](https://github.com/valbok)). +* Improved fetch part by holding directory lock longer [#50339](https://github.com/ClickHouse/ClickHouse/pull/50339) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). +* Fix bitShift* functions with both constant arguments [#50343](https://github.com/ClickHouse/ClickHouse/pull/50343) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix Keeper deadlock on exception when preprocessing requests. [#50387](https://github.com/ClickHouse/ClickHouse/pull/50387) ([frinkr](https://github.com/frinkr)). +* Fix hashing of const integer values [#50421](https://github.com/ClickHouse/ClickHouse/pull/50421) ([Robert Schulze](https://github.com/rschu1ze)). +* Fix merge_tree_min_rows_for_seek/merge_tree_min_bytes_for_seek for data skipping indexes [#50432](https://github.com/ClickHouse/ClickHouse/pull/50432) ([Azat Khuzhin](https://github.com/azat)). +* Limit the number of in-flight tasks for loading outdated parts [#50450](https://github.com/ClickHouse/ClickHouse/pull/50450) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). +* Keeper fix: apply uncommitted state after snapshot install [#50483](https://github.com/ClickHouse/ClickHouse/pull/50483) ([Antonio Andelic](https://github.com/antonio2368)). +* Fix incorrect constant folding [#50536](https://github.com/ClickHouse/ClickHouse/pull/50536) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fix logical error in stress test (Not enough space to add ...) [#50583](https://github.com/ClickHouse/ClickHouse/pull/50583) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix converting Null to LowCardinality(Nullable) in values table function [#50637](https://github.com/ClickHouse/ClickHouse/pull/50637) ([Kruglov Pavel](https://github.com/Avogar)). +* Revert invalid RegExpTreeDictionary optimization [#50642](https://github.com/ClickHouse/ClickHouse/pull/50642) ([Johann Gan](https://github.com/johanngan)). + ### ClickHouse release 23.4, 2023-04-26 #### Backward Incompatible Change From e66531affac094567d95d038cfbf8c9ca90027ae Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Thu, 8 Jun 2023 12:50:26 +0200 Subject: [PATCH 0650/1072] Update test --- tests/queries/0_stateless/02782_bitmap_overflow.sql | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/queries/0_stateless/02782_bitmap_overflow.sql b/tests/queries/0_stateless/02782_bitmap_overflow.sql index 656a3e7c144..71ddce5c3b9 100644 --- a/tests/queries/0_stateless/02782_bitmap_overflow.sql +++ b/tests/queries/0_stateless/02782_bitmap_overflow.sql @@ -1,2 +1,4 @@ +-- Tags: no-msan, no-asan + select unhex('0181808080908380808000')::AggregateFunction(groupBitmap, UInt64); -- {serverError TOO_LARGE_ARRAY_SIZE} From 44ee530b4d50ed09c39133fb81e827e6c3402a31 Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 8 Jun 2023 11:04:01 +0000 Subject: [PATCH 0651/1072] Kill gdb in clickhouse-test before getting stacktraces --- tests/clickhouse-test | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tests/clickhouse-test b/tests/clickhouse-test index d8fad77b95c..a7ec7b15e16 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -340,9 +340,22 @@ def get_transactions_list(args): return f"Cannot get list of transactions: {e}" +def kill_gdb(): + for i in range(5): + code = subprocess.call("kill -TERM $(pidof gdb)", shell=True, stderr=subprocess.STDOUT, timeout=30) + if code != 0: + time.sleep(i) + else: + break + # collect server stacktraces using gdb def get_stacktraces_from_gdb(server_pid): try: + # We could attach gdb to clickhouse-server before running some tests + # to print stacktraces of all crashes even if clickhouse cannot print it for some reason. + # We should kill existing gdb if any before starting new one. + kill_gdb() + cmd = f"gdb -batch -ex 'thread apply all backtrace' -p {server_pid}" return subprocess.check_output(cmd, shell=True).decode("utf-8") except Exception as e: From 54414be47b33f352bea49ee51cc2502f1f41b21d Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 8 Jun 2023 11:14:43 +0000 Subject: [PATCH 0652/1072] Better --- tests/clickhouse-test | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tests/clickhouse-test b/tests/clickhouse-test index a7ec7b15e16..56cf2f0ce0f 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -340,9 +340,14 @@ def get_transactions_list(args): return f"Cannot get list of transactions: {e}" -def kill_gdb(): +def kill_gdb_if_any(): + # Check if we have running gdb. + code = subprocess.call("pidof gdb", shell=True) + if code != 0: + return + for i in range(5): - code = subprocess.call("kill -TERM $(pidof gdb)", shell=True, stderr=subprocess.STDOUT, timeout=30) + code = subprocess.call("kill -TERM $(pidof gdb)", shell=True, timeout=30) if code != 0: time.sleep(i) else: @@ -354,7 +359,7 @@ def get_stacktraces_from_gdb(server_pid): # We could attach gdb to clickhouse-server before running some tests # to print stacktraces of all crashes even if clickhouse cannot print it for some reason. # We should kill existing gdb if any before starting new one. - kill_gdb() + kill_gdb_if_any() cmd = f"gdb -batch -ex 'thread apply all backtrace' -p {server_pid}" return subprocess.check_output(cmd, shell=True).decode("utf-8") From c37b80593c0db79b064bf5e54a817ef90238a343 Mon Sep 17 00:00:00 2001 From: Val Doroshchuk Date: Thu, 8 Jun 2023 13:22:55 +0200 Subject: [PATCH 0653/1072] MaterializedMySQL: Add support of `TRUNCATE db.table` (#50624) Additional to `TRUNCATE TABLE db.table`. Co-authored-by: Alexander Tokmakov --- src/Parsers/MySQL/ASTDropQuery.cpp | 3 +- .../materialize_with_ddl.py | 40 +++++++++++++++++++ 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/src/Parsers/MySQL/ASTDropQuery.cpp b/src/Parsers/MySQL/ASTDropQuery.cpp index fb76d93363a..890451e3e55 100644 --- a/src/Parsers/MySQL/ASTDropQuery.cpp +++ b/src/Parsers/MySQL/ASTDropQuery.cpp @@ -44,8 +44,9 @@ bool ParserDropQuery::parseImpl(IParser::Pos & pos, ASTPtr & node, Expected & ex bool if_exists = false; bool is_truncate = false; - if (s_truncate.ignore(pos, expected) && s_table.ignore(pos, expected)) + if (s_truncate.ignore(pos, expected)) { + s_table.ignore(pos, expected); is_truncate = true; query->kind = ASTDropQuery::Kind::Table; ASTDropQuery::QualifiedName name; diff --git a/tests/integration/test_materialized_mysql_database/materialize_with_ddl.py b/tests/integration/test_materialized_mysql_database/materialize_with_ddl.py index 97e2de49ceb..2bbbe9a3f13 100644 --- a/tests/integration/test_materialized_mysql_database/materialize_with_ddl.py +++ b/tests/integration/test_materialized_mysql_database/materialize_with_ddl.py @@ -379,6 +379,46 @@ def drop_table_with_materialized_mysql_database( "", ) + mysql_node.query( + "CREATE TABLE test_database_drop.test_table_3 (id INT NOT NULL PRIMARY KEY) ENGINE = InnoDB" + ) + mysql_node.query("INSERT INTO test_database_drop.test_table_3 VALUES(1), (2)") + check_query( + clickhouse_node, + "SHOW TABLES FROM test_database_drop FORMAT TSV", + "test_table_2\ntest_table_3\n", + ) + check_query( + clickhouse_node, + "SELECT * FROM test_database_drop.test_table_3 ORDER BY id FORMAT TSV", + "1\n2\n", + ) + mysql_node.query("TRUNCATE test_database_drop.test_table_3") + check_query( + clickhouse_node, + "SELECT * FROM test_database_drop.test_table_3 ORDER BY id FORMAT TSV", + "", + ) + + mysql_node.query( + "CREATE TABLE test_database_drop.test_table_4 (id INT NOT NULL PRIMARY KEY) ENGINE = InnoDB" + ) + mysql_node.query("INSERT INTO test_database_drop.test_table_4 VALUES(1), (2)") + check_query( + clickhouse_node, + "SELECT * FROM test_database_drop.test_table_4 ORDER BY id FORMAT TSV", + "1\n2\n", + ) + with mysql_node.alloc_connection() as mysql: + mysql.query("USE test_database_drop") + mysql.query("TRUNCATE test_table_4") + + check_query( + clickhouse_node, + "SELECT * FROM test_database_drop.test_table_4 ORDER BY id FORMAT TSV", + "", + ) + clickhouse_node.query("DROP DATABASE test_database_drop") mysql_node.query("DROP DATABASE test_database_drop") From 6b49816079016e362110da78e4e8b660894e9b6a Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 8 Jun 2023 14:25:49 +0300 Subject: [PATCH 0654/1072] Update CHANGELOG.md Co-authored-by: Alexander Gololobov <440544+davenger@users.noreply.github.com> --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f2ffdad9a7c..959a7bad5dd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -210,7 +210,7 @@ * Fix assert in SpanHolder::finish() with fibers [#49673](https://github.com/ClickHouse/ClickHouse/pull/49673) ([Kruglov Pavel](https://github.com/Avogar)). * Fix short circuit functions and mutations with sparse arguments [#49716](https://github.com/ClickHouse/ClickHouse/pull/49716) ([Anton Popov](https://github.com/CurtizJ)). * Fix writing appended files to incremental backups [#49725](https://github.com/ClickHouse/ClickHouse/pull/49725) ([Vitaly Baranov](https://github.com/vitlibar)). -* Ignore LWD column in checkPartDynamicColumns [#49737](https://github.com/ClickHouse/ClickHouse/pull/49737) ([Alexander Gololobov](https://github.com/davenger)). +* Fix "There is no physical column _row_exists in table" error occurring during lightweight delete mutation on a table with Object column. [#49737](https://github.com/ClickHouse/ClickHouse/pull/49737) ([Alexander Gololobov](https://github.com/davenger)). * Fix msan issue in randomStringUTF8(uneven number) [#49750](https://github.com/ClickHouse/ClickHouse/pull/49750) ([Robert Schulze](https://github.com/rschu1ze)). * Fix aggregate function kolmogorovSmirnovTest [#49768](https://github.com/ClickHouse/ClickHouse/pull/49768) ([FFFFFFFHHHHHHH](https://github.com/FFFFFFFHHHHHHH)). * Fix settings aliases in native protocol [#49776](https://github.com/ClickHouse/ClickHouse/pull/49776) ([Azat Khuzhin](https://github.com/azat)). From c6b2ed234dcde2802020f730dc84101d7ad84f65 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 8 Jun 2023 13:27:13 +0200 Subject: [PATCH 0655/1072] Fix typos --- CHANGELOG.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f2ffdad9a7c..c674af5852a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,7 +16,7 @@ * The experimental feature "in-memory data parts" is removed. The data format is still supported, but the settings are no-op, and compact or wide parts will be used instead. This closes [#45409](https://github.com/ClickHouse/ClickHouse/issues/45409). [#49429](https://github.com/ClickHouse/ClickHouse/pull/49429) ([Alexey Milovidov](https://github.com/alexey-milovidov)). * Changed default values of settings `parallelize_output_from_storages` and `input_format_parquet_preserve_order`. This allows ClickHouse to reorder rows when reading from files (e.g. CSV or Parquet), greatly improving performance in many cases. To restore the old behavior of preserving order, use `parallelize_output_from_storages = 0`, `input_format_parquet_preserve_order = 1`. [#49479](https://github.com/ClickHouse/ClickHouse/pull/49479) ([Michael Kolupaev](https://github.com/al13n321)). * Make projections production-ready. Add the `optimize_use_projections` setting to control whether the projections will be selected for SELECT queries. The setting `allow_experimental_projection_optimization` is obsolete and does nothing. [#49719](https://github.com/ClickHouse/ClickHouse/pull/49719) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Mark `joinGet` as non deterministic (so as `dictGet`). It allows using them in mutations without an extra setting. [#49843](https://github.com/ClickHouse/ClickHouse/pull/49843) ([Azat Khuzhin](https://github.com/azat)). +* Mark `joinGet` as non-deterministic (so as `dictGet`). It allows using them in mutations without an extra setting. [#49843](https://github.com/ClickHouse/ClickHouse/pull/49843) ([Azat Khuzhin](https://github.com/azat)). * Revert the "`groupArray` returns cannot be nullable" change (due to binary compatibility breakage for `groupArray`/`groupArrayLast`/`groupArraySample` over `Nullable` types, which likely will lead to `TOO_LARGE_ARRAY_SIZE` or `CANNOT_READ_ALL_DATA`). [#49971](https://github.com/ClickHouse/ClickHouse/pull/49971) ([Azat Khuzhin](https://github.com/azat)). * Setting `enable_memory_bound_merging_of_aggregation_results` is enabled by default. If you update from version prior to 22.12, we recommend to set this flag to `false` until update is finished. [#50319](https://github.com/ClickHouse/ClickHouse/pull/50319) ([Nikita Taranov](https://github.com/nickitat)). @@ -68,7 +68,7 @@ * Improve performance of BLAKE3 by 11% by enabling LTO for Rust. [#49600](https://github.com/ClickHouse/ClickHouse/pull/49600) ([Azat Khuzhin](https://github.com/azat)). Now it is on par with C++. * Optimize the structure of the `system.opentelemetry_span_log`. Use `LowCardinality` where appropriate. Although this table is generally stupid (it is using the Map data type even for common attributes), it will be slightly better. [#49647](https://github.com/ClickHouse/ClickHouse/pull/49647) ([Alexey Milovidov](https://github.com/alexey-milovidov)). * Try to reserve hash table's size in `grace_hash` join. [#49816](https://github.com/ClickHouse/ClickHouse/pull/49816) ([lgbo](https://github.com/lgbo-ustc)). -* As is addresed in issue [#49748](https://github.com/ClickHouse/ClickHouse/issues/49748), the predicates with date converters, such as `toYear`, `toYYYYMM`, could be rewritten with the equivalent date (YYYY-MM-DD) comparisons at the AST level. And this transformation could bring performance improvement as it is free from the expensive date converter and the comparison between dates (or integers in the low level representation) is quite low-cost. The [prototype](https://github.com/ZhiguoZh/ClickHouse/commit/c7f1753f0c9363a19d95fa46f1cfed1d9f505ee0) shows that, with all identified date converters optimized, the overall QPS of the 13 queries is enhanced by **~11%** on the ICX server (Intel Xeon Platinum 8380 CPU, 80 cores, 160 threads). [#50062](https://github.com/ClickHouse/ClickHouse/pull/50062) [#50307](https://github.com/ClickHouse/ClickHouse/pull/50307) ([Zhiguo Zhou](https://github.com/ZhiguoZh)). +* As is addressed in issue [#49748](https://github.com/ClickHouse/ClickHouse/issues/49748), the predicates with date converters, such as `toYear`, `toYYYYMM`, could be rewritten with the equivalent date (YYYY-MM-DD) comparisons at the AST level. And this transformation could bring performance improvement as it is free from the expensive date converter and the comparison between dates (or integers in the low level representation) is quite low-cost. The [prototype](https://github.com/ZhiguoZh/ClickHouse/commit/c7f1753f0c9363a19d95fa46f1cfed1d9f505ee0) shows that, with all identified date converters optimized, the overall QPS of the 13 queries is enhanced by **~11%** on the ICX server (Intel Xeon Platinum 8380 CPU, 80 cores, 160 threads). [#50062](https://github.com/ClickHouse/ClickHouse/pull/50062) [#50307](https://github.com/ClickHouse/ClickHouse/pull/50307) ([Zhiguo Zhou](https://github.com/ZhiguoZh)). * Parallel merge of `uniqExactIf` states. Closes [#49885](https://github.com/ClickHouse/ClickHouse/issues/49885). [#50285](https://github.com/ClickHouse/ClickHouse/pull/50285) ([flynn](https://github.com/ucasfl)). * Keeper improvement: add `CheckNotExists` request to Keeper, which allows to improve the performance of Replicated tables. [#48897](https://github.com/ClickHouse/ClickHouse/pull/48897) ([Antonio Andelic](https://github.com/antonio2368)). * Keeper performance improvements: avoid serializing same request twice while processing. Cache deserialization results of large requests. Controlled by new coordination setting `min_request_size_for_cache`. [#49004](https://github.com/ClickHouse/ClickHouse/pull/49004) ([Antonio Andelic](https://github.com/antonio2368)). From 2aaaec2abebe4860120bd5db1cf6a1b38c28a12b Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Thu, 8 Jun 2023 11:51:57 +0200 Subject: [PATCH 0656/1072] Catch issues with dockerd during the build --- tests/ci/build_check.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/ci/build_check.py b/tests/ci/build_check.py index 4bc61c79fc0..35b98a7c3bb 100644 --- a/tests/ci/build_check.py +++ b/tests/ci/build_check.py @@ -343,6 +343,15 @@ def main(): f"sudo chown -R ubuntu:ubuntu {build_output_path}", shell=True ) logging.info("Build finished with %s, log path %s", success, log_path) + if not success: + # We check if docker works, because if it's down, it's infrastructure + try: + subprocess.check_call("docker info", shell=True) + except subprocess.CalledProcessError: + logging.error( + "The dockerd looks down, won't upload anything and generate report" + ) + sys.exit(1) # FIXME performance performance_urls = [] From 7079b4c885656fba788400002f012d3ff43e01de Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Thu, 8 Jun 2023 12:40:16 +0300 Subject: [PATCH 0657/1072] ReverseTransform small improvement --- src/Processors/Transforms/ReverseTransform.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Processors/Transforms/ReverseTransform.cpp b/src/Processors/Transforms/ReverseTransform.cpp index 98f2bf54aa5..66b774ab50d 100644 --- a/src/Processors/Transforms/ReverseTransform.cpp +++ b/src/Processors/Transforms/ReverseTransform.cpp @@ -6,11 +6,11 @@ namespace DB void ReverseTransform::transform(Chunk & chunk) { - IColumn::Permutation permutation; - size_t num_rows = chunk.getNumRows(); + IColumn::Permutation permutation(num_rows); + for (size_t i = 0; i < num_rows; ++i) - permutation.emplace_back(num_rows - 1 - i); + permutation[i] = num_rows - 1 - i; auto columns = chunk.detachColumns(); From 64783194138fe4f823da38deb749bef44e15bf68 Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Thu, 8 Jun 2023 09:24:11 -0300 Subject: [PATCH 0658/1072] Update CHANGELOG.md --- CHANGELOG.md | 1 - 1 file changed, 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a5b6dd85941..4cf1023cac2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -68,7 +68,6 @@ * Improve performance of BLAKE3 by 11% by enabling LTO for Rust. [#49600](https://github.com/ClickHouse/ClickHouse/pull/49600) ([Azat Khuzhin](https://github.com/azat)). Now it is on par with C++. * Optimize the structure of the `system.opentelemetry_span_log`. Use `LowCardinality` where appropriate. Although this table is generally stupid (it is using the Map data type even for common attributes), it will be slightly better. [#49647](https://github.com/ClickHouse/ClickHouse/pull/49647) ([Alexey Milovidov](https://github.com/alexey-milovidov)). * Try to reserve hash table's size in `grace_hash` join. [#49816](https://github.com/ClickHouse/ClickHouse/pull/49816) ([lgbo](https://github.com/lgbo-ustc)). -* As is addressed in issue [#49748](https://github.com/ClickHouse/ClickHouse/issues/49748), the predicates with date converters, such as `toYear`, `toYYYYMM`, could be rewritten with the equivalent date (YYYY-MM-DD) comparisons at the AST level. And this transformation could bring performance improvement as it is free from the expensive date converter and the comparison between dates (or integers in the low level representation) is quite low-cost. The [prototype](https://github.com/ZhiguoZh/ClickHouse/commit/c7f1753f0c9363a19d95fa46f1cfed1d9f505ee0) shows that, with all identified date converters optimized, the overall QPS of the 13 queries is enhanced by **~11%** on the ICX server (Intel Xeon Platinum 8380 CPU, 80 cores, 160 threads). [#50062](https://github.com/ClickHouse/ClickHouse/pull/50062) [#50307](https://github.com/ClickHouse/ClickHouse/pull/50307) ([Zhiguo Zhou](https://github.com/ZhiguoZh)). * Parallel merge of `uniqExactIf` states. Closes [#49885](https://github.com/ClickHouse/ClickHouse/issues/49885). [#50285](https://github.com/ClickHouse/ClickHouse/pull/50285) ([flynn](https://github.com/ucasfl)). * Keeper improvement: add `CheckNotExists` request to Keeper, which allows to improve the performance of Replicated tables. [#48897](https://github.com/ClickHouse/ClickHouse/pull/48897) ([Antonio Andelic](https://github.com/antonio2368)). * Keeper performance improvements: avoid serializing same request twice while processing. Cache deserialization results of large requests. Controlled by new coordination setting `min_request_size_for_cache`. [#49004](https://github.com/ClickHouse/ClickHouse/pull/49004) ([Antonio Andelic](https://github.com/antonio2368)). From 1cbcd2f2ef4032cfb718433befbd8742c1e4b9cf Mon Sep 17 00:00:00 2001 From: kssenii Date: Thu, 8 Jun 2023 13:39:05 +0200 Subject: [PATCH 0659/1072] Refactor reading from object storages --- src/Disks/IO/ReadBufferFromRemoteFSGather.cpp | 59 ++++++--- src/Disks/IO/ReadBufferFromRemoteFSGather.h | 10 +- src/Disks/IO/ReadBufferFromWebServer.h | 2 - .../IO/ReadIndirectBufferFromRemoteFS.cpp | 118 ------------------ src/Disks/IO/ReadIndirectBufferFromRemoteFS.h | 46 ------- .../AzureBlobStorage/AzureObjectStorage.cpp | 44 ++++--- .../AzureBlobStorage/AzureObjectStorage.h | 1 - .../ObjectStorages/HDFS/HDFSObjectStorage.cpp | 8 +- .../Local/LocalObjectStorage.cpp | 39 +++--- .../ObjectStorages/S3/S3ObjectStorage.cpp | 45 ++++--- .../ObjectStorages/Web/WebObjectStorage.cpp | 43 ++++--- src/IO/SeekAvoidingReadBuffer.cpp | 35 ------ src/IO/SeekAvoidingReadBuffer.h | 26 ---- src/Storages/StorageS3.cpp | 2 +- 14 files changed, 149 insertions(+), 329 deletions(-) delete mode 100644 src/Disks/IO/ReadIndirectBufferFromRemoteFS.cpp delete mode 100644 src/Disks/IO/ReadIndirectBufferFromRemoteFS.h delete mode 100644 src/IO/SeekAvoidingReadBuffer.cpp delete mode 100644 src/IO/SeekAvoidingReadBuffer.h diff --git a/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp b/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp index 04030fe5f8f..eb9c509e459 100644 --- a/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp +++ b/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp @@ -22,13 +22,15 @@ ReadBufferFromRemoteFSGather::ReadBufferFromRemoteFSGather( ReadBufferCreator && read_buffer_creator_, const StoredObjects & blobs_to_read_, const ReadSettings & settings_, - std::shared_ptr cache_log_) - : ReadBufferFromFileBase(0, nullptr, 0) + std::shared_ptr cache_log_, + bool use_external_buffer_) + : ReadBufferFromFileBase(use_external_buffer_ ? 0 : settings_.remote_fs_buffer_size, nullptr, 0) , settings(settings_) , blobs_to_read(blobs_to_read_) , read_buffer_creator(std::move(read_buffer_creator_)) , cache_log(settings.enable_filesystem_cache_log ? cache_log_ : nullptr) , query_id(CurrentThread::isInitialized() && CurrentThread::get().getQueryContext() != nullptr ? CurrentThread::getQueryId() : "") + , use_external_buffer(use_external_buffer_) , log(&Poco::Logger::get("ReadBufferFromRemoteFSGather")) { if (!blobs_to_read.empty()) @@ -36,7 +38,9 @@ ReadBufferFromRemoteFSGather::ReadBufferFromRemoteFSGather( with_cache = settings.remote_fs_cache && settings.enable_filesystem_cache - && (!query_id.empty() || settings.read_from_filesystem_cache_if_exists_otherwise_bypass_cache || !settings.avoid_readthrough_cache_outside_query_context); + && (!query_id.empty() + || settings.read_from_filesystem_cache_if_exists_otherwise_bypass_cache + || !settings.avoid_readthrough_cache_outside_query_context); } SeekableReadBufferPtr ReadBufferFromRemoteFSGather::createImplementationBuffer(const StoredObject & object) @@ -235,22 +239,49 @@ void ReadBufferFromRemoteFSGather::reset() off_t ReadBufferFromRemoteFSGather::seek(off_t offset, int whence) { - if (whence != SEEK_SET) - throw Exception(ErrorCodes::CANNOT_SEEK_THROUGH_FILE, "Only seeking with SEEK_SET is allowed"); + if (offset == getPosition() && whence == SEEK_SET) + return offset; + + if (whence != SEEK_SET) + throw Exception(ErrorCodes::CANNOT_SEEK_THROUGH_FILE, "Only SEEK_SET mode is allowed."); + + if (use_external_buffer) + { + /// In case use_external_buffer == true, the buffer manages seeks itself. + reset(); + } + else + { + if (!working_buffer.empty() + && static_cast(offset) >= file_offset_of_buffer_end - working_buffer.size() + && static_cast(offset) < file_offset_of_buffer_end) + { + pos = working_buffer.end() - (file_offset_of_buffer_end - offset); + assert(pos >= working_buffer.begin()); + assert(pos < working_buffer.end()); + + return getPosition(); + } + + off_t position = getPosition(); + if (current_buf && offset > position) + { + size_t diff = offset - position; + if (diff < settings.remote_read_min_bytes_for_seek) + { + ignore(diff); + return offset; + } + } + + resetWorkingBuffer(); + reset(); + } - reset(); file_offset_of_buffer_end = offset; return file_offset_of_buffer_end; } -size_t ReadBufferFromRemoteFSGather::getImplementationBufferOffset() const -{ - if (!current_buf) - return file_offset_of_buffer_end; - - return current_buf->getFileOffsetOfBufferEnd(); -} - ReadBufferFromRemoteFSGather::~ReadBufferFromRemoteFSGather() { if (!with_cache) diff --git a/src/Disks/IO/ReadBufferFromRemoteFSGather.h b/src/Disks/IO/ReadBufferFromRemoteFSGather.h index 39b81d6f9ac..cb98ac6d9f3 100644 --- a/src/Disks/IO/ReadBufferFromRemoteFSGather.h +++ b/src/Disks/IO/ReadBufferFromRemoteFSGather.h @@ -27,7 +27,8 @@ public: ReadBufferCreator && read_buffer_creator_, const StoredObjects & blobs_to_read_, const ReadSettings & settings_, - std::shared_ptr cache_log_); + std::shared_ptr cache_log_, + bool use_external_buffer_); ~ReadBufferFromRemoteFSGather() override; @@ -37,16 +38,14 @@ public: void setReadUntilPosition(size_t position) override; + void setReadUntilEnd() override { return setReadUntilPosition(getFileSize()); } + IAsynchronousReader::Result readInto(char * data, size_t size, size_t offset, size_t ignore) override; size_t getFileSize() override { return getTotalSize(blobs_to_read); } size_t getFileOffsetOfBufferEnd() const override { return file_offset_of_buffer_end; } - bool initialized() const { return current_buf != nullptr; } - - size_t getImplementationBufferOffset() const; - off_t seek(off_t offset, int whence) override; off_t getPosition() override { return file_offset_of_buffer_end - available() + bytes_to_ignore; } @@ -71,6 +70,7 @@ private: const ReadBufferCreator read_buffer_creator; const std::shared_ptr cache_log; const String query_id; + const bool use_external_buffer; bool with_cache; size_t read_until_position = 0; diff --git a/src/Disks/IO/ReadBufferFromWebServer.h b/src/Disks/IO/ReadBufferFromWebServer.h index dd9cf63224f..fa899cf2c5e 100644 --- a/src/Disks/IO/ReadBufferFromWebServer.h +++ b/src/Disks/IO/ReadBufferFromWebServer.h @@ -12,8 +12,6 @@ namespace DB /* Read buffer, which reads via http, but is used as ReadBufferFromFileBase. * Used to read files, hosted on a web server with static files. - * - * Usage: ReadIndirectBufferFromRemoteFS -> SeekAvoidingReadBuffer -> ReadBufferFromWebServer -> ReadWriteBufferFromHTTP. */ class ReadBufferFromWebServer : public ReadBufferFromFileBase { diff --git a/src/Disks/IO/ReadIndirectBufferFromRemoteFS.cpp b/src/Disks/IO/ReadIndirectBufferFromRemoteFS.cpp deleted file mode 100644 index a559b47f2cc..00000000000 --- a/src/Disks/IO/ReadIndirectBufferFromRemoteFS.cpp +++ /dev/null @@ -1,118 +0,0 @@ -#include "ReadIndirectBufferFromRemoteFS.h" - -#include - - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int CANNOT_SEEK_THROUGH_FILE; -} - - -ReadIndirectBufferFromRemoteFS::ReadIndirectBufferFromRemoteFS( - std::shared_ptr impl_, const ReadSettings & settings) - : ReadBufferFromFileBase(settings.remote_fs_buffer_size, nullptr, 0) - , impl(impl_) - , read_settings(settings) -{ -} - -size_t ReadIndirectBufferFromRemoteFS::getFileSize() -{ - return impl->getFileSize(); -} - -off_t ReadIndirectBufferFromRemoteFS::getPosition() -{ - return impl->file_offset_of_buffer_end - available(); -} - - -String ReadIndirectBufferFromRemoteFS::getFileName() const -{ - return impl->getFileName(); -} - - -void ReadIndirectBufferFromRemoteFS::setReadUntilPosition(size_t position) -{ - impl->setReadUntilPosition(position); -} - - -void ReadIndirectBufferFromRemoteFS::setReadUntilEnd() -{ - impl->setReadUntilPosition(impl->getFileSize()); -} - - -off_t ReadIndirectBufferFromRemoteFS::seek(off_t offset_, int whence) -{ - if (whence == SEEK_CUR) - { - /// If position within current working buffer - shift pos. - if (!working_buffer.empty() && size_t(getPosition() + offset_) < impl->file_offset_of_buffer_end) - { - pos += offset_; - return getPosition(); - } - else - { - impl->file_offset_of_buffer_end += offset_; - } - } - else if (whence == SEEK_SET) - { - /// If position within current working buffer - shift pos. - if (!working_buffer.empty() - && size_t(offset_) >= impl->file_offset_of_buffer_end - working_buffer.size() - && size_t(offset_) < impl->file_offset_of_buffer_end) - { - pos = working_buffer.end() - (impl->file_offset_of_buffer_end - offset_); - return getPosition(); - } - else - { - impl->file_offset_of_buffer_end = offset_; - } - } - else - throw Exception(ErrorCodes::CANNOT_SEEK_THROUGH_FILE, "Only SEEK_SET or SEEK_CUR modes are allowed."); - - impl->seek(impl->file_offset_of_buffer_end, SEEK_SET); - resetWorkingBuffer(); - - file_offset_of_buffer_end = impl->file_offset_of_buffer_end; - return impl->file_offset_of_buffer_end; -} - - -bool ReadIndirectBufferFromRemoteFS::nextImpl() -{ - chassert(internal_buffer.size() == read_settings.remote_fs_buffer_size); - chassert(file_offset_of_buffer_end <= impl->getFileSize()); - - auto [size, offset, _] = impl->readInto(internal_buffer.begin(), internal_buffer.size(), file_offset_of_buffer_end, /* ignore */0); - - chassert(offset <= size); - chassert(size <= internal_buffer.size()); - - size_t bytes_read = size - offset; - if (bytes_read) - working_buffer = Buffer(internal_buffer.begin() + offset, internal_buffer.begin() + size); - - file_offset_of_buffer_end = impl->getFileOffsetOfBufferEnd(); - - /// In case of multiple files for the same file in clickhouse (i.e. log family) - /// file_offset_of_buffer_end will not match getImplementationBufferOffset() - /// so we use [impl->getImplementationBufferOffset(), impl->getFileSize()] - chassert(file_offset_of_buffer_end >= impl->getImplementationBufferOffset()); - chassert(file_offset_of_buffer_end <= impl->getFileSize()); - - return bytes_read; -} - -} diff --git a/src/Disks/IO/ReadIndirectBufferFromRemoteFS.h b/src/Disks/IO/ReadIndirectBufferFromRemoteFS.h deleted file mode 100644 index 19647b1fa39..00000000000 --- a/src/Disks/IO/ReadIndirectBufferFromRemoteFS.h +++ /dev/null @@ -1,46 +0,0 @@ -#pragma once - -#include "config.h" -#include -#include -#include - - -namespace DB -{ - -class ReadBufferFromRemoteFSGather; - -/** -* Reads data from S3/HDFS/Web using stored paths in metadata. -* There is asynchronous version of this class -- AsynchronousReadIndirectBufferFromRemoteFS. -*/ -class ReadIndirectBufferFromRemoteFS : public ReadBufferFromFileBase -{ - -public: - explicit ReadIndirectBufferFromRemoteFS(std::shared_ptr impl_, const ReadSettings & settings); - - off_t seek(off_t offset_, int whence) override; - - off_t getPosition() override; - - String getFileName() const override; - - void setReadUntilPosition(size_t position) override; - - void setReadUntilEnd() override; - - size_t getFileSize() override; - -private: - bool nextImpl() override; - - std::shared_ptr impl; - - ReadSettings read_settings; - - size_t file_offset_of_buffer_end = 0; -}; - -} diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp index 23a0da39dd3..3636c5780fb 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp @@ -6,7 +6,6 @@ #include #include #include -#include #include #include @@ -218,24 +217,33 @@ std::unique_ptr AzureObjectStorage::readObjects( /// NOL read_until_position); }; - auto reader_impl = std::make_unique( - std::move(read_buffer_creator), - objects, - disk_read_settings, - global_context->getFilesystemCacheLog()); + switch (read_settings.remote_fs_method) + { + case RemoteFSReadMethod::read: + { + return std::make_unique( + std::move(read_buffer_creator), + objects, + disk_read_settings, + global_context->getFilesystemCacheLog(), + /* use_external_buffer */false); - if (disk_read_settings.remote_fs_method == RemoteFSReadMethod::threadpool) - { - auto & reader = global_context->getThreadPoolReader(FilesystemReaderType::ASYNCHRONOUS_REMOTE_FS_READER); - return std::make_unique( - std::move(reader_impl), reader, disk_read_settings, - global_context->getAsyncReadCounters(), - global_context->getFilesystemReadPrefetchesLog()); - } - else - { - auto buf = std::make_unique(std::move(reader_impl), disk_read_settings); - return std::make_unique(std::move(buf), settings_ptr->min_bytes_for_seek); + } + case RemoteFSReadMethod::threadpool: + { + auto impl = std::make_unique( + std::move(read_buffer_creator), + objects, + disk_read_settings, + global_context->getFilesystemCacheLog(), + /* use_external_buffer */true); + + auto & reader = global_context->getThreadPoolReader(FilesystemReaderType::ASYNCHRONOUS_REMOTE_FS_READER); + return std::make_unique( + std::move(impl), reader, disk_read_settings, + global_context->getAsyncReadCounters(), + global_context->getFilesystemReadPrefetchesLog()); + } } } diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h index 5b08ceb80e3..2fbd1514abd 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h @@ -5,7 +5,6 @@ #include #include -#include #include #include diff --git a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp index a3092bc6f12..60230ce2fb0 100644 --- a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp +++ b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp @@ -1,13 +1,10 @@ #include -#include #include - #include #include #include -#include #include #include @@ -72,9 +69,8 @@ std::unique_ptr HDFSObjectStorage::readObjects( /// NOLI hdfs_uri, hdfs_path, config, disk_read_settings, /* read_until_position */0, /* use_external_buffer */true); }; - auto hdfs_impl = std::make_unique(std::move(read_buffer_creator), objects, disk_read_settings, nullptr); - auto buf = std::make_unique(std::move(hdfs_impl), read_settings); - return std::make_unique(std::move(buf), settings->min_bytes_for_seek); + return std::make_unique( + std::move(read_buffer_creator), objects, disk_read_settings, nullptr, /* use_external_buffer */false); } std::unique_ptr HDFSObjectStorage::writeObject( /// NOLINT diff --git a/src/Disks/ObjectStorages/Local/LocalObjectStorage.cpp b/src/Disks/ObjectStorages/Local/LocalObjectStorage.cpp index 05c0c8f3961..69ccf309096 100644 --- a/src/Disks/ObjectStorages/Local/LocalObjectStorage.cpp +++ b/src/Disks/ObjectStorages/Local/LocalObjectStorage.cpp @@ -4,11 +4,9 @@ #include #include #include -#include #include #include #include -#include #include #include #include @@ -59,25 +57,26 @@ std::unique_ptr LocalObjectStorage::readObjects( /// NOL return createReadBufferFromFileBase(file_path, modified_settings, read_hint, file_size); }; - auto impl = std::make_unique( - std::move(read_buffer_creator), objects, modified_settings, - global_context->getFilesystemCacheLog()); + switch (read_settings.remote_fs_method) + { + case RemoteFSReadMethod::read: + { + return std::make_unique( + std::move(read_buffer_creator), objects, modified_settings, + global_context->getFilesystemCacheLog(), /* use_external_buffer */false); + } + case RemoteFSReadMethod::threadpool: + { + auto impl = std::make_unique( + std::move(read_buffer_creator), objects, modified_settings, + global_context->getFilesystemCacheLog(), /* use_external_buffer */true); - /// We use `remove_fs_method` (not `local_fs_method`) because we are about to use - /// AsynchronousBoundedReadBuffer which works by the remote_fs_* settings. - if (modified_settings.remote_fs_method == RemoteFSReadMethod::threadpool) - { - auto & reader = global_context->getThreadPoolReader(FilesystemReaderType::ASYNCHRONOUS_REMOTE_FS_READER); - return std::make_unique( - std::move(impl), reader, modified_settings, - global_context->getAsyncReadCounters(), - global_context->getFilesystemReadPrefetchesLog()); - } - else - { - auto buf = std::make_unique(std::move(impl), modified_settings); - return std::make_unique( - std::move(buf), modified_settings.remote_read_min_bytes_for_seek); + auto & reader = global_context->getThreadPoolReader(FilesystemReaderType::ASYNCHRONOUS_REMOTE_FS_READER); + return std::make_unique( + std::move(impl), reader, read_settings, + global_context->getAsyncReadCounters(), + global_context->getFilesystemReadPrefetchesLog()); + } } } diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp index d19be20f920..e48924326e1 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp @@ -8,11 +8,9 @@ #include #include #include -#include #include #include #include -#include #include #include #include @@ -182,24 +180,33 @@ std::unique_ptr S3ObjectStorage::readObjects( /// NOLINT /* restricted_seek */true); }; - auto s3_impl = std::make_unique( - std::move(read_buffer_creator), - objects, - disk_read_settings, - global_context->getFilesystemCacheLog()); + switch (read_settings.remote_fs_method) + { + case RemoteFSReadMethod::read: + { + return std::make_unique( + std::move(read_buffer_creator), + objects, + disk_read_settings, + global_context->getFilesystemCacheLog(), + /* use_external_buffer */false); - if (read_settings.remote_fs_method == RemoteFSReadMethod::threadpool) - { - auto & reader = global_context->getThreadPoolReader(FilesystemReaderType::ASYNCHRONOUS_REMOTE_FS_READER); - return std::make_unique( - std::move(s3_impl), reader, disk_read_settings, - global_context->getAsyncReadCounters(), - global_context->getFilesystemReadPrefetchesLog()); - } - else - { - auto buf = std::make_unique(std::move(s3_impl), disk_read_settings); - return std::make_unique(std::move(buf), settings_ptr->min_bytes_for_seek); + } + case RemoteFSReadMethod::threadpool: + { + auto impl = std::make_unique( + std::move(read_buffer_creator), + objects, + disk_read_settings, + global_context->getFilesystemCacheLog(), + /* use_external_buffer */true); + + auto & reader = global_context->getThreadPoolReader(FilesystemReaderType::ASYNCHRONOUS_REMOTE_FS_READER); + return std::make_unique( + std::move(impl), reader, disk_read_settings, + global_context->getAsyncReadCounters(), + global_context->getFilesystemReadPrefetchesLog()); + } } } diff --git a/src/Disks/ObjectStorages/Web/WebObjectStorage.cpp b/src/Disks/ObjectStorages/Web/WebObjectStorage.cpp index 4f34f3eed9c..690a0d3372c 100644 --- a/src/Disks/ObjectStorages/Web/WebObjectStorage.cpp +++ b/src/Disks/ObjectStorages/Web/WebObjectStorage.cpp @@ -4,11 +4,9 @@ #include #include -#include #include #include -#include #include #include #include @@ -181,24 +179,33 @@ std::unique_ptr WebObjectStorage::readObject( /// NOLINT }; auto global_context = Context::getGlobalContextInstance(); - auto web_impl = std::make_unique( - std::move(read_buffer_creator), - StoredObjects{object}, - read_settings, - global_context->getFilesystemCacheLog()); - if (read_settings.remote_fs_method == RemoteFSReadMethod::threadpool) + switch (read_settings.remote_fs_method) { - auto & reader = global_context->getThreadPoolReader(FilesystemReaderType::ASYNCHRONOUS_REMOTE_FS_READER); - return std::make_unique( - std::move(web_impl), reader, read_settings, - global_context->getAsyncReadCounters(), - global_context->getFilesystemReadPrefetchesLog()); - } - else - { - auto buf = std::make_unique(std::move(web_impl), read_settings); - return std::make_unique(std::move(buf), min_bytes_for_seek); + case RemoteFSReadMethod::read: + { + return std::make_unique( + std::move(read_buffer_creator), + StoredObjects{object}, + read_settings, + global_context->getFilesystemCacheLog(), + /* use_external_buffer */false); + } + case RemoteFSReadMethod::threadpool: + { + auto impl = std::make_unique( + std::move(read_buffer_creator), + StoredObjects{object}, + read_settings, + global_context->getFilesystemCacheLog(), + /* use_external_buffer */true); + + auto & reader = global_context->getThreadPoolReader(FilesystemReaderType::ASYNCHRONOUS_REMOTE_FS_READER); + return std::make_unique( + std::move(impl), reader, read_settings, + global_context->getAsyncReadCounters(), + global_context->getFilesystemReadPrefetchesLog()); + } } } diff --git a/src/IO/SeekAvoidingReadBuffer.cpp b/src/IO/SeekAvoidingReadBuffer.cpp deleted file mode 100644 index 4d6406d8ddf..00000000000 --- a/src/IO/SeekAvoidingReadBuffer.cpp +++ /dev/null @@ -1,35 +0,0 @@ -#include - - -namespace DB -{ - -SeekAvoidingReadBuffer::SeekAvoidingReadBuffer(std::unique_ptr impl_, UInt64 min_bytes_for_seek_) - : ReadBufferFromFileDecorator(std::move(impl_)) - , min_bytes_for_seek(min_bytes_for_seek_) -{ -} - - -off_t SeekAvoidingReadBuffer::seek(off_t off, int whence) -{ - off_t position = getPosition(); - - if (whence == SEEK_CUR) - { - off += position; - whence = SEEK_SET; - } - - if (whence == SEEK_SET && off >= position && off < position + static_cast(min_bytes_for_seek)) - { - swap(*impl); - impl->ignore(off - position); - swap(*impl); - return off; - } - - return ReadBufferFromFileDecorator::seek(off, whence); -} - -} diff --git a/src/IO/SeekAvoidingReadBuffer.h b/src/IO/SeekAvoidingReadBuffer.h deleted file mode 100644 index 716d7c5046c..00000000000 --- a/src/IO/SeekAvoidingReadBuffer.h +++ /dev/null @@ -1,26 +0,0 @@ -#pragma once - -#include - - -namespace DB -{ - -/// `SeekAvoidingReadBuffer` prefers sequential reads over seeks within specified window. -/// It is useful in network and spinning disk storage media when seek is relatively expensive -/// operation. -/// See also: `merge_tree_min_rows_for_seek`. -class SeekAvoidingReadBuffer : public ReadBufferFromFileDecorator -{ -public: - SeekAvoidingReadBuffer(std::unique_ptr impl_, UInt64 min_bytes_for_seek_); - - off_t seek(off_t off, int whence) override; - - void prefetch(Priority priority) override { impl->prefetch(priority); } - -private: - UInt64 min_bytes_for_seek; /// Minimum positive seek offset which shall be executed using seek operation. -}; - -} diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index c3ed0f1af16..8bab596901c 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -657,7 +657,7 @@ std::unique_ptr StorageS3Source::createAsyncS3ReadBuffer( std::move(read_buffer_creator), StoredObjects{StoredObject{key, object_size}}, read_settings, - /* cache_log */nullptr); + /* cache_log */nullptr, /* use_external_buffer */true); auto modified_settings{read_settings}; /// FIXME: Changing this setting to default value breaks something around parquet reading From 48e1b21aabd8ef642c1bcab7ba863f8c61123723 Mon Sep 17 00:00:00 2001 From: kevinyhzou Date: Thu, 8 Jun 2023 20:34:30 +0800 Subject: [PATCH 0660/1072] Add feature to support read csv by space & tab delimiter --- src/Core/Settings.h | 1 + src/Formats/FormatFactory.cpp | 1 + src/Formats/FormatSettings.h | 1 + .../Formats/Impl/CSVRowInputFormat.cpp | 36 ++++++++++--------- ...h_whitespace_tab_field_delimiter.reference | 2 ++ ...ext_with_whitespace_tab_field_delimiter.sh | 18 ++++++++++ .../data_csv/csv_with_space_delimiter.csv | 1 + .../data_csv/csv_with_tab_delimiter.csv | 1 + 8 files changed, 45 insertions(+), 16 deletions(-) create mode 100644 tests/queries/0_stateless/02785_text_with_whitespace_tab_field_delimiter.reference create mode 100755 tests/queries/0_stateless/02785_text_with_whitespace_tab_field_delimiter.sh create mode 100644 tests/queries/0_stateless/data_csv/csv_with_space_delimiter.csv create mode 100644 tests/queries/0_stateless/data_csv/csv_with_tab_delimiter.csv diff --git a/src/Core/Settings.h b/src/Core/Settings.h index a87e321bed2..45641e76689 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -850,6 +850,7 @@ class IColumn; M(Bool, input_format_csv_use_best_effort_in_schema_inference, true, "Use some tweaks and heuristics to infer schema in CSV format", 0) \ M(Bool, input_format_tsv_use_best_effort_in_schema_inference, true, "Use some tweaks and heuristics to infer schema in TSV format", 0) \ M(Bool, input_format_csv_detect_header, true, "Automatically detect header with names and types in CSV format", 0) \ + M(Bool, input_format_csv_skip_whitespaces_tabs, true, "Skips spaces and tabs(\\t) characters in the CSV strings", 0) \ M(Bool, input_format_csv_trim_whitespaces, true, "Trims spaces and tabs (\\t) characters at the beginning and end in CSV strings", 0) \ M(Bool, input_format_tsv_detect_header, true, "Automatically detect header with names and types in TSV format", 0) \ M(Bool, input_format_custom_detect_header, true, "Automatically detect header with names and types in CustomSeparated format", 0) \ diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index 39b28e025a6..73a7d4f73f2 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -70,6 +70,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.csv.skip_first_lines = settings.input_format_csv_skip_first_lines; format_settings.csv.try_detect_header = settings.input_format_csv_detect_header; format_settings.csv.trim_whitespaces = settings.input_format_csv_trim_whitespaces; + format_settings.csv.skip_whitespaces_tabs = settings.input_format_csv_skip_whitespaces_tabs; format_settings.hive_text.fields_delimiter = settings.input_format_hive_text_fields_delimiter; format_settings.hive_text.collection_items_delimiter = settings.input_format_hive_text_collection_items_delimiter; format_settings.hive_text.map_keys_delimiter = settings.input_format_hive_text_map_keys_delimiter; diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index 6b4caffbf43..434389e31a1 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -137,6 +137,7 @@ struct FormatSettings String custom_delimiter; bool try_detect_header = true; bool trim_whitespaces = true; + bool skip_whitespaces_tabs = true; } csv; struct HiveText diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp index 8b4dbbffe1d..4094285e1ad 100644 --- a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp @@ -25,10 +25,10 @@ namespace ErrorCodes namespace { - void checkBadDelimiter(char delimiter) + void checkBadDelimiter(char delimiter, bool skip_whitespaces_tabs) { constexpr std::string_view bad_delimiters = " \t\"'.UL"; - if (bad_delimiters.find(delimiter) != std::string_view::npos) + if (bad_delimiters.find(delimiter) != std::string_view::npos && skip_whitespaces_tabs) throw Exception( ErrorCodes::BAD_ARGUMENTS, "CSV format may not work correctly with delimiter '{}'. Try use CustomSeparated format instead", @@ -68,7 +68,7 @@ CSVRowInputFormat::CSVRowInputFormat( format_settings_.csv.try_detect_header), buf(std::move(in_)) { - checkBadDelimiter(format_settings_.csv.delimiter); + checkBadDelimiter(format_settings_.csv.delimiter, format_settings_.csv.skip_whitespaces_tabs); } CSVRowInputFormat::CSVRowInputFormat( @@ -90,7 +90,7 @@ CSVRowInputFormat::CSVRowInputFormat( format_settings_.csv.try_detect_header), buf(std::move(in_)) { - checkBadDelimiter(format_settings_.csv.delimiter); + checkBadDelimiter(format_settings_.csv.delimiter, format_settings_.csv.skip_whitespaces_tabs); } void CSVRowInputFormat::syncAfterError() @@ -134,8 +134,12 @@ static void skipEndOfLine(ReadBuffer & in) } /// Skip `whitespace` symbols allowed in CSV. -static inline void skipWhitespacesAndTabs(ReadBuffer & in) +static inline void skipWhitespacesAndTabs(ReadBuffer & in, const bool & skip_whitespaces_tabs) { + if (!skip_whitespaces_tabs) + { + return; + } while (!in.eof() && (*in.position() == ' ' || *in.position() == '\t')) ++in.position(); } @@ -146,7 +150,7 @@ CSVFormatReader::CSVFormatReader(PeekableReadBuffer & buf_, const FormatSettings void CSVFormatReader::skipFieldDelimiter() { - skipWhitespacesAndTabs(*buf); + skipWhitespacesAndTabs(*buf, format_settings.csv.skip_whitespaces_tabs); assertChar(format_settings.csv.delimiter, *buf); } @@ -154,7 +158,7 @@ template String CSVFormatReader::readCSVFieldIntoString() { if (format_settings.csv.trim_whitespaces) [[likely]] - skipWhitespacesAndTabs(*buf); + skipWhitespacesAndTabs(*buf, format_settings.csv.skip_whitespaces_tabs); String field; if constexpr (read_string) @@ -166,14 +170,14 @@ String CSVFormatReader::readCSVFieldIntoString() void CSVFormatReader::skipField() { - skipWhitespacesAndTabs(*buf); + skipWhitespacesAndTabs(*buf, format_settings.csv.skip_whitespaces_tabs); NullOutput out; readCSVStringInto(out, *buf, format_settings.csv); } void CSVFormatReader::skipRowEndDelimiter() { - skipWhitespacesAndTabs(*buf); + skipWhitespacesAndTabs(*buf, format_settings.csv.skip_whitespaces_tabs); if (buf->eof()) return; @@ -182,7 +186,7 @@ void CSVFormatReader::skipRowEndDelimiter() if (*buf->position() == format_settings.csv.delimiter) ++buf->position(); - skipWhitespacesAndTabs(*buf); + skipWhitespacesAndTabs(*buf, format_settings.csv.skip_whitespaces_tabs); if (buf->eof()) return; @@ -194,7 +198,7 @@ void CSVFormatReader::skipHeaderRow() do { skipField(); - skipWhitespacesAndTabs(*buf); + skipWhitespacesAndTabs(*buf, format_settings.csv.skip_whitespaces_tabs); } while (checkChar(format_settings.csv.delimiter, *buf)); skipRowEndDelimiter(); @@ -207,7 +211,7 @@ std::vector CSVFormatReader::readRowImpl() do { fields.push_back(readCSVFieldIntoString()); - skipWhitespacesAndTabs(*buf); + skipWhitespacesAndTabs(*buf, format_settings.csv.skip_whitespaces_tabs); } while (checkChar(format_settings.csv.delimiter, *buf)); skipRowEndDelimiter(); @@ -220,7 +224,7 @@ bool CSVFormatReader::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) try { - skipWhitespacesAndTabs(*buf); + skipWhitespacesAndTabs(*buf, format_settings.csv.skip_whitespaces_tabs); assertChar(delimiter, *buf); } catch (const DB::Exception &) @@ -246,7 +250,7 @@ bool CSVFormatReader::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) bool CSVFormatReader::parseRowEndWithDiagnosticInfo(WriteBuffer & out) { - skipWhitespacesAndTabs(*buf); + skipWhitespacesAndTabs(*buf, format_settings.csv.skip_whitespaces_tabs); if (buf->eof()) return true; @@ -255,7 +259,7 @@ bool CSVFormatReader::parseRowEndWithDiagnosticInfo(WriteBuffer & out) if (*buf->position() == format_settings.csv.delimiter) { ++buf->position(); - skipWhitespacesAndTabs(*buf); + skipWhitespacesAndTabs(*buf, format_settings.csv.skip_whitespaces_tabs); if (buf->eof()) return true; } @@ -283,7 +287,7 @@ bool CSVFormatReader::readField( const String & /*column_name*/) { if (format_settings.csv.trim_whitespaces || !isStringOrFixedString(removeNullable(type))) [[likely]] - skipWhitespacesAndTabs(*buf); + skipWhitespacesAndTabs(*buf, format_settings.csv.skip_whitespaces_tabs); const bool at_delimiter = !buf->eof() && *buf->position() == format_settings.csv.delimiter; const bool at_last_column_line_end = is_last_file_column && (buf->eof() || *buf->position() == '\n' || *buf->position() == '\r'); diff --git a/tests/queries/0_stateless/02785_text_with_whitespace_tab_field_delimiter.reference b/tests/queries/0_stateless/02785_text_with_whitespace_tab_field_delimiter.reference new file mode 100644 index 00000000000..531391394a7 --- /dev/null +++ b/tests/queries/0_stateless/02785_text_with_whitespace_tab_field_delimiter.reference @@ -0,0 +1,2 @@ +1 a b +2 c d diff --git a/tests/queries/0_stateless/02785_text_with_whitespace_tab_field_delimiter.sh b/tests/queries/0_stateless/02785_text_with_whitespace_tab_field_delimiter.sh new file mode 100755 index 00000000000..19d343c352f --- /dev/null +++ b/tests/queries/0_stateless/02785_text_with_whitespace_tab_field_delimiter.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + +# NOTE: this sh wrapper is required because of shell_config + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +$CLICKHOUSE_CLIENT -q "drop table if exists test_whitespace" +$CLICKHOUSE_CLIENT -q "drop table if exists test_tab" +$CLICKHOUSE_CLIENT -q "create table test_whitespace (x UInt32, y String, z String) engine=MergeTree order by x" +$CLICKHOUSE_CLIENT -q "create table test_tab (x UInt32, y String, z String) engine=MergeTree order by x" +$CUR_DIR/data_csv/csv_with_space_delimiter.csv | ${CLICKHOUSE_CLIENT} -q "INSERT INTO test_whitespace SETTINGS format_csv_delimiter=' ', input_format_csv_skip_whitespaces_tabs=false FORMAT CSV" +$CUR_DIR/data_csv/csv_with_tab_delimiter.csv | ${CLICKHOUSE_CLIENT} -q "INSERT INTO test_tab SETTINGS format_csv_delimiter='\t', input_format_csv_skip_whitespaces_tabs=false FORMAT CSV" +$CLICKHOUSE_CLIENT -q "select * from test_whitespace" +$CLICKHOUSE_CLIENT -q "select * from test_tab" +$CLICKHOUSE_CLIENT -q "drop table test_whitespace" +$CLICKHOUSE_CLIENT -q "drop table test_tab"; diff --git a/tests/queries/0_stateless/data_csv/csv_with_space_delimiter.csv b/tests/queries/0_stateless/data_csv/csv_with_space_delimiter.csv new file mode 100644 index 00000000000..967f8ae450e --- /dev/null +++ b/tests/queries/0_stateless/data_csv/csv_with_space_delimiter.csv @@ -0,0 +1 @@ +1 a b diff --git a/tests/queries/0_stateless/data_csv/csv_with_tab_delimiter.csv b/tests/queries/0_stateless/data_csv/csv_with_tab_delimiter.csv new file mode 100644 index 00000000000..f3b63950ea8 --- /dev/null +++ b/tests/queries/0_stateless/data_csv/csv_with_tab_delimiter.csv @@ -0,0 +1 @@ +2 c d From d35573a2063f2c6f3091001c238f8ba7ac96a44c Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Thu, 8 Jun 2023 14:40:58 +0200 Subject: [PATCH 0661/1072] Removed logs and small name fixes --- src/Common/ProfileEvents.cpp | 4 ++-- src/Core/Settings.h | 2 +- src/Storages/StorageS3.cpp | 3 --- src/TableFunctions/TableFunctionAzureBlobStorage.cpp | 2 -- src/TableFunctions/TableFunctionS3.cpp | 1 - 5 files changed, 3 insertions(+), 9 deletions(-) diff --git a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp index 3cc41c1972d..f66f7bc6465 100644 --- a/src/Common/ProfileEvents.cpp +++ b/src/Common/ProfileEvents.cpp @@ -348,8 +348,8 @@ The server successfully detected this situation and will download merged part fr M(S3PutObject, "Number of S3 API PutObject calls.") \ M(S3GetObject, "Number of S3 API GetObject calls.") \ \ - M(AzureDeleteObjects, "Number of S3 API DeleteObject(s) calls.") \ - M(AzureListObjects, "Number of S3 API ListObjects calls.") \ + M(AzureDeleteObjects, "Number of Azure blob storage API DeleteObject(s) calls.") \ + M(AzureListObjects, "Number of Azure blob storage API ListObjects calls.") \ \ M(DiskS3DeleteObjects, "Number of DiskS3 API DeleteObject(s) calls.") \ M(DiskS3CopyObject, "Number of DiskS3 API CopyObject calls.") \ diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 6abc2e1f697..25ad58e22fa 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -83,7 +83,7 @@ class IColumn; M(UInt64, s3_max_single_part_upload_size, 32*1024*1024, "The maximum size of object to upload using singlepart upload to S3.", 0) \ M(UInt64, azure_max_single_part_upload_size, 100*1024*1024, "The maximum size of object to upload using singlepart upload to Azure blob storage.", 0) \ M(UInt64, s3_max_single_read_retries, 4, "The maximum number of retries during single S3 read.", 0) \ - M(UInt64, azure_max_single_read_retries, 4, "The maximum number of retries during single S3 read.", 0) \ + M(UInt64, azure_max_single_read_retries, 4, "The maximum number of retries during single Azure blob storage read.", 0) \ M(UInt64, s3_max_unexpected_write_error_retries, 4, "The maximum number of retries in case of unexpected errors during S3 write.", 0) \ M(UInt64, s3_max_redirects, 10, "Max number of S3 redirects hops allowed.", 0) \ M(UInt64, s3_max_connections, 1024, "The maximum number of connections per server.", 0) \ diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index 1c9efc31898..f1a7bcb71a2 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -957,9 +957,6 @@ StorageS3::StorageS3( {"_file", std::make_shared(std::make_shared())}}; auto columns = storage_metadata.getSampleBlock().getNamesAndTypesList(); - - LOG_INFO(&Poco::Logger::get("StorageS3"), "constructor columns = {}", columns.toString()); - virtual_columns = getVirtualsForStorage(columns, default_virtuals); for (const auto & column : virtual_columns) virtual_block.insert({column.type->createColumn(), column.type, column.name}); diff --git a/src/TableFunctions/TableFunctionAzureBlobStorage.cpp b/src/TableFunctions/TableFunctionAzureBlobStorage.cpp index 265092ddefa..38d9362894a 100644 --- a/src/TableFunctions/TableFunctionAzureBlobStorage.cpp +++ b/src/TableFunctions/TableFunctionAzureBlobStorage.cpp @@ -2,7 +2,6 @@ #if USE_AZURE_BLOB_STORAGE -//#include #include #include #include @@ -15,7 +14,6 @@ #include #include #include -#include #include #include #include diff --git a/src/TableFunctions/TableFunctionS3.cpp b/src/TableFunctions/TableFunctionS3.cpp index e63f32b1cbc..c8cc0cddd30 100644 --- a/src/TableFunctions/TableFunctionS3.cpp +++ b/src/TableFunctions/TableFunctionS3.cpp @@ -319,7 +319,6 @@ StoragePtr TableFunctionS3::executeImpl(const ASTPtr & /*ast_function*/, Context else if (!structure_hint.empty()) columns = structure_hint; - StoragePtr storage = std::make_shared( configuration, context, From 6fcc4158d2ed912e0dcf34ca6e8f00060ec47dc0 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 8 Jun 2023 12:46:49 +0000 Subject: [PATCH 0662/1072] Add SHOW INDICES as alias for statement SHOW INDEX/INDEXES/KEYS --- docs/en/sql-reference/statements/show.md | 2 +- src/Parsers/ParserShowIndexesQuery.cpp | 2 +- tests/queries/0_stateless/02724_show_indexes.reference | 5 +++++ tests/queries/0_stateless/02724_show_indexes.sql | 1 + 4 files changed, 8 insertions(+), 2 deletions(-) diff --git a/docs/en/sql-reference/statements/show.md b/docs/en/sql-reference/statements/show.md index 21c0010498a..f96eb55aa45 100644 --- a/docs/en/sql-reference/statements/show.md +++ b/docs/en/sql-reference/statements/show.md @@ -273,7 +273,7 @@ SHOW DICTIONARIES FROM db LIKE '%reg%' LIMIT 2 Displays a list of primary and data skipping indexes of a table. ```sql -SHOW [EXTENDED] {INDEX | INDEXES | KEYS } {FROM | IN} [{FROM | IN} ] [WHERE ] [INTO OUTFILE ] [FORMAT ] +SHOW [EXTENDED] {INDEX | INDEXES | INDICES | KEYS } {FROM | IN}
[{FROM | IN} ] [WHERE ] [INTO OUTFILE ] [FORMAT ] ``` The database and table name can be specified in abbreviated form as `.
`, i.e. `FROM tab FROM db` and `FROM db.tab` are diff --git a/src/Parsers/ParserShowIndexesQuery.cpp b/src/Parsers/ParserShowIndexesQuery.cpp index fe08b463069..7852ed9806d 100644 --- a/src/Parsers/ParserShowIndexesQuery.cpp +++ b/src/Parsers/ParserShowIndexesQuery.cpp @@ -28,7 +28,7 @@ bool ParserShowIndexesQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expe if (ParserKeyword("EXTENDED").ignore(pos, expected)) query->extended = true; - if (!(ParserKeyword("INDEX").ignore(pos, expected) || ParserKeyword("INDEXES").ignore(pos, expected) || ParserKeyword("KEYS").ignore(pos, expected))) + if (!(ParserKeyword("INDEX").ignore(pos, expected) || ParserKeyword("INDEXES").ignore(pos, expected) || ParserKeyword("INDICES").ignore(pos, expected) || ParserKeyword("KEYS").ignore(pos, expected))) return false; if (ParserKeyword("FROM").ignore(pos, expected) || ParserKeyword("IN").ignore(pos, expected)) diff --git a/tests/queries/0_stateless/02724_show_indexes.reference b/tests/queries/0_stateless/02724_show_indexes.reference index 20af3954fa5..8365ade3231 100644 --- a/tests/queries/0_stateless/02724_show_indexes.reference +++ b/tests/queries/0_stateless/02724_show_indexes.reference @@ -14,6 +14,11 @@ tbl 0 mm1_idx \N \N \N \N \N \N \N minmax \N \N YES a, c, d tbl 0 mm2_idx \N \N \N \N \N \N \N minmax \N \N YES c, d, e tbl 0 PRIMARY \N \N A \N \N \N \N primary \N \N YES c, a tbl 0 set_idx \N \N \N \N \N \N \N set \N \N YES e +tbl 0 blf_idx \N \N \N \N \N \N \N bloom_filter \N \N YES d, b +tbl 0 mm1_idx \N \N \N \N \N \N \N minmax \N \N YES a, c, d +tbl 0 mm2_idx \N \N \N \N \N \N \N minmax \N \N YES c, d, e +tbl 0 PRIMARY \N \N A \N \N \N \N primary \N \N YES c, a +tbl 0 set_idx \N \N \N \N \N \N \N set \N \N YES e --- EXTENDED tbl 0 blf_idx \N \N \N \N \N \N \N bloom_filter \N \N YES d, b tbl 0 mm1_idx \N \N \N \N \N \N \N minmax \N \N YES a, c, d diff --git a/tests/queries/0_stateless/02724_show_indexes.sql b/tests/queries/0_stateless/02724_show_indexes.sql index ce8ed67c524..04a481fea4e 100644 --- a/tests/queries/0_stateless/02724_show_indexes.sql +++ b/tests/queries/0_stateless/02724_show_indexes.sql @@ -22,6 +22,7 @@ PRIMARY KEY (c, a); SELECT '--- Aliases of SHOW INDEX'; SHOW INDEX FROM tbl; SHOW INDEXES FROM tbl; +SHOW INDICES FROM tbl; SHOW KEYS FROM tbl; SELECT '--- EXTENDED'; From f4202963ad04f52da3aa0ada96c6151cfadd4a69 Mon Sep 17 00:00:00 2001 From: kevinyhzou Date: Thu, 8 Jun 2023 21:06:38 +0800 Subject: [PATCH 0663/1072] test modify --- .../02785_text_with_whitespace_tab_field_delimiter.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/02785_text_with_whitespace_tab_field_delimiter.sh b/tests/queries/0_stateless/02785_text_with_whitespace_tab_field_delimiter.sh index 19d343c352f..e3f61262674 100755 --- a/tests/queries/0_stateless/02785_text_with_whitespace_tab_field_delimiter.sh +++ b/tests/queries/0_stateless/02785_text_with_whitespace_tab_field_delimiter.sh @@ -10,8 +10,8 @@ $CLICKHOUSE_CLIENT -q "drop table if exists test_whitespace" $CLICKHOUSE_CLIENT -q "drop table if exists test_tab" $CLICKHOUSE_CLIENT -q "create table test_whitespace (x UInt32, y String, z String) engine=MergeTree order by x" $CLICKHOUSE_CLIENT -q "create table test_tab (x UInt32, y String, z String) engine=MergeTree order by x" -$CUR_DIR/data_csv/csv_with_space_delimiter.csv | ${CLICKHOUSE_CLIENT} -q "INSERT INTO test_whitespace SETTINGS format_csv_delimiter=' ', input_format_csv_skip_whitespaces_tabs=false FORMAT CSV" -$CUR_DIR/data_csv/csv_with_tab_delimiter.csv | ${CLICKHOUSE_CLIENT} -q "INSERT INTO test_tab SETTINGS format_csv_delimiter='\t', input_format_csv_skip_whitespaces_tabs=false FORMAT CSV" +cat $CURDIR/data_csv/csv_with_space_delimiter.csv | ${CLICKHOUSE_CLIENT} -q "INSERT INTO test_whitespace SETTINGS format_csv_delimiter=' ', input_format_csv_skip_whitespaces_tabs=false FORMAT CSV" +cat $CURDIR/data_csv/csv_with_tab_delimiter.csv | ${CLICKHOUSE_CLIENT} -q "INSERT INTO test_tab SETTINGS format_csv_delimiter='\t', input_format_csv_skip_whitespaces_tabs=false FORMAT CSV" $CLICKHOUSE_CLIENT -q "select * from test_whitespace" $CLICKHOUSE_CLIENT -q "select * from test_tab" $CLICKHOUSE_CLIENT -q "drop table test_whitespace" From 5344ff2516d8f135c2a65f3979bf18109d76a10b Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 8 Jun 2023 15:13:12 +0200 Subject: [PATCH 0664/1072] Temporarily disable annoy index tests (flaky for analyzer) (#50714) --- tests/queries/0_stateless/02354_annoy_index.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02354_annoy_index.sql b/tests/queries/0_stateless/02354_annoy_index.sql index 0168fa04c6f..abee5e8a6e4 100644 --- a/tests/queries/0_stateless/02354_annoy_index.sql +++ b/tests/queries/0_stateless/02354_annoy_index.sql @@ -1,4 +1,4 @@ --- Tags: no-fasttest, no-ubsan, no-cpu-aarch64, no-upgrade-check +-- Tags: disabled, no-fasttest, no-ubsan, no-cpu-aarch64, no-upgrade-check SET allow_experimental_annoy_index = 1; From a918f88c744aa1fe171b500e3b56bfe6a3e13a2c Mon Sep 17 00:00:00 2001 From: alesapin Date: Thu, 8 Jun 2023 16:11:27 +0200 Subject: [PATCH 0665/1072] Fixes --- src/Storages/StorageAzureBlob.cpp | 18 ++++++++++++++---- src/Storages/StorageAzureBlob.h | 12 ++---------- .../configs/disable_profilers.xml | 9 +++++++++ .../test_storage_azure_blob_storage/test.py | 6 ++++-- 4 files changed, 29 insertions(+), 16 deletions(-) create mode 100644 tests/integration/test_storage_azure_blob_storage/configs/disable_profilers.xml diff --git a/src/Storages/StorageAzureBlob.cpp b/src/Storages/StorageAzureBlob.cpp index 4901f6701fb..ce6c864f7ea 100644 --- a/src/Storages/StorageAzureBlob.cpp +++ b/src/Storages/StorageAzureBlob.cpp @@ -122,7 +122,7 @@ void StorageAzureBlob::processNamedCollectionResult(StorageAzureBlob::Configurat } -StorageAzureBlob::Configuration StorageAzureBlob::getConfiguration(ASTs & engine_args, ContextPtr local_context, bool get_format_from_file) +StorageAzureBlob::Configuration StorageAzureBlob::getConfiguration(ASTs & engine_args, ContextPtr local_context) { StorageAzureBlob::Configuration configuration; @@ -137,7 +137,7 @@ StorageAzureBlob::Configuration StorageAzureBlob::getConfiguration(ASTs & engine configuration.blobs_paths = {configuration.blob_path}; - if (configuration.format == "auto" && get_format_from_file) + if (configuration.format == "auto") configuration.format = FormatFactory::instance().getFormatFromFileName(configuration.blob_path, true); return configuration; @@ -230,7 +230,7 @@ StorageAzureBlob::Configuration StorageAzureBlob::getConfiguration(ASTs & engine configuration.blobs_paths = {configuration.blob_path}; - if (configuration.format == "auto" && get_format_from_file) + if (configuration.format == "auto") configuration.format = FormatFactory::instance().getFormatFromFileName(configuration.blob_path, true); return configuration; @@ -898,7 +898,13 @@ RelativePathWithMetadata StorageAzureBlobSource::Iterator::next() } else { - if (!blobs_with_metadata || index >= blobs_with_metadata->size()) + bool need_new_batch = false; + { + std::lock_guard lock(next_mutex); + need_new_batch = !blobs_with_metadata || index >= blobs_with_metadata->size(); + } + + if (need_new_batch) { RelativePathsWithMetadata new_batch; while (new_batch.empty()) @@ -939,6 +945,7 @@ RelativePathWithMetadata StorageAzureBlobSource::Iterator::next() VirtualColumnUtils::filterBlockWithQuery(query, block, getContext(), filter_ast); const auto & idxs = typeid_cast(*block.getByName("_idx").column); + std::lock_guard lock(next_mutex); blob_path_with_globs.reset(); blob_path_with_globs.emplace(); for (UInt64 idx : idxs.getData()) @@ -954,6 +961,7 @@ RelativePathWithMetadata StorageAzureBlobSource::Iterator::next() if (outer_blobs) outer_blobs->insert(outer_blobs->end(), new_batch.begin(), new_batch.end()); + std::lock_guard lock(next_mutex); blobs_with_metadata = std::move(new_batch); for (const auto & [_, info] : *blobs_with_metadata) total_size.fetch_add(info.size_bytes, std::memory_order_relaxed); @@ -961,6 +969,8 @@ RelativePathWithMetadata StorageAzureBlobSource::Iterator::next() } size_t current_index = index.fetch_add(1, std::memory_order_relaxed); + + std::lock_guard lock(next_mutex); return (*blobs_with_metadata)[current_index]; } } diff --git a/src/Storages/StorageAzureBlob.h b/src/Storages/StorageAzureBlob.h index 6e4dfaf19eb..c0380d7c065 100644 --- a/src/Storages/StorageAzureBlob.h +++ b/src/Storages/StorageAzureBlob.h @@ -15,15 +15,6 @@ namespace DB { -struct AzureSimpleAccountConfiguration -{ - std::string storage_account_url; -}; - -using AzureConnectionString = std::string; - -using AzureCredentials = std::variant; - class StorageAzureBlob : public IStorage { public: @@ -73,7 +64,7 @@ public: std::optional format_settings_, ASTPtr partition_by_); - static StorageAzureBlob::Configuration getConfiguration(ASTs & engine_args, ContextPtr local_context, bool get_format_from_file = true); + static StorageAzureBlob::Configuration getConfiguration(ASTs & engine_args, ContextPtr local_context); static AzureClientPtr createClient(StorageAzureBlob::Configuration configuration); static AzureObjectStorage::SettingsPtr createSettings(ContextPtr local_context); @@ -190,6 +181,7 @@ public: void createFilterAST(const String & any_key); bool is_finished = false; bool is_initialized = false; + std::mutex next_mutex; }; StorageAzureBlobSource( diff --git a/tests/integration/test_storage_azure_blob_storage/configs/disable_profilers.xml b/tests/integration/test_storage_azure_blob_storage/configs/disable_profilers.xml new file mode 100644 index 00000000000..a39badbf8ec --- /dev/null +++ b/tests/integration/test_storage_azure_blob_storage/configs/disable_profilers.xml @@ -0,0 +1,9 @@ + + + + + 0 + 0 + + + diff --git a/tests/integration/test_storage_azure_blob_storage/test.py b/tests/integration/test_storage_azure_blob_storage/test.py index de7c662127c..f0934d3aa80 100644 --- a/tests/integration/test_storage_azure_blob_storage/test.py +++ b/tests/integration/test_storage_azure_blob_storage/test.py @@ -25,6 +25,7 @@ def cluster(): cluster.add_instance( "node", main_configs=["configs/named_collections.xml"], + user_configs=["configs/disable_profilers.xml"], with_azurite=True, ) cluster.start() @@ -34,13 +35,14 @@ def cluster(): cluster.shutdown() -def azure_query(node, query, try_num=3, settings={}): +def azure_query(node, query, try_num=10, settings={}): for i in range(try_num): try: return node.query(query, settings=settings) except Exception as ex: retriable_errors = [ - "DB::Exception: Azure::Core::Http::TransportException: Connection was closed by the server while trying to read a response" + "DB::Exception: Azure::Core::Http::TransportException: Connection was closed by the server while trying to read a response", + "DB::Exception: Azure::Core::Http::TransportException: Connection closed before getting full response or response is less than expected", ] retry = False for error in retriable_errors: From 964e0d042ca4fb10629da2437eae5efa9559955f Mon Sep 17 00:00:00 2001 From: kssenii Date: Thu, 8 Jun 2023 16:21:45 +0200 Subject: [PATCH 0666/1072] Fix --- src/Interpreters/Cache/FileCache.cpp | 9 +++++---- src/Interpreters/Cache/Metadata.cpp | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/Interpreters/Cache/FileCache.cpp b/src/Interpreters/Cache/FileCache.cpp index 8d6146b4eea..cd0f96d8e0e 100644 --- a/src/Interpreters/Cache/FileCache.cpp +++ b/src/Interpreters/Cache/FileCache.cpp @@ -988,17 +988,18 @@ void FileCache::cleanup() void FileCache::cleanupThreadFunc() { -#ifndef NDEBUG - assertCacheCorrectness(); -#endif - try { +#ifdef ABORT_ON_LOGICAL_ERROR + assertCacheCorrectness(); +#endif + cleanup(); } catch (...) { tryLogCurrentException(__PRETTY_FUNCTION__); + chassert(false); } cleanup_task->scheduleAfter(delayed_cleanup_interval_ms); diff --git a/src/Interpreters/Cache/Metadata.cpp b/src/Interpreters/Cache/Metadata.cpp index 5b6561a665e..cd3b4e4f09f 100644 --- a/src/Interpreters/Cache/Metadata.cpp +++ b/src/Interpreters/Cache/Metadata.cpp @@ -278,7 +278,7 @@ void CacheMetadata::doCleanup() try { if (fs::exists(key_prefix_directory) && fs::is_empty(key_prefix_directory)) - fs::remove_all(key_prefix_directory); + fs::remove(key_prefix_directory); } catch (const fs::filesystem_error & e) { From 9f61c786ed67ae571384f90ef934bb839ab20e0a Mon Sep 17 00:00:00 2001 From: Victor Krasnov Date: Thu, 8 Jun 2023 17:25:53 +0300 Subject: [PATCH 0667/1072] Amend the tests --- ...3_parseDateTimeBestEffort_syslog.reference | 21 +--- .../02783_parseDateTimeBestEffort_syslog.sql | 97 ++++++------------- 2 files changed, 34 insertions(+), 84 deletions(-) diff --git a/tests/queries/0_stateless/02783_parseDateTimeBestEffort_syslog.reference b/tests/queries/0_stateless/02783_parseDateTimeBestEffort_syslog.reference index 7409b413260..63e7e367941 100644 --- a/tests/queries/0_stateless/02783_parseDateTimeBestEffort_syslog.reference +++ b/tests/queries/0_stateless/02783_parseDateTimeBestEffort_syslog.reference @@ -1,20 +1,5 @@ parseDateTimeBestEffort - dt_ref res res_sam res_auc res_null res_null_sam res_null_auc res_zero res_zero_sam res_zero_auc + dt_ref res res_sam res_auc res_null res_null_sam res_null_auc res_zero res_zero_sam res_zero_auc res_us res_us_sam res_us_auc res_us_null res_us_null_sam res_us_null_auc res_us_zero res_us_zero_sam res_us_zero_auc res64 res64_sam res64_auc res64_null res64_null_sam res64_null_auc res64_zero res64_zero_sam res64_zero_auc res64_us res64_us_sam res64_us_auc res64_us_null res64_us_null_sam res64_us_null_auc res64_us_zero res64_us_zero_sam res64_us_zero_auc - Jun 7 04:55:00 2023-06-07 04:55:00 2022-06-07 04:55:00 2023-06-07 04:55:00 2023-06-07 04:55:00 2022-06-07 04:55:00 2023-06-07 04:55:00 2023-06-07 04:55:00 2022-06-07 04:55:00 2023-06-07 04:55:00 - Jun 7 04:56:00 2022-06-07 04:56:00 2022-06-07 04:56:00 2023-06-07 04:56:00 2022-06-07 04:56:00 2022-06-07 04:56:00 2023-06-07 04:56:00 2022-06-07 04:56:00 2022-06-07 04:56:00 2023-06-07 04:56:00 -parseDateTimeBestEffortUS - dt_ref res res_sam res_auc res_null res_null_sam res_null_auc res_zero res_zero_sam res_zero_auc - - Jun 7 04:55:00 2023-06-07 04:55:00 2022-06-07 04:55:00 2023-06-07 04:55:00 2023-06-07 04:55:00 2022-06-07 04:55:00 2023-06-07 04:55:00 2023-06-07 04:55:00 2022-06-07 04:55:00 2023-06-07 04:55:00 - Jun 7 04:56:00 2022-06-07 04:56:00 2022-06-07 04:56:00 2023-06-07 04:56:00 2022-06-07 04:56:00 2022-06-07 04:56:00 2023-06-07 04:56:00 2022-06-07 04:56:00 2022-06-07 04:56:00 2023-06-07 04:56:00 -parseDateTime64BestEffort - dt_ref res res_sam res_auc res_null res_null_sam res_null_auc res_zero res_zero_sam res_zero_auc - - Jun 7 04:55:00 2023-06-07 04:55:00.000 2022-06-07 04:55:00.000 2023-06-07 04:55:00.000 2023-06-07 04:55:00.000 2022-06-07 04:55:00.000 2023-06-07 04:55:00.000 2023-06-07 04:55:00.000 2022-06-07 04:55:00.000 2023-06-07 04:55:00.000 - Jun 7 04:56:00 2022-06-07 04:56:00.000 2022-06-07 04:56:00.000 2023-06-07 04:56:00.000 2022-06-07 04:56:00.000 2022-06-07 04:56:00.000 2023-06-07 04:56:00.000 2022-06-07 04:56:00.000 2022-06-07 04:56:00.000 2023-06-07 04:56:00.000 -parseDateTime64BestEffortUS - dt_ref res res_sam res_auc res_null res_null_sam res_null_auc res_zero res_zero_sam res_zero_auc - - Jun 7 04:55:00 2023-06-07 04:55:00.000 2022-06-07 04:55:00.000 2023-06-07 04:55:00.000 2023-06-07 04:55:00.000 2022-06-07 04:55:00.000 2023-06-07 04:55:00.000 2023-06-07 04:55:00.000 2022-06-07 04:55:00.000 2023-06-07 04:55:00.000 - Jun 7 04:56:00 2022-06-07 04:56:00.000 2022-06-07 04:56:00.000 2023-06-07 04:56:00.000 2022-06-07 04:56:00.000 2022-06-07 04:56:00.000 2023-06-07 04:56:00.000 2022-06-07 04:56:00.000 2022-06-07 04:56:00.000 2023-06-07 04:56:00.000 + Jun 6 00:00:00 2023-06-06 00:00:00 2023-06-06 00:00:00 2023-06-06 00:00:00 2023-06-06 00:00:00 2023-06-06 00:00:00 2023-06-06 00:00:00 2023-06-06 00:00:00 2023-06-06 00:00:00 2023-06-06 00:00:00 2023-06-06 00:00:00 2023-06-06 00:00:00 2023-06-06 00:00:00 2023-06-06 00:00:00 2023-06-06 00:00:00 2023-06-06 00:00:00 2023-06-06 00:00:00 2023-06-06 00:00:00 2023-06-06 00:00:00 2023-06-06 00:00:00.000 2023-06-06 00:00:00.000 2023-06-06 00:00:00.000 2023-06-06 00:00:00.000 2023-06-06 00:00:00.000 2023-06-06 00:00:00.000 2023-06-06 00:00:00.000 2023-06-06 00:00:00.000 2023-06-06 00:00:00.000 2023-06-06 00:00:00.000 2023-06-06 00:00:00.000 2023-06-06 00:00:00.000 2023-06-06 00:00:00.000 2023-06-06 00:00:00.000 2023-06-06 00:00:00.000 2023-06-06 00:00:00.000 2023-06-06 00:00:00.000 2023-06-06 00:00:00.000 + Jun 8 00:00:00 2022-06-08 00:00:00 2022-06-08 00:00:00 2022-06-08 00:00:00 2022-06-08 00:00:00 2022-06-08 00:00:00 2022-06-08 00:00:00 2022-06-08 00:00:00 2022-06-08 00:00:00 2022-06-08 00:00:00 2022-06-08 00:00:00 2022-06-08 00:00:00 2022-06-08 00:00:00 2022-06-08 00:00:00 2022-06-08 00:00:00 2022-06-08 00:00:00 2022-06-08 00:00:00 2022-06-08 00:00:00 2022-06-08 00:00:00 2022-06-08 00:00:00.000 2022-06-08 00:00:00.000 2022-06-08 00:00:00.000 2022-06-08 00:00:00.000 2022-06-08 00:00:00.000 2022-06-08 00:00:00.000 2022-06-08 00:00:00.000 2022-06-08 00:00:00.000 2022-06-08 00:00:00.000 2022-06-08 00:00:00.000 2022-06-08 00:00:00.000 2022-06-08 00:00:00.000 2022-06-08 00:00:00.000 2022-06-08 00:00:00.000 2022-06-08 00:00:00.000 2022-06-08 00:00:00.000 2022-06-08 00:00:00.000 2022-06-08 00:00:00.000 diff --git a/tests/queries/0_stateless/02783_parseDateTimeBestEffort_syslog.sql b/tests/queries/0_stateless/02783_parseDateTimeBestEffort_syslog.sql index 91ae230205b..59211d3e6a0 100644 --- a/tests/queries/0_stateless/02783_parseDateTimeBestEffort_syslog.sql +++ b/tests/queries/0_stateless/02783_parseDateTimeBestEffort_syslog.sql @@ -1,8 +1,9 @@ SELECT 'parseDateTimeBestEffort'; WITH + 86400 AS secs_in_day, now() AS ts_now, - '2023-06-07 04:55:30' AS ref_point, + '2023-06-07' AS ref_point, dateDiff('second', toDateTime(ref_point), ts_now) AS impedimenta, formatDateTime(ts_around, '%b %e %T') AS dt_curr SELECT @@ -15,69 +16,33 @@ SELECT parseDateTimeBestEffortOrNull(dt_curr, 'Pacific/Auckland') - impedimenta AS res_null_auc, parseDateTimeBestEffortOrZero(dt_curr) - impedimenta AS res_zero, parseDateTimeBestEffortOrZero(dt_curr, 'US/Samoa') - impedimenta AS res_zero_sam, - parseDateTimeBestEffortOrZero(dt_curr, 'Pacific/Auckland') - impedimenta AS res_zero_auc -FROM (SELECT arrayJoin([ts_now - 30, ts_now + 30]) AS ts_around) -FORMAT PrettySpaceNoEscapes; - -SELECT 'parseDateTimeBestEffortUS'; - -WITH - now() AS ts_now, - '2023-06-07 04:55:30' AS ref_point, - dateDiff('second', toDateTime(ref_point), ts_now) AS impedimenta, - formatDateTime(ts_around, '%b %e %T') AS dt_curr -SELECT - formatDateTime(ts_around - impedimenta, '%b %e %H:%i:%s') AS dt_ref, - parseDateTimeBestEffortUS(dt_curr) - impedimenta AS res, - parseDateTimeBestEffortUS(dt_curr, 'US/Samoa') - impedimenta AS res_sam, - parseDateTimeBestEffortUS(dt_curr, 'Pacific/Auckland') - impedimenta AS res_auc, - parseDateTimeBestEffortUSOrNull(dt_curr) - impedimenta AS res_null, - parseDateTimeBestEffortUSOrNull(dt_curr, 'US/Samoa') - impedimenta AS res_null_sam, - parseDateTimeBestEffortUSOrNull(dt_curr, 'Pacific/Auckland') - impedimenta AS res_null_auc, - parseDateTimeBestEffortUSOrZero(dt_curr) - impedimenta AS res_zero, - parseDateTimeBestEffortUSOrZero(dt_curr, 'US/Samoa') - impedimenta AS res_zero_sam, - parseDateTimeBestEffortUSOrZero(dt_curr, 'Pacific/Auckland') - impedimenta AS res_zero_auc -FROM (SELECT arrayJoin([ts_now - 30, ts_now + 30]) AS ts_around) -FORMAT PrettySpaceNoEscapes; - -SELECT 'parseDateTime64BestEffort'; - -WITH - now() AS ts_now, - '2023-06-07 04:55:30' AS ref_point, - dateDiff('second', toDateTime(ref_point), ts_now) AS impedimenta, - formatDateTime(ts_around, '%b %e %T') AS dt_curr -SELECT - formatDateTime(ts_around - impedimenta, '%b %e %H:%i:%s') AS dt_ref, - parseDateTime64BestEffort(dt_curr) - impedimenta AS res, - parseDateTime64BestEffort(dt_curr, 3, 'US/Samoa') - impedimenta AS res_sam, - parseDateTime64BestEffort(dt_curr, 3, 'Pacific/Auckland') - impedimenta AS res_auc, - parseDateTime64BestEffortOrNull(dt_curr) - impedimenta AS res_null, - parseDateTime64BestEffortOrNull(dt_curr, 3, 'US/Samoa') - impedimenta AS res_null_sam, - parseDateTime64BestEffortOrNull(dt_curr, 3, 'Pacific/Auckland') - impedimenta AS res_null_auc, - parseDateTime64BestEffortOrZero(dt_curr) - impedimenta AS res_zero, - parseDateTime64BestEffortOrZero(dt_curr, 3, 'US/Samoa') - impedimenta AS res_zero_sam, - parseDateTime64BestEffortOrZero(dt_curr, 3, 'Pacific/Auckland') - impedimenta AS res_zero_auc -FROM (SELECT arrayJoin([ts_now - 30, ts_now + 30]) AS ts_around) -FORMAT PrettySpaceNoEscapes; - -SELECT 'parseDateTime64BestEffortUS'; - -WITH - now() AS ts_now, - '2023-06-07 04:55:30' AS ref_point, - dateDiff('second', toDateTime(ref_point), ts_now) AS impedimenta, - formatDateTime(ts_around, '%b %e %T') AS dt_curr -SELECT - formatDateTime(ts_around - impedimenta, '%b %e %H:%i:%s') AS dt_ref, - parseDateTime64BestEffortUS(dt_curr) - impedimenta AS res, - parseDateTime64BestEffortUS(dt_curr, 3, 'US/Samoa') - impedimenta AS res_sam, - parseDateTime64BestEffortUS(dt_curr, 3, 'Pacific/Auckland') - impedimenta AS res_auc, - parseDateTime64BestEffortUSOrNull(dt_curr) - impedimenta AS res_null, - parseDateTime64BestEffortUSOrNull(dt_curr, 3, 'US/Samoa') - impedimenta AS res_null_sam, - parseDateTime64BestEffortUSOrNull(dt_curr, 3, 'Pacific/Auckland') - impedimenta AS res_null_auc, - parseDateTime64BestEffortUSOrZero(dt_curr) - impedimenta AS res_zero, - parseDateTime64BestEffortUSOrZero(dt_curr, 3, 'US/Samoa') - impedimenta AS res_zero_sam, - parseDateTime64BestEffortUSOrZero(dt_curr, 3, 'Pacific/Auckland') - impedimenta AS res_zero_auc -FROM (SELECT arrayJoin([ts_now - 30, ts_now + 30]) AS ts_around) + parseDateTimeBestEffortOrZero(dt_curr, 'Pacific/Auckland') - impedimenta AS res_zero_auc, + parseDateTimeBestEffortUS(dt_curr) - impedimenta AS res_us, + parseDateTimeBestEffortUS(dt_curr, 'US/Samoa') - impedimenta AS res_us_sam, + parseDateTimeBestEffortUS(dt_curr, 'Pacific/Auckland') - impedimenta AS res_us_auc, + parseDateTimeBestEffortUSOrNull(dt_curr) - impedimenta AS res_us_null, + parseDateTimeBestEffortUSOrNull(dt_curr, 'US/Samoa') - impedimenta AS res_us_null_sam, + parseDateTimeBestEffortUSOrNull(dt_curr, 'Pacific/Auckland') - impedimenta AS res_us_null_auc, + parseDateTimeBestEffortUSOrZero(dt_curr) - impedimenta AS res_us_zero, + parseDateTimeBestEffortUSOrZero(dt_curr, 'US/Samoa') - impedimenta AS res_us_zero_sam, + parseDateTimeBestEffortUSOrZero(dt_curr, 'Pacific/Auckland') - impedimenta AS res_us_zero_auc, + parseDateTime64BestEffort(dt_curr) - impedimenta AS res64, + parseDateTime64BestEffort(dt_curr, 3, 'US/Samoa') - impedimenta AS res64_sam, + parseDateTime64BestEffort(dt_curr, 3, 'Pacific/Auckland') - impedimenta AS res64_auc, + parseDateTime64BestEffortOrNull(dt_curr) - impedimenta AS res64_null, + parseDateTime64BestEffortOrNull(dt_curr, 3, 'US/Samoa') - impedimenta AS res64_null_sam, + parseDateTime64BestEffortOrNull(dt_curr, 3, 'Pacific/Auckland') - impedimenta AS res64_null_auc, + parseDateTime64BestEffortOrZero(dt_curr) - impedimenta AS res64_zero, + parseDateTime64BestEffortOrZero(dt_curr, 3, 'US/Samoa') - impedimenta AS res64_zero_sam, + parseDateTime64BestEffortOrZero(dt_curr, 3, 'Pacific/Auckland') - impedimenta AS res64_zero_auc, + parseDateTime64BestEffortUS(dt_curr) - impedimenta AS res64_us, + parseDateTime64BestEffortUS(dt_curr, 3, 'US/Samoa') - impedimenta AS res64_us_sam, + parseDateTime64BestEffortUS(dt_curr, 3, 'Pacific/Auckland') - impedimenta AS res64_us_auc, + parseDateTime64BestEffortUSOrNull(dt_curr) - impedimenta AS res64_us_null, + parseDateTime64BestEffortUSOrNull(dt_curr, 3, 'US/Samoa') - impedimenta AS res64_us_null_sam, + parseDateTime64BestEffortUSOrNull(dt_curr, 3, 'Pacific/Auckland') - impedimenta AS res64_us_null_auc, + parseDateTime64BestEffortUSOrZero(dt_curr) - impedimenta AS res64_us_zero, + parseDateTime64BestEffortUSOrZero(dt_curr, 3, 'US/Samoa') - impedimenta AS res64_us_zero_sam, + parseDateTime64BestEffortUSOrZero(dt_curr, 3, 'Pacific/Auckland') - impedimenta AS res64_us_zero_auc +FROM (SELECT arrayJoin([ts_now - secs_in_day, ts_now + secs_in_day]) AS ts_around) FORMAT PrettySpaceNoEscapes; From caabbfd5b16249652fc247e37fe1b2318c9c9994 Mon Sep 17 00:00:00 2001 From: alesapin Date: Thu, 8 Jun 2023 16:25:55 +0200 Subject: [PATCH 0668/1072] Fix one more race --- src/Storages/StorageAzureBlob.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Storages/StorageAzureBlob.h b/src/Storages/StorageAzureBlob.h index c0380d7c065..1f91e47ddbe 100644 --- a/src/Storages/StorageAzureBlob.h +++ b/src/Storages/StorageAzureBlob.h @@ -179,8 +179,8 @@ public: std::unique_ptr matcher; void createFilterAST(const String & any_key); - bool is_finished = false; - bool is_initialized = false; + std::atomic is_finished = false; + std::atomic is_initialized = false; std::mutex next_mutex; }; From 9b04e8513700996d3c9e558c910913ab96016af5 Mon Sep 17 00:00:00 2001 From: kssenii Date: Thu, 8 Jun 2023 16:43:27 +0200 Subject: [PATCH 0669/1072] Fix --- src/Interpreters/tests/gtest_lru_file_cache.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Interpreters/tests/gtest_lru_file_cache.cpp b/src/Interpreters/tests/gtest_lru_file_cache.cpp index fe9e3a18024..8c8e715ce92 100644 --- a/src/Interpreters/tests/gtest_lru_file_cache.cpp +++ b/src/Interpreters/tests/gtest_lru_file_cache.cpp @@ -467,6 +467,7 @@ TEST_F(FileCacheTest, get) cv.notify_one(); file_segment2.wait(file_segment2.range().left); + file_segment2.complete(); ASSERT_TRUE(file_segment2.state() == State::DOWNLOADED); }); From 116df09b5df6dd7362b36c9eea6f08b16a3e1b2b Mon Sep 17 00:00:00 2001 From: alesapin Date: Thu, 8 Jun 2023 17:06:15 +0200 Subject: [PATCH 0670/1072] Fix build --- src/Storages/StorageAzureBlob.cpp | 2 +- src/Storages/StorageAzureBlob.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Storages/StorageAzureBlob.cpp b/src/Storages/StorageAzureBlob.cpp index ce6c864f7ea..3ee176a68b7 100644 --- a/src/Storages/StorageAzureBlob.cpp +++ b/src/Storages/StorageAzureBlob.cpp @@ -669,7 +669,7 @@ Pipe StorageAzureBlob::read( return Pipe::unitePipes(std::move(pipes)); } -SinkToStoragePtr StorageAzureBlob::write(const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, ContextPtr local_context) +SinkToStoragePtr StorageAzureBlob::write(const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, ContextPtr local_context, bool /*async_insert*/) { auto sample_block = metadata_snapshot->getSampleBlock(); auto chosen_compression_method = chooseCompressionMethod(configuration.blobs_paths.back(), configuration.compression_method); diff --git a/src/Storages/StorageAzureBlob.h b/src/Storages/StorageAzureBlob.h index 1f91e47ddbe..e2001fa24ae 100644 --- a/src/Storages/StorageAzureBlob.h +++ b/src/Storages/StorageAzureBlob.h @@ -85,7 +85,7 @@ public: size_t, size_t) override; - SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & /* metadata_snapshot */, ContextPtr context) override; + SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & /* metadata_snapshot */, ContextPtr context, bool /*async_insert*/) override; void truncate(const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, ContextPtr local_context, TableExclusiveLockHolder &) override; From b8178088d020d956cdf27d390f3d3b7a72a813e8 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 8 Jun 2023 08:10:40 +0000 Subject: [PATCH 0671/1072] Misc Annoy fixes --- .../mergetree-family/annindexes.md | 169 +++++--- .../mergetree-family/mergetree.md | 2 +- src/Parsers/ParserCreateIndexQuery.cpp | 11 +- src/Parsers/ParserCreateQuery.cpp | 12 +- ...pproximateNearestNeighborIndexesCommon.cpp | 2 +- .../ApproximateNearestNeighborIndexesCommon.h | 4 +- .../MergeTree/MergeTreeIndexAnnoy.cpp | 43 +- src/Storages/MergeTree/MergeTreeIndexAnnoy.h | 16 +- .../0_stateless/02354_annoy_index.reference | 260 ++++++------ .../queries/0_stateless/02354_annoy_index.sql | 371 +++++++++++------- .../aspell-ignore/en/aspell-dict.txt | 3 + 11 files changed, 548 insertions(+), 345 deletions(-) diff --git a/docs/en/engines/table-engines/mergetree-family/annindexes.md b/docs/en/engines/table-engines/mergetree-family/annindexes.md index f600f9a015c..2b0b77a0735 100644 --- a/docs/en/engines/table-engines/mergetree-family/annindexes.md +++ b/docs/en/engines/table-engines/mergetree-family/annindexes.md @@ -1,104 +1,142 @@ # Approximate Nearest Neighbor Search Indexes [experimental] {#table_engines-ANNIndex} -Nearest neighborhood search refers to the problem of finding the point(s) with the smallest distance to a given point in an n-dimensional -space. Since exact search is in practice usually typically too slow, the task is often solved with approximate algorithms. A popular use -case of of neighbor search is finding similar pictures (texts) for a given picture (text). Pictures (texts) can be decomposed into -[embeddings](https://cloud.google.com/architecture/overview-extracting-and-serving-feature-embeddings-for-machine-learning), and instead of -comparing pictures (texts) pixel-by-pixel (character-by-character), only the embeddings are compared. +Nearest neighborhood search is the problem of finding the M closest points for a given point in an N-dimensional vector space. The most +straightforward approach to solve this problem is a brute force search where the distance between all points in the vector space and the +reference point is computed. This method guarantees perfect accuracy but it is usually too slow for practical applications. Thus, nearest +neighborhood search problems are often solved with [approximative algorithms](https://github.com/erikbern/ann-benchmarks). Approximative +nearest neighborhood search techniques, in conjunction with [embedding +methods](https://cloud.google.com/architecture/overview-extracting-and-serving-feature-embeddings-for-machine-learning) allow to search huge +amounts of media (pictures, songs, articles, etc.) in milliseconds. -In terms of SQL, the problem can be expressed as follows: +Blogs: +- [Vector Search with ClickHouse - Part 1](https://clickhouse.com/blog/vector-search-clickhouse-p1) +- [Vector Search with ClickHouse - Part 2](https://clickhouse.com/blog/vector-search-clickhouse-p2) + + +In terms of SQL, the nearest neighborhood problem can be expressed as follows: ``` sql SELECT * FROM table -WHERE L2Distance(column, Point) < MaxDistance +ORDER BY Distance(vectors, Point) LIMIT N ``` +`vectors` contains N-dimensional values of type [Array](../../../sql-reference/data-types/array.md) or +[Tuple](../../../sql-reference/data-types/tuple.md), for example embeddings. Function `Distance` computes the distance between two vectors. +Often, the the Euclidean (L2) distance is chosen as distance function but [other +distance functions](/docs/en/sql-reference/functions/distance-functions.md) are also possible. `Point` is the reference point, e.g. `(0.17, +0.33, ...)`, and `N` limits the number of search results. + +An alternative formulation of the nearest neighborhood search problem looks as follows: + ``` sql SELECT * FROM table -ORDER BY L2Distance(column, Point) +WHERE Distance(vectors, Point) < MaxDistance LIMIT N ``` -The queries are expensive because the L2 (Euclidean) distance between `Point` and all points in `column` and must be computed. To speed this process up, Approximate Nearest Neighbor Search Indexes (ANN indexes) store a compact representation of the search space (using clustering, search trees, etc.) which allows to compute an approximate answer quickly. +While the first query returns the top-`N` closest points to the reference point, the second query returns all points closer to the reference +point than a maximally allowed radius `MaxDistance`. Parameter `N` limits the number of returned values which is useful for situations where +`MaxDistance` is difficult to determine in advance. -# Creating ANN Indexes +With brute force search, both queries are expensive (linear in the number of points) because the distance between all points in `vectors` and +`Point` must be computed. To speed this process up, Approximate Nearest Neighbor Search Indexes (ANN indexes) store a compact representation +of the search space (using clustering, search trees, etc.) which allows to compute an approximate answer much quicker (in sub-linear time). -As long as ANN indexes are experimental, you first need to `SET allow_experimental_annoy_index = 1`. +# Creating and Using ANN Indexes -Syntax to create an ANN index over an `Array` column: +Syntax to create an ANN index over an [Array](../../../sql-reference/data-types/array.md) column: ```sql CREATE TABLE table ( `id` Int64, - `embedding` Array(Float32), - INDEX embedding TYPE () GRANULARITY + `vectors` Array(Float32), + INDEX vectors TYPE () [GRANULARITY ] ) ENGINE = MergeTree ORDER BY id; ``` -Syntax to create an ANN index over a `Tuple` column: +Syntax to create an ANN index over a [Tuple](../../../sql-reference/data-types/tuple.md) column: ```sql CREATE TABLE table ( `id` Int64, - `embedding` Tuple(Float32[, Float32[, ...]]), - INDEX embedding TYPE () GRANULARITY + `vectors` Tuple(Float32[, Float32[, ...]]), + INDEX vectors TYPE () [GRANULARITY ] ) ENGINE = MergeTree ORDER BY id; ``` -ANN indexes are built during column insertion and merge and `INSERT` and `OPTIMIZE` statements will be slower than for ordinary tables. ANNIndexes are ideally used only with immutable or rarely changed data, respectively there are much more read requests than write requests. - -Similar to regular skip indexes, ANN indexes are constructed over granules and each indexed block consists of `GRANULARITY = `-many -granules. For example, if the primary index granularity of the table is 8192 (setting `index_granularity = 8192`) and `GRANULARITY = 2`, -then each indexed block will consist of 16384 rows. However, unlike skip indexes, ANN indexes are not only able to skip the entire indexed -block, they are able to skip individual granules in indexed blocks. As a result, the `GRANULARITY` parameter has a different meaning in ANN -indexes than in normal skip indexes. Basically, the bigger `GRANULARITY` is chosen, the more data is provided to a single ANN index, and the -higher the chance that with the right hyper parameters, the index will remember the data structure better. - -# Using ANN Indexes +ANN indexes are built during column insertion and merge. As a result, `INSERT` and `OPTIMIZE` statements will be slower than for ordinary +tables. ANNIndexes are ideally used only with immutable or rarely changed data, respectively when are far more read requests than write +requests. ANN indexes support two types of queries: -- WHERE queries: - - ``` sql - SELECT * - FROM table - WHERE DistanceFunction(column, Point) < MaxDistance - LIMIT N - ``` - - ORDER BY queries: ``` sql SELECT * FROM table [WHERE ...] - ORDER BY DistanceFunction(column, Point) + ORDER BY Distance(vectors, Point) LIMIT N ``` -`DistanceFunction` is a [distance function](/docs/en/sql-reference/functions/distance-functions.md), `Point` is a reference vector (e.g. `(0.17, 0.33, ...)`) and `MaxDistance` is a floating point value which restricts the size of the neighbourhood. +- WHERE queries: + + ``` sql + SELECT * + FROM table + WHERE Distance(vectors, Point) < MaxDistance + LIMIT N + ``` :::tip -To avoid writing out large vectors, you can use [query parameters](/docs/en//interfaces/cli.md#queries-with-parameters-cli-queries-with-parameters), e.g. +To avoid writing out large vectors, you can use [query +parameters](/docs/en//interfaces/cli.md#queries-with-parameters-cli-queries-with-parameters), e.g. ```bash -clickhouse-client --param_vec='hello' --query="SELECT * FROM table WHERE L2Distance(embedding, {vec: Array(Float32)}) < 1.0" +clickhouse-client --param_vec='hello' --query="SELECT * FROM table WHERE L2Distance(vectors, {vec: Array(Float32)}) < 1.0" ``` ::: -ANN indexes cannot speed up queries that contain both a `WHERE DistanceFunction(column, Point) < MaxDistance` and an `ORDER BY DistanceFunction(column, Point)` clause. Also, the approximate algorithms used to determine the nearest neighbors require a limit, hence queries that use an ANN index must have a `LIMIT` clause. +**Restrictions**: Queries that contain both a `WHERE Distance(vectors, Point) < MaxDistance` and an `ORDER BY Distance(vectors, Point)` +clause cannot use ANN indexes. Also, the approximate algorithms used to determine the nearest neighbors require a limit, hence queries +without `LIMIT` clause cannot utilize ANN indexes. Also ANN indexes are only used if the query has a `LIMIT` value smaller than setting +`max_limit_for_ann_queries` (default: 1 million rows). This is a safeguard to prevent large memory allocations by external libraries for +approximate neighbor search. + +**Differences to Skip Indexes** Similar to regular [skip indexes](https://clickhouse.com/docs/en/optimize/skipping-indexes), ANN indexes are +constructed over granules and each indexed block consists of `GRANULARITY = `-many granules (`` = 1 by default for normal skip +indexes). For example, if the primary index granularity of the table is 8192 (setting `index_granularity = 8192`) and `GRANULARITY = 2`, +then each indexed block will contain 16384 rows. However, data structures and algorithms for approximate neighborhood search (usually +provided by external libraries) are inherently row-oriented. They store a compact representation of a set of rows and also return rows for +ANN queries. This causes some rather unintuitive differences in the way ANN indexes behave compared to normal skip indexes. + +When a user defines a ANN index on a column, ClickHouse internally creates a ANN "sub-index" for each index block. The sub-index is "local" +in the sense that it only knows about the rows of its containing index block. In the previous example and assuming that a column has 65536 +rows, we obtain four index blocks (spanning eight granules) and a ANN sub-index for each index block. A sub-index is theoretically able to +return the rows with the N closest points within its index block directly. However, since ClickHouse loads data from disk to memory at the +granularity of granules, sub-indexes extrapolate matching rows to granule granularity. This is different from regular skip indexes which +skip data at the granularity of index blocks. + +The `GRANULARITY` parameter determines how many ANN sub-indexes are created. Bigger `GRANULARITY` values mean fewer but larger ANN +sub-indexes, up to the point where a column (or a column part) has only a single sub-index. In that case, the sub-index has a "global" view of +all column rows and can directly return all granules of the column (part) with relevant rows (there are at at most `LIMIT `-many +such granules). In a second step, ClickHouse will load these granules and identify the actually best rows by performing a brute-force distance +calculation over all rows of the granules. With a small `GRANULARITY` value, each of the sub-indexes returns up to `LIMIT N`-many granules. +As a result, more granules need to be loaded and post-filtered. Note that the search accuracy is with both cases equally good, only the +processing performance differs. It is generally recommended to use a large `GRANULARITY` for ANN indexes and fall back to a smaller +`GRANULARITY` values only in case of problems like excessive memory consumption of the ANN structures. If no `GRANULARITY` was specified for +ANN indexes, the default value is 100 million. -An ANN index is only used if the query has a `LIMIT` value smaller than setting `max_limit_for_ann_queries` (default: 1 million rows). This is a safety measure which helps to avoid large memory consumption by external libraries for approximate neighbor search. # Available ANN Indexes @@ -106,51 +144,68 @@ An ANN index is only used if the query has a `LIMIT` value smaller than setting ## Annoy {#annoy} -(currently disabled on ARM due to memory safety problems with the algorithm) +Annoy indexes are currently experimental, to use them you first need to `SET allow_experimental_annoy_index = 1`. They are also currently +disabled on ARM due to memory safety problems with the algorithm. -This type of ANN index implements [the Annoy algorithm](https://github.com/spotify/annoy) which uses a recursive division of the space in random linear surfaces (lines in 2D, planes in 3D etc.). +This type of ANN index implements [the Annoy algorithm](https://github.com/spotify/annoy) which is based on a recursive division of the +space in random linear surfaces (lines in 2D, planes in 3D etc.). -Syntax to create a Annoy index over a `Array` column: +
+ +
+ +Syntax to create an Annoy index over an [Array](../../../sql-reference/data-types/array.md) column: ```sql CREATE TABLE table ( id Int64, - embedding Array(Float32), - INDEX embedding TYPE annoy([DistanceName[, NumTrees]]) GRANULARITY N + vectors Array(Float32), + INDEX vectors TYPE annoy([Distance[, NumTrees]]) [GRANULARITY N] ) ENGINE = MergeTree ORDER BY id; ``` -Syntax to create a Annoy index over a `Tuple` column: +Syntax to create an ANN index over a [Tuple](../../../sql-reference/data-types/tuple.md) column: ```sql CREATE TABLE table ( id Int64, - embedding Tuple(Float32[, Float32[, ...]]), - INDEX embedding TYPE annoy([DistanceName[, NumTrees]]) GRANULARITY N + vectors Tuple(Float32[, Float32[, ...]]), + INDEX vectors TYPE annoy([Distance[, NumTrees]]) [GRANULARITY N] ) ENGINE = MergeTree ORDER BY id; ``` -Parameter `DistanceName` is name of a distance function (default `L2Distance`). Annoy currently supports `L2Distance` and `cosineDistance` as distance functions. Parameter `NumTrees` (default: 100) is the number of trees which the algorithm will create. Higher values of `NumTree` mean slower `CREATE` and `SELECT` statements (approximately linearly), but increase the accuracy of search results. +Annoy currently supports `L2Distance` and `cosineDistance` as distance function `Distance`. If no distance function was specified during +index creation, `L2Distance` is used as default. Parameter `NumTrees` is the number of trees which the algorithm creates (default if not +specified: 100). Higher values of `NumTree` mean more accurate search results but slower index creation / query times (approximately +linearly) as well as larger index sizes. :::note -Indexes over columns of type `Array` will generally work faster than indexes on `Tuple` columns. All arrays **must** have same length. Use [CONSTRAINT](/docs/en/sql-reference/statements/create/table.md#constraints) to avoid errors. For example, `CONSTRAINT constraint_name_1 CHECK length(embedding) = 256`. +Indexes over columns of type `Array` will generally work faster than indexes on `Tuple` columns. All arrays **must** have same length. Use +[CONSTRAINT](/docs/en/sql-reference/statements/create/table.md#constraints) to avoid errors. For example, `CONSTRAINT constraint_name_1 +CHECK length(vectors) = 256`. ::: -Setting `annoy_index_search_k_nodes` (default: `NumTrees * LIMIT`) determines how many tree nodes are inspected during SELECTs. It can be used to -balance runtime and accuracy at runtime. - -Example: +Setting `annoy_index_search_k_nodes` (default: `NumTrees * LIMIT`) determines how many tree nodes are inspected during SELECTs. Larger +values mean more accurate results at the cost of longer query runtime: ``` sql SELECT * FROM table_name [WHERE ...] -ORDER BY L2Distance(column, Point) +ORDER BY L2Distance(vectors, Point) LIMIT N SETTINGS annoy_index_search_k_nodes=100 ``` diff --git a/docs/en/engines/table-engines/mergetree-family/mergetree.md b/docs/en/engines/table-engines/mergetree-family/mergetree.md index 07f706af91d..61276110138 100644 --- a/docs/en/engines/table-engines/mergetree-family/mergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/mergetree.md @@ -491,7 +491,7 @@ Syntax: `tokenbf_v1(size_of_bloom_filter_in_bytes, number_of_hash_functions, ran #### Special-purpose -- An experimental index to support approximate nearest neighbor (ANN) search. See [here](annindexes.md) for details. +- Experimental indexes to support approximate nearest neighbor (ANN) search. See [here](annindexes.md) for details. - An experimental inverted index to support full-text search. See [here](invertedindexes.md) for details. ### Functions Support {#functions-support} diff --git a/src/Parsers/ParserCreateIndexQuery.cpp b/src/Parsers/ParserCreateIndexQuery.cpp index 7323c5da141..57afd3fb99e 100644 --- a/src/Parsers/ParserCreateIndexQuery.cpp +++ b/src/Parsers/ParserCreateIndexQuery.cpp @@ -46,7 +46,16 @@ bool ParserCreateIndexDeclaration::parseImpl(Pos & pos, ASTPtr & node, Expected index->part_of_create_index_query = true; index->set(index->expr, expr); index->set(index->type, type); - index->granularity = granularity ? granularity->as().value.safeGet() : 1; + + if (granularity) + index->granularity = granularity->as().value.safeGet(); + else + { + if (index->type->name == "annoy") + index->granularity = 100'000'000; + else + index->granularity = 1; + } node = index; return true; diff --git a/src/Parsers/ParserCreateQuery.cpp b/src/Parsers/ParserCreateQuery.cpp index f975e8ba3c8..c6273f369b1 100644 --- a/src/Parsers/ParserCreateQuery.cpp +++ b/src/Parsers/ParserCreateQuery.cpp @@ -141,7 +141,17 @@ bool ParserIndexDeclaration::parseImpl(Pos & pos, ASTPtr & node, Expected & expe index->name = name->as().name(); index->set(index->expr, expr); index->set(index->type, type); - index->granularity = granularity ? granularity->as().value.safeGet() : 1; + + if (granularity) + index->granularity = granularity->as().value.safeGet(); + else + { + if (index->type->name == "annoy") + index->granularity = 100'000'000; + else + index->granularity = 1; + } + node = index; return true; diff --git a/src/Storages/MergeTree/ApproximateNearestNeighborIndexesCommon.cpp b/src/Storages/MergeTree/ApproximateNearestNeighborIndexesCommon.cpp index bf277c55863..69e54dd5f0c 100644 --- a/src/Storages/MergeTree/ApproximateNearestNeighborIndexesCommon.cpp +++ b/src/Storages/MergeTree/ApproximateNearestNeighborIndexesCommon.cpp @@ -88,7 +88,7 @@ std::vector ApproximateNearestNeighborCondition::getReferenceVector() con throw Exception(ErrorCodes::LOGICAL_ERROR, "Reference vector was requested for useless or uninitialized index."); } -size_t ApproximateNearestNeighborCondition::getNumOfDimensions() const +size_t ApproximateNearestNeighborCondition::getDimensions() const { if (index_is_useful && query_information.has_value()) return query_information->reference_vector.size(); diff --git a/src/Storages/MergeTree/ApproximateNearestNeighborIndexesCommon.h b/src/Storages/MergeTree/ApproximateNearestNeighborIndexesCommon.h index 4fb95c3f492..310890eba1e 100644 --- a/src/Storages/MergeTree/ApproximateNearestNeighborIndexesCommon.h +++ b/src/Storages/MergeTree/ApproximateNearestNeighborIndexesCommon.h @@ -90,8 +90,8 @@ public: /// Distance should be calculated regarding to referenceVector std::vector getReferenceVector() const; - /// Reference vector's dimension size - size_t getNumOfDimensions() const; + /// Reference vector's dimension count + size_t getDimensions() const; String getColumnName() const; diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp index 1a28f28f746..f77cfe4fed0 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp @@ -27,13 +27,13 @@ namespace ErrorCodes template -AnnoyIndexWithSerialization::AnnoyIndexWithSerialization(uint64_t dim) - : Base::AnnoyIndex(dim) +AnnoyIndexWithSerialization::AnnoyIndexWithSerialization(size_t dimensions) + : Base::AnnoyIndex(dimensions) { } template -void AnnoyIndexWithSerialization::serialize(WriteBuffer& ostr) const +void AnnoyIndexWithSerialization::serialize(WriteBuffer & ostr) const { chassert(Base::_built); writeIntBinary(Base::_s, ostr); @@ -43,11 +43,11 @@ void AnnoyIndexWithSerialization::serialize(WriteBuffer& ostr) const writeIntBinary(Base::_K, ostr); writeIntBinary(Base::_seed, ostr); writeVectorBinary(Base::_roots, ostr); - ostr.write(reinterpret_cast(Base::_nodes), Base::_s * Base::_n_nodes); + ostr.write(reinterpret_cast(Base::_nodes), Base::_s * Base::_n_nodes); } template -void AnnoyIndexWithSerialization::deserialize(ReadBuffer& istr) +void AnnoyIndexWithSerialization::deserialize(ReadBuffer & istr) { chassert(!Base::_built); readIntBinary(Base::_s, istr); @@ -69,7 +69,7 @@ void AnnoyIndexWithSerialization::deserialize(ReadBuffer& istr) } template -uint64_t AnnoyIndexWithSerialization::getNumOfDimensions() const +size_t AnnoyIndexWithSerialization::getDimensions() const { return Base::get_f(); } @@ -97,14 +97,14 @@ void MergeTreeIndexGranuleAnnoy::serializeBinary(WriteBuffer & ostr) c { /// Number of dimensions is required in the index constructor, /// so it must be written and read separately from the other part - writeIntBinary(index->getNumOfDimensions(), ostr); // write dimension + writeIntBinary(static_cast(index->getDimensions()), ostr); // write dimension index->serialize(ostr); } template void MergeTreeIndexGranuleAnnoy::deserializeBinary(ReadBuffer & istr, MergeTreeIndexVersion /*version*/) { - uint64_t dimension; + UInt64 dimension; readIntBinary(dimension, istr); index = std::make_shared>(dimension); index->deserialize(istr); @@ -114,7 +114,7 @@ template MergeTreeIndexAggregatorAnnoy::MergeTreeIndexAggregatorAnnoy( const String & index_name_, const Block & index_sample_block_, - uint64_t trees_) + UInt64 trees_) : index_name(index_name_) , index_sample_block(index_sample_block_) , trees(trees_) @@ -251,10 +251,10 @@ std::vector MergeTreeIndexConditionAnnoy::getUsefulRangesImpl(MergeTreeI const AnnoyIndexWithSerializationPtr annoy = granule->index; - if (ann_condition.getNumOfDimensions() != annoy->getNumOfDimensions()) + if (ann_condition.getDimensions() != annoy->getDimensions()) throw Exception(ErrorCodes::INCORRECT_QUERY, "The dimension of the space in the request ({}) " "does not match the dimension in the index ({})", - ann_condition.getNumOfDimensions(), annoy->getNumOfDimensions()); + ann_condition.getDimensions(), annoy->getDimensions()); std::vector neighbors; /// indexes of dots which were closest to the reference vector std::vector distances; @@ -281,7 +281,7 @@ std::vector MergeTreeIndexConditionAnnoy::getUsefulRangesImpl(MergeTreeI return granule_numbers; } -MergeTreeIndexAnnoy::MergeTreeIndexAnnoy(const IndexDescription & index_, uint64_t trees_, const String & distance_function_) +MergeTreeIndexAnnoy::MergeTreeIndexAnnoy(const IndexDescription & index_, UInt64 trees_, const String & distance_function_) : IMergeTreeIndex(index_) , trees(trees_) , distance_function(distance_function_) @@ -320,9 +320,9 @@ MergeTreeIndexPtr annoyIndexCreator(const IndexDescription & index) if (!index.arguments.empty()) distance_function = index.arguments[0].get(); - uint64_t trees = default_trees; + UInt64 trees = default_trees; if (index.arguments.size() > 1) - trees = index.arguments[1].get(); + trees = index.arguments[1].get(); return std::make_shared(index, trees, distance_function); } @@ -338,7 +338,7 @@ void annoyIndexValidator(const IndexDescription & index, bool /* attach */) throw Exception(ErrorCodes::INCORRECT_QUERY, "Distance function argument of Annoy index must be of type String"); if (index.arguments.size() > 1 && index.arguments[1].getType() != Field::Types::UInt64) - throw Exception(ErrorCodes::INCORRECT_QUERY, "Number of trees argument of Annoy index must be UInt64"); + throw Exception(ErrorCodes::INCORRECT_QUERY, "Number of trees argument of Annoy index must be of type UInt64"); /// Check that the index is created on a single column @@ -351,17 +351,16 @@ void annoyIndexValidator(const IndexDescription & index, bool /* attach */) { String distance_name = index.arguments[0].get(); if (distance_name != "L2Distance" && distance_name != "cosineDistance") - throw Exception(ErrorCodes::INCORRECT_DATA, "Annoy index supports only distance functions 'L2Distance' and 'cosineDistance'. Given distance function: {}", distance_name); + throw Exception(ErrorCodes::INCORRECT_DATA, "Annoy index only supports distance functions 'L2Distance' and 'cosineDistance'"); } /// Check data type of indexed column: - auto throw_unsupported_underlying_column_exception = [](DataTypePtr data_type) + auto throw_unsupported_underlying_column_exception = []() { throw Exception( ErrorCodes::ILLEGAL_COLUMN, - "Annoy indexes can only be created on columns of type Array(Float32) and Tuple(Float32). Given type: {}", - data_type->getName()); + "Annoy indexes can only be created on columns of type Array(Float32) and Tuple(Float32)"); }; DataTypePtr data_type = index.sample_block.getDataTypes()[0]; @@ -370,7 +369,7 @@ void annoyIndexValidator(const IndexDescription & index, bool /* attach */) { TypeIndex nested_type_index = data_type_array->getNestedType()->getTypeId(); if (!WhichDataType(nested_type_index).isFloat32()) - throw_unsupported_underlying_column_exception(data_type); + throw_unsupported_underlying_column_exception(); } else if (const auto * data_type_tuple = typeid_cast(data_type.get())) { @@ -379,11 +378,11 @@ void annoyIndexValidator(const IndexDescription & index, bool /* attach */) { TypeIndex nested_type_index = inner_type->getTypeId(); if (!WhichDataType(nested_type_index).isFloat32()) - throw_unsupported_underlying_column_exception(data_type); + throw_unsupported_underlying_column_exception(); } } else - throw_unsupported_underlying_column_exception(data_type); + throw_unsupported_underlying_column_exception(); } } diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.h b/src/Storages/MergeTree/MergeTreeIndexAnnoy.h index 5204ff07b27..cfc3b7519b8 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.h +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.h @@ -16,10 +16,10 @@ class AnnoyIndexWithSerialization : public Annoy::AnnoyIndex; public: - explicit AnnoyIndexWithSerialization(uint64_t dim); - void serialize(WriteBuffer& ostr) const; - void deserialize(ReadBuffer& istr); - uint64_t getNumOfDimensions() const; + explicit AnnoyIndexWithSerialization(size_t dimensions); + void serialize(WriteBuffer & ostr) const; + void deserialize(ReadBuffer & istr); + size_t getDimensions() const; }; template @@ -46,7 +46,7 @@ struct MergeTreeIndexGranuleAnnoy final : public IMergeTreeIndexGranule template struct MergeTreeIndexAggregatorAnnoy final : IMergeTreeIndexAggregator { - MergeTreeIndexAggregatorAnnoy(const String & index_name_, const Block & index_sample_block, uint64_t trees); + MergeTreeIndexAggregatorAnnoy(const String & index_name_, const Block & index_sample_block, UInt64 trees); ~MergeTreeIndexAggregatorAnnoy() override = default; bool empty() const override { return !index || index->get_n_items() == 0; } @@ -55,7 +55,7 @@ struct MergeTreeIndexAggregatorAnnoy final : IMergeTreeIndexAggregator const String index_name; const Block index_sample_block; - const uint64_t trees; + const UInt64 trees; AnnoyIndexWithSerializationPtr index; }; @@ -89,7 +89,7 @@ class MergeTreeIndexAnnoy : public IMergeTreeIndex { public: - MergeTreeIndexAnnoy(const IndexDescription & index_, uint64_t trees_, const String & distance_function_); + MergeTreeIndexAnnoy(const IndexDescription & index_, UInt64 trees_, const String & distance_function_); ~MergeTreeIndexAnnoy() override = default; @@ -100,7 +100,7 @@ public: bool mayBenefitFromIndexForIn(const ASTPtr & /*node*/) const override { return false; } private: - const uint64_t trees; + const UInt64 trees; const String distance_function; }; diff --git a/tests/queries/0_stateless/02354_annoy_index.reference b/tests/queries/0_stateless/02354_annoy_index.reference index 45515bc7733..5e01a6e566e 100644 --- a/tests/queries/0_stateless/02354_annoy_index.reference +++ b/tests/queries/0_stateless/02354_annoy_index.reference @@ -1,118 +1,144 @@ ---- Test with Array --- -WHERE type, L2Distance -1 [0,0,10] -2 [0,0,10.5] -3 [0,0,9.5] -4 [0,0,9.7] -5 [0,0,10.2] -ORDER BY type, L2Distance -1 [0,0,10] -5 [0,0,10.2] -4 [0,0,9.7] -WHERE type, L2Distance, check that index is used -Expression ((Projection + Before ORDER BY)) - Limit (preliminary LIMIT (without OFFSET)) - ReadFromMergeTree (default.tab) - Indexes: - PrimaryKey - Condition: true - Parts: 1/1 - Granules: 3/3 - Skip - Name: annoy_index - Description: annoy GRANULARITY 1 - Parts: 1/1 - Granules: 1/3 -ORDER BY type, L2Distance, check that index is used -Expression (Projection) - Limit (preliminary LIMIT (without OFFSET)) - Sorting (Sorting for ORDER BY) - Expression (Before ORDER BY) - ReadFromMergeTree (default.tab) - Indexes: - PrimaryKey - Condition: true - Parts: 1/1 - Granules: 3/3 - Skip - Name: annoy_index - Description: annoy GRANULARITY 1 - Parts: 1/1 - Granules: 3/3 -parameter annoy_index_search_k_nodes -parameter max_limit_for_ann_queries -Expression (Projection) - Limit (preliminary LIMIT (without OFFSET)) - Sorting (Sorting for ORDER BY) - Expression (Before ORDER BY) - ReadFromMergeTree (default.tab) - Indexes: - PrimaryKey - Condition: true - Parts: 1/1 - Granules: 3/3 ---- Test with Tuple --- -WHERE type, L2Distance -1 (0,0,10) -2 (0,0,10.5) -3 (0,0,9.5) -4 (0,0,9.7) -5 (0,0,10.2) -ORDER BY type, L2Distance -1 (0,0,10) -5 (0,0,10.2) -4 (0,0,9.7) -WHERE type, L2Distance, check that index is used -Expression ((Projection + Before ORDER BY)) - Limit (preliminary LIMIT (without OFFSET)) - ReadFromMergeTree (default.tab) - Indexes: - PrimaryKey - Condition: true - Parts: 1/1 - Granules: 3/3 - Skip - Name: annoy_index - Description: annoy GRANULARITY 1 - Parts: 1/1 - Granules: 1/3 -ORDER BY type, L2Distance, check that index is used -Expression (Projection) - Limit (preliminary LIMIT (without OFFSET)) - Sorting (Sorting for ORDER BY) - Expression (Before ORDER BY) - ReadFromMergeTree (default.tab) - Indexes: - PrimaryKey - Condition: true - Parts: 1/1 - Granules: 3/3 - Skip - Name: annoy_index - Description: annoy GRANULARITY 1 - Parts: 1/1 - Granules: 3/3 -parameter annoy_index_search_k_nodes -parameter max_limit_for_ann_queries -Expression (Projection) - Limit (preliminary LIMIT (without OFFSET)) - Sorting (Sorting for ORDER BY) - Expression (Before ORDER BY) - ReadFromMergeTree (default.tab) - Indexes: - PrimaryKey - Condition: true - Parts: 1/1 - Granules: 3/3 ---- Test alternative metric (cosine distance) and non-default NumTrees --- -WHERE type, L2Distance -1 [0,0,10] -2 [0,0,10.5] -3 [0,0,9.5] -4 [0,0,9.7] -5 [0,0,10.2] -ORDER BY type, L2Distance -1 [0,0,10] -5 [0,0,10.2] -4 [0,0,9.7] --- Negative tests --- +--- Test default GRANULARITY (should be 100 mio. for annoy)--- +CREATE TABLE default.tab\n(\n `id` Int32,\n `vector` Array(Float32),\n INDEX annoy_index vector TYPE annoy GRANULARITY 100000000\n)\nENGINE = MergeTree\nORDER BY id\nSETTINGS index_granularity = 8192 +CREATE TABLE default.tab\n(\n `id` Int32,\n `vector` Array(Float32),\n INDEX annoy_index vector TYPE annoy GRANULARITY 100000000\n)\nENGINE = MergeTree\nORDER BY id\nSETTINGS index_granularity = 8192 +--- Test with Array, GRANULARITY = 1, index_granularity = 5 --- +WHERE type, L2Distance, check that index is used +Expression ((Projection + Before ORDER BY)) + Limit (preliminary LIMIT (without OFFSET)) + ReadFromMergeTree (default.tab) + Indexes: + PrimaryKey + Condition: true + Parts: 1/1 + Granules: 3/3 + Skip + Name: annoy_index + Description: annoy GRANULARITY 1 + Parts: 1/1 + Granules: 1/3 +ORDER BY type, L2Distance, check that index is used +Expression (Projection) + Limit (preliminary LIMIT (without OFFSET)) + Sorting (Sorting for ORDER BY) + Expression (Before ORDER BY) + ReadFromMergeTree (default.tab) + Indexes: + PrimaryKey + Condition: true + Parts: 1/1 + Granules: 3/3 + Skip + Name: annoy_index + Description: annoy GRANULARITY 1 + Parts: 1/1 + Granules: 3/3 +Reference ARRAYs with non-matching dimension are rejected +Special case: MaximumDistance is negative +WHERE type, L2Distance +Special case: setting annoy_index_search_k_nodes +Special case: setting max_limit_for_ann_queries +Expression (Projection) + Limit (preliminary LIMIT (without OFFSET)) + Sorting (Sorting for ORDER BY) + Expression (Before ORDER BY) + ReadFromMergeTree (default.tab) + Indexes: + PrimaryKey + Condition: true + Parts: 1/1 + Granules: 3/3 +--- Test with Tuple, GRANULARITY = 1, index_granularity = 5 --- +WHERE type, L2Distance, check that index is used +Expression ((Projection + Before ORDER BY)) + Limit (preliminary LIMIT (without OFFSET)) + ReadFromMergeTree (default.tab) + Indexes: + PrimaryKey + Condition: true + Parts: 1/1 + Granules: 3/3 + Skip + Name: annoy_index + Description: annoy GRANULARITY 1 + Parts: 1/1 + Granules: 1/3 +ORDER BY type, L2Distance, check that index is used +Expression (Projection) + Limit (preliminary LIMIT (without OFFSET)) + Sorting (Sorting for ORDER BY) + Expression (Before ORDER BY) + ReadFromMergeTree (default.tab) + Indexes: + PrimaryKey + Condition: true + Parts: 1/1 + Granules: 3/3 + Skip + Name: annoy_index + Description: annoy GRANULARITY 1 + Parts: 1/1 + Granules: 3/3 +--- Test non-default metric (cosine distance) + non-default NumTrees (200) --- +--- Test with Array, GRANULARITY = 2, index_granularity = 4 --- +WHERE type, L2Distance, check that index is used +Expression ((Projection + Before ORDER BY)) + Limit (preliminary LIMIT (without OFFSET)) + ReadFromMergeTree (default.tab) + Indexes: + PrimaryKey + Condition: true + Parts: 1/1 + Granules: 4/4 + Skip + Name: annoy_index + Description: annoy GRANULARITY 2 + Parts: 0/1 + Granules: 2/4 +ORDER BY type, L2Distance, check that index is used +Expression (Projection) + Limit (preliminary LIMIT (without OFFSET)) + Sorting (Sorting for ORDER BY) + Expression (Before ORDER BY) + ReadFromMergeTree (default.tab) + Indexes: + PrimaryKey + Condition: true + Parts: 1/1 + Granules: 4/4 + Skip + Name: annoy_index + Description: annoy GRANULARITY 2 + Parts: 1/1 + Granules: 4/4 +--- Test with Array, GRANULARITY = 4, index_granularity = 4 --- +WHERE type, L2Distance, check that index is used +Expression ((Projection + Before ORDER BY)) + Limit (preliminary LIMIT (without OFFSET)) + ReadFromMergeTree (default.tab) + Indexes: + PrimaryKey + Condition: true + Parts: 1/1 + Granules: 4/4 + Skip + Name: annoy_index + Description: annoy GRANULARITY 4 + Parts: 0/1 + Granules: 3/4 +ORDER BY type, L2Distance, check that index is used +Expression (Projection) + Limit (preliminary LIMIT (without OFFSET)) + Sorting (Sorting for ORDER BY) + Expression (Before ORDER BY) + ReadFromMergeTree (default.tab) + Indexes: + PrimaryKey + Condition: true + Parts: 1/1 + Granules: 4/4 + Skip + Name: annoy_index + Description: annoy GRANULARITY 4 + Parts: 1/1 + Granules: 4/4 diff --git a/tests/queries/0_stateless/02354_annoy_index.sql b/tests/queries/0_stateless/02354_annoy_index.sql index abee5e8a6e4..fefb51dfcc9 100644 --- a/tests/queries/0_stateless/02354_annoy_index.sql +++ b/tests/queries/0_stateless/02354_annoy_index.sql @@ -1,150 +1,251 @@ --- Tags: disabled, no-fasttest, no-ubsan, no-cpu-aarch64, no-upgrade-check +-- Tags: no-fasttest, no-ubsan, no-cpu-aarch64, no-upgrade-check SET allow_experimental_annoy_index = 1; - -SELECT '--- Test with Array ---'; - -DROP TABLE IF EXISTS tab; -CREATE TABLE tab(id Int32, embedding Array(Float32), INDEX annoy_index embedding TYPE annoy()) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity=5; -INSERT INTO tab VALUES (1, [0.0, 0.0, 10.0]), (2, [0.0, 0.0, 10.5]), (3, [0.0, 0.0, 9.5]), (4, [0.0, 0.0, 9.7]), (5, [0.0, 0.0, 10.2]), (6, [10.0, 0.0, 0.0]), (7, [9.5, 0.0, 0.0]), (8, [9.7, 0.0, 0.0]), (9, [10.2, 0.0, 0.0]), (10, [10.5, 0.0, 0.0]), (11, [0.0, 10.0, 0.0]), (12, [0.0, 9.5, 0.0]), (13, [0.0, 9.7, 0.0]), (14, [0.0, 10.2, 0.0]), (15, [0.0, 10.5, 0.0]); - -SELECT 'WHERE type, L2Distance'; -SELECT * -FROM tab -WHERE L2Distance(embedding, [0.0, 0.0, 10.0]) < 1.0 -LIMIT 5; - -SELECT 'ORDER BY type, L2Distance'; -SELECT * -FROM tab -ORDER BY L2Distance(embedding, [0.0, 0.0, 10.0]) -LIMIT 3; - --- Produces different error code with analyzer, TODO: check --- SELECT 'Reference ARRAYs with non-matching dimension are rejected'; --- SELECT * --- FROM tab --- ORDER BY L2Distance(embedding, [0.0, 0.0]) --- LIMIT 3; -- { serverError INCORRECT_QUERY } - -SELECT 'WHERE type, L2Distance, check that index is used'; -EXPLAIN indexes=1 -SELECT * -FROM tab -WHERE L2Distance(embedding, [0.0, 0.0, 10.0]) < 1.0 -LIMIT 5; - -SELECT 'ORDER BY type, L2Distance, check that index is used'; -EXPLAIN indexes=1 -SELECT * -FROM tab -ORDER BY L2Distance(embedding, [0.0, 0.0, 10.0]) -LIMIT 3; - -SELECT 'parameter annoy_index_search_k_nodes'; -SELECT * -FROM tab -ORDER BY L2Distance(embedding, [5.3, 7.3, 2.1]) -LIMIT 5 -SETTINGS annoy_index_search_k_nodes=0; -- searches zero nodes --> no results - -SELECT 'parameter max_limit_for_ann_queries'; -EXPLAIN indexes=1 -SELECT * -FROM tab -ORDER BY L2Distance(embedding, [5.3, 7.3, 2.1]) -LIMIT 5 -SETTINGS max_limit_for_ann_queries=2; -- doesn't use the ann index - -DROP TABLE tab; - -SELECT '--- Test with Tuple ---'; - -CREATE TABLE tab(id Int32, embedding Tuple(Float32, Float32, Float32), INDEX annoy_index embedding TYPE annoy()) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity=5; -INSERT INTO tab VALUES (1, (0.0, 0.0, 10.0)), (2, (0.0, 0.0, 10.5)), (3, (0.0, 0.0, 9.5)), (4, (0.0, 0.0, 9.7)), (5, (0.0, 0.0, 10.2)), (6, (10.0, 0.0, 0.0)), (7, (9.5, 0.0, 0.0)), (8, (9.7, 0.0, 0.0)), (9, (10.2, 0.0, 0.0)), (10, (10.5, 0.0, 0.0)), (11, (0.0, 10.0, 0.0)), (12, (0.0, 9.5, 0.0)), (13, (0.0, 9.7, 0.0)), (14, (0.0, 10.2, 0.0)), (15, (0.0, 10.5, 0.0)); - -SELECT 'WHERE type, L2Distance'; -SELECT * -FROM tab -WHERE L2Distance(embedding, (0.0, 0.0, 10.0)) < 1.0 -LIMIT 5; - -SELECT 'ORDER BY type, L2Distance'; -SELECT * -FROM tab -ORDER BY L2Distance(embedding, (0.0, 0.0, 10.0)) -LIMIT 3; - -SELECT 'WHERE type, L2Distance, check that index is used'; -EXPLAIN indexes=1 -SELECT * -FROM tab -WHERE L2Distance(embedding, (0.0, 0.0, 10.0)) < 1.0 -LIMIT 5; - -SELECT 'ORDER BY type, L2Distance, check that index is used'; -EXPLAIN indexes=1 -SELECT * -FROM tab -ORDER BY L2Distance(embedding, (0.0, 0.0, 10.0)) -LIMIT 3; - -SELECT 'parameter annoy_index_search_k_nodes'; -SELECT * -FROM tab -ORDER BY L2Distance(embedding, (5.3, 7.3, 2.1)) -LIMIT 5 -SETTINGS annoy_index_search_k_nodes=0; -- searches zero nodes --> no results - -SELECT 'parameter max_limit_for_ann_queries'; -EXPLAIN indexes=1 -SELECT * -FROM tab -ORDER BY L2Distance(embedding, (5.3, 7.3, 2.1)) -LIMIT 5 -SETTINGS max_limit_for_ann_queries=2; -- doesn't use the ann index - -DROP TABLE tab; - -SELECT '--- Test alternative metric (cosine distance) and non-default NumTrees ---'; - -CREATE TABLE tab(id Int32, embedding Array(Float32), INDEX annoy_index embedding TYPE annoy('cosineDistance', 200)) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity=5; -INSERT INTO tab VALUES (1, [0.0, 0.0, 10.0]), (2, [0.0, 0.0, 10.5]), (3, [0.0, 0.0, 9.5]), (4, [0.0, 0.0, 9.7]), (5, [0.0, 0.0, 10.2]), (6, [10.0, 0.0, 0.0]), (7, [9.5, 0.0, 0.0]), (8, [9.7, 0.0, 0.0]), (9, [10.2, 0.0, 0.0]), (10, [10.5, 0.0, 0.0]), (11, [0.0, 10.0, 0.0]), (12, [0.0, 9.5, 0.0]), (13, [0.0, 9.7, 0.0]), (14, [0.0, 10.2, 0.0]), (15, [0.0, 10.5, 0.0]); - -SELECT 'WHERE type, L2Distance'; -SELECT * -FROM tab -WHERE L2Distance(embedding, [0.0, 0.0, 10.0]) < 1.0 -LIMIT 5; - -SELECT 'ORDER BY type, L2Distance'; -SELECT * -FROM tab -ORDER BY L2Distance(embedding, [0.0, 0.0, 10.0]) -LIMIT 3; - -DROP TABLE tab; +SET allow_experimental_analyzer = 0; SELECT '--- Negative tests ---'; +DROP TABLE IF EXISTS tab; + -- must have at most 2 arguments -CREATE TABLE tab(id Int32, embedding Array(Float32), INDEX annoy_index embedding TYPE annoy('too', 'many', 'arguments')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } +CREATE TABLE tab(id Int32, vector Array(Float32), INDEX annoy_index vector TYPE annoy('too', 'many', 'arguments')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } -- first argument (distance_function) must be String -CREATE TABLE tab(id Int32, embedding Array(Float32), INDEX annoy_index embedding TYPE annoy(3)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } +CREATE TABLE tab(id Int32, vector Array(Float32), INDEX annoy_index vector TYPE annoy(3)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } -- 2nd argument (number of trees) must be UInt64 -CREATE TABLE tab(id Int32, embedding Array(Float32), INDEX annoy_index embedding TYPE annoy('L2Distance', 'not an UInt64')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } - --- reject unsupported distance functions -CREATE TABLE tab(id Int32, embedding Array(Float32), INDEX annoy_index embedding TYPE annoy('wormholeDistance')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_DATA } +CREATE TABLE tab(id Int32, vector Array(Float32), INDEX annoy_index vector TYPE annoy('L2Distance', 'not an UInt64')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } -- must be created on single column -CREATE TABLE tab(id Int32, embedding Array(Float32), INDEX annoy_index (embedding, id) TYPE annoy()) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_NUMBER_OF_COLUMNS } +CREATE TABLE tab(id Int32, vector Array(Float32), INDEX annoy_index (vector, id) TYPE annoy()) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_NUMBER_OF_COLUMNS } + +-- reject unsupported distance functions +CREATE TABLE tab(id Int32, vector Array(Float32), INDEX annoy_index vector TYPE annoy('wormholeDistance')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_DATA } -- must be created on Array/Tuple(Float32) columns SET allow_suspicious_low_cardinality_types = 1; -CREATE TABLE tab(id Int32, embedding Float32, INDEX annoy_index embedding TYPE annoy()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN } -CREATE TABLE tab(id Int32, embedding Array(Float64), INDEX annoy_index embedding TYPE annoy()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN } -CREATE TABLE tab(id Int32, embedding LowCardinality(Float32), INDEX annoy_index embedding TYPE annoy()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN } -CREATE TABLE tab(id Int32, embedding Nullable(Float32), INDEX annoy_index embedding TYPE annoy()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN } +CREATE TABLE tab(id Int32, vector Float32, INDEX annoy_index vector TYPE annoy()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN } +CREATE TABLE tab(id Int32, vector Array(Float64), INDEX annoy_index vector TYPE annoy()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN } +CREATE TABLE tab(id Int32, vector Tuple(Float64), INDEX annoy_index vector TYPE annoy()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN } +CREATE TABLE tab(id Int32, vector LowCardinality(Float32), INDEX annoy_index vector TYPE annoy()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN } +CREATE TABLE tab(id Int32, vector Nullable(Float32), INDEX annoy_index vector TYPE annoy()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN } + +SELECT '--- Test default GRANULARITY (should be 100 mio. for annoy)---'; + +CREATE TABLE tab (id Int32, vector Array(Float32), INDEX annoy_index(vector) TYPE annoy) ENGINE=MergeTree ORDER BY id; +SHOW CREATE TABLE tab; +DROP TABLE tab; + +CREATE TABLE tab (id Int32, vector Array(Float32)) ENGINE=MergeTree ORDER BY id; +ALTER TABLE tab ADD INDEX annoy_index(vector) TYPE annoy; +SHOW CREATE TABLE tab; + +DROP TABLE tab; + +SELECT '--- Test with Array, GRANULARITY = 1, index_granularity = 5 ---'; + +DROP TABLE IF EXISTS tab; +CREATE TABLE tab(id Int32, vector Array(Float32), INDEX annoy_index vector TYPE annoy() GRANULARITY 1) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 5; +INSERT INTO tab VALUES (1, [0.0, 0.0, 10.0]), (2, [0.0, 0.0, 10.5]), (3, [0.0, 0.0, 9.5]), (4, [0.0, 0.0, 9.7]), (5, [0.0, 0.0, 10.2]), (6, [10.0, 0.0, 0.0]), (7, [9.5, 0.0, 0.0]), (8, [9.7, 0.0, 0.0]), (9, [10.2, 0.0, 0.0]), (10, [10.5, 0.0, 0.0]), (11, [0.0, 10.0, 0.0]), (12, [0.0, 9.5, 0.0]), (13, [0.0, 9.7, 0.0]), (14, [0.0, 10.2, 0.0]), (15, [0.0, 10.5, 0.0]); + +-- rows = 15, index_granularity = 5, GRANULARITY = 1 gives 3 annoy-indexed blocks (each comprising a single granule) +-- condition 'L2Distance(vector, reference_vector) < 1.0' ensures that only one annoy-indexed block produces results --> "Granules: 1/3" + +-- See (*) why commented out +-- SELECT 'WHERE type, L2Distance'; +-- SELECT * +-- FROM tab +-- WHERE L2Distance(vector, [0.0, 0.0, 10.0]) < 1.0 +-- LIMIT 3; + +SELECT 'WHERE type, L2Distance, check that index is used'; +EXPLAIN indexes=1 +SELECT * +FROM tab +WHERE L2Distance(vector, [0.0, 0.0, 10.0]) < 1.0 +LIMIT 3; + +-- See (*) why commented out +-- SELECT 'ORDER BY type, L2Distance'; +-- SELECT * +-- FROM tab +-- ORDER BY L2Distance(vector, [0.0, 0.0, 10.0]) +-- LIMIT 3; + +SELECT 'ORDER BY type, L2Distance, check that index is used'; +EXPLAIN indexes=1 +SELECT * +FROM tab +ORDER BY L2Distance(vector, [0.0, 0.0, 10.0]) +LIMIT 3; + +-- Test special cases. Corresponding special case tests are omitted from later tests. + +SELECT 'Reference ARRAYs with non-matching dimension are rejected'; +SELECT * +FROM tab +ORDER BY L2Distance(vector, [0.0, 0.0]) +LIMIT 3; -- { serverError INCORRECT_QUERY } + +SELECT 'Special case: MaximumDistance is negative'; +SELECT 'WHERE type, L2Distance'; +SELECT * +FROM tab +WHERE L2Distance(vector, [0.0, 0.0, 10.0]) < -1.0 +LIMIT 3; -- { serverError INCORRECT_QUERY } + +SELECT 'Special case: setting annoy_index_search_k_nodes'; +SELECT * +FROM tab +ORDER BY L2Distance(vector, [5.3, 7.3, 2.1]) +LIMIT 3 +SETTINGS annoy_index_search_k_nodes=0; -- searches zero nodes --> no results + +SELECT 'Special case: setting max_limit_for_ann_queries'; +EXPLAIN indexes=1 +SELECT * +FROM tab +ORDER BY L2Distance(vector, [5.3, 7.3, 2.1]) +LIMIT 3 +SETTINGS max_limit_for_ann_queries=2; -- doesn't use the ann index + +DROP TABLE tab; + +-- Test Tuple embeddings. Triggers different logic than Array inside MergeTreeIndexAnnoy but the same logic as Array above MergeTreeIndexAnnoy. +-- Therefore test Tuple case just once. + +SELECT '--- Test with Tuple, GRANULARITY = 1, index_granularity = 5 ---'; + +CREATE TABLE tab(id Int32, vector Tuple(Float32, Float32, Float32), INDEX annoy_index vector TYPE annoy() GRANULARITY 1) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 5; +INSERT INTO tab VALUES (1, (0.0, 0.0, 10.0)), (2, (0.0, 0.0, 10.5)), (3, (0.0, 0.0, 9.5)), (4, (0.0, 0.0, 9.7)), (5, (0.0, 0.0, 10.2)), (6, (10.0, 0.0, 0.0)), (7, (9.5, 0.0, 0.0)), (8, (9.7, 0.0, 0.0)), (9, (10.2, 0.0, 0.0)), (10, (10.5, 0.0, 0.0)), (11, (0.0, 10.0, 0.0)), (12, (0.0, 9.5, 0.0)), (13, (0.0, 9.7, 0.0)), (14, (0.0, 10.2, 0.0)), (15, (0.0, 10.5, 0.0)); + +-- See (*) why commented out +-- SELECT 'WHERE type, L2Distance'; +-- SELECT * +-- FROM tab +-- WHERE L2Distance(vector, (0.0, 0.0, 10.0)) < 1.0 +-- LIMIT 3; + +SELECT 'WHERE type, L2Distance, check that index is used'; +EXPLAIN indexes=1 +SELECT * +FROM tab +WHERE L2Distance(vector, (0.0, 0.0, 10.0)) < 1.0 +LIMIT 3; + +-- See (*) why commented out +-- SELECT 'ORDER BY type, L2Distance'; +-- SELECT * +-- FROM tab +-- ORDER BY L2Distance(vector, (0.0, 0.0, 10.0)) +-- LIMIT 3; + +SELECT 'ORDER BY type, L2Distance, check that index is used'; +EXPLAIN indexes=1 +SELECT * +FROM tab +ORDER BY L2Distance(vector, (0.0, 0.0, 10.0)) +LIMIT 3; + +DROP TABLE tab; + +-- Not a systematic test, just to make sure no bad things happen +SELECT '--- Test non-default metric (cosine distance) + non-default NumTrees (200) ---'; + +CREATE TABLE tab(id Int32, vector Array(Float32), INDEX annoy_index vector TYPE annoy('cosineDistance', 200) GRANULARITY 1) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 5; +INSERT INTO tab VALUES (1, [0.0, 0.0, 10.0]), (2, [0.0, 0.0, 10.5]), (3, [0.0, 0.0, 9.5]), (4, [0.0, 0.0, 9.7]), (5, [0.0, 0.0, 10.2]), (6, [10.0, 0.0, 0.0]), (7, [9.5, 0.0, 0.0]), (8, [9.7, 0.0, 0.0]), (9, [10.2, 0.0, 0.0]), (10, [10.5, 0.0, 0.0]), (11, [0.0, 10.0, 0.0]), (12, [0.0, 9.5, 0.0]), (13, [0.0, 9.7, 0.0]), (14, [0.0, 10.2, 0.0]), (15, [0.0, 10.5, 0.0]); + +-- See (*) why commented out +-- SELECT 'WHERE type, L2Distance'; +-- SELECT * +-- FROM tab +-- WHERE L2Distance(vector, [0.0, 0.0, 10.0]) < 1.0 +-- LIMIT 3; + +-- See (*) why commented out +-- SELECT 'ORDER BY type, L2Distance'; +-- SELECT * +-- FROM tab +-- ORDER BY L2Distance(vector, [0.0, 0.0, 10.0]) +-- LIMIT 3; + +DROP TABLE tab; + +SELECT '--- Test with Array, GRANULARITY = 2, index_granularity = 4 ---'; + +CREATE TABLE tab(id Int32, vector Array(Float32), INDEX annoy_index vector TYPE annoy() GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 4; +INSERT INTO tab VALUES (1, [0.0, 0.0, 10.0, 0.0]), (2, [0.0, 0.0, 10.5, 0.0]), (3, [0.0, 0.0, 9.5, 0.0]), (4, [0.0, 0.0, 9.7, 0.0]), (5, [10.0, 0.0, 0.0, 0.0]), (6, [9.5, 0.0, 0.0, 0.0]), (7, [9.7, 0.0, 0.0, 0.0]), (8, [10.2, 0.0, 0.0, 0.0]), (9, [0.0, 10.0, 0.0, 0.0]), (10, [0.0, 9.5, 0.0, 0.0]), (11, [0.0, 9.7, 0.0, 0.0]), (12, [0.0, 9.7, 0.0, 0.0]), (13, [0.0, 0.0, 0.0, 10.3]), (14, [0.0, 0.0, 0.0, 9.5]), (15, [0.0, 0.0, 0.0, 10.0]), (16, [0.0, 0.0, 0.0, 10.5]); + +-- rows = 16, index_granularity = 4, GRANULARITY = 2 gives 2 annoy-indexed blocks (each comprising two granules) +-- condition 'L2Distance(vector, reference_vector) < 1.0' ensures that only one annoy-indexed block produces results --> "Granules: 2/4" + +-- See (*) why commented out +-- SELECT 'WHERE type, L2Distance'; +-- SELECT * +-- FROM tab +-- WHERE L2Distance(vector, [10.0, 0.0, 10.0, 0.0]) < 5.0 +-- LIMIT 3; + +SELECT 'WHERE type, L2Distance, check that index is used'; +EXPLAIN indexes=1 +SELECT * +FROM tab +WHERE L2Distance(vector, [10.0, 0.0, 10.0, 0.0]) < 5.0 +LIMIT 3; + +-- See (*) why commented out +-- SELECT 'ORDER BY type, L2Distance'; +-- SELECT * +-- FROM tab +-- ORDER BY L2Distance(vector, [10.0, 0.0, 10.0, 0.0]) +-- LIMIT 3; + +SELECT 'ORDER BY type, L2Distance, check that index is used'; +EXPLAIN indexes=1 +SELECT * +FROM tab +ORDER BY L2Distance(vector, [10.0, 0.0, 10.0, 0.0]) +LIMIT 3; + +DROP TABLE tab; + +SELECT '--- Test with Array, GRANULARITY = 4, index_granularity = 4 ---'; + +CREATE TABLE tab(id Int32, vector Array(Float32), INDEX annoy_index vector TYPE annoy() GRANULARITY 4) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 4; +INSERT INTO tab VALUES (1, [0.0, 0.0, 10.0, 0.0]), (2, [0.0, 0.0, 10.5, 0.0]), (3, [0.0, 0.0, 9.5, 0.0]), (4, [0.0, 0.0, 9.7, 0.0]), (5, [10.0, 0.0, 0.0, 0.0]), (6, [9.5, 0.0, 0.0, 0.0]), (7, [9.7, 0.0, 0.0, 0.0]), (8, [10.2, 0.0, 0.0, 0.0]), (9, [0.0, 10.0, 0.0, 0.0]), (10, [0.0, 9.5, 0.0, 0.0]), (11, [0.0, 9.7, 0.0, 0.0]), (12, [0.0, 9.7, 0.0, 0.0]), (13, [0.0, 0.0, 0.0, 10.3]), (14, [0.0, 0.0, 0.0, 9.5]), (15, [0.0, 0.0, 0.0, 10.0]), (16, [0.0, 0.0, 0.0, 10.5]); + +-- rows = 16, index_granularity = 4, GRANULARITY = 4 gives a single annoy-indexed block (comprising all granules) +-- no two matches happen to be located in the same granule, so with LIMIT = 3, we'll get "Granules: 2/4" + +-- See (*) why commented out +-- SELECT 'WHERE type, L2Distance'; +-- SELECT * +-- FROM tab +-- WHERE L2Distance(vector, [10.0, 0.0, 10.0, 0.0]) < 5.0 +-- LIMIT 3; + +SELECT 'WHERE type, L2Distance, check that index is used'; +EXPLAIN indexes=1 +SELECT * +FROM tab +WHERE L2Distance(vector, [10.0, 0.0, 10.0, 0.0]) < 5.0 +LIMIT 3; + +-- See (*) why commented out +-- SELECT 'ORDER BY type, L2Distance'; +-- SELECT * +-- FROM tab +-- ORDER BY L2Distance(vector, [10.0, 0.0, 10.0, 0.0]) +-- LIMIT 3; + +SELECT 'ORDER BY type, L2Distance, check that index is used'; +EXPLAIN indexes=1 +SELECT * +FROM tab +ORDER BY L2Distance(vector, [10.0, 0.0, 10.0, 0.0]) +LIMIT 3; + +DROP TABLE tab; + +-- (*) Storage and search in Annoy indexes is inherently random. Tests which check for exact row matches would be unstable. Therefore, +-- comment them out. diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt index d6cef1883f4..021855e399f 100644 --- a/utils/check-style/aspell-ignore/en/aspell-dict.txt +++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt @@ -26,6 +26,7 @@ AlertManager Alexey AnyEvent AppleClang +Approximative ArrayJoin ArrowStream AsyncInsertCacheSize @@ -1005,6 +1006,7 @@ anyLast anyheavy anylast appendTrailingCharIfAbsent +approximative argMax argMin argmax @@ -2419,6 +2421,7 @@ unescaping unhex unicode unidimensional +unintuitive uniq uniqCombined uniqExact From b6e32cd5e3ba632805905c2c1ea8493471053f9f Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Thu, 8 Jun 2023 15:30:23 +0000 Subject: [PATCH 0672/1072] Do not read all the columns from right GLOBAL JOIN table. --- src/Interpreters/ExpressionAnalyzer.cpp | 11 ++------ src/Interpreters/GlobalSubqueriesVisitor.h | 28 +++++++++++++++---- src/Interpreters/TableJoin.cpp | 9 ++++++ src/Interpreters/TableJoin.h | 2 ++ ...785_global_join_too_many_columns.reference | 1 + .../02785_global_join_too_many_columns.sql | 14 ++++++++++ 6 files changed, 51 insertions(+), 14 deletions(-) create mode 100644 tests/queries/0_stateless/02785_global_join_too_many_columns.reference create mode 100644 tests/queries/0_stateless/02785_global_join_too_many_columns.sql diff --git a/src/Interpreters/ExpressionAnalyzer.cpp b/src/Interpreters/ExpressionAnalyzer.cpp index c7c66f6f414..307b46b3a0b 100644 --- a/src/Interpreters/ExpressionAnalyzer.cpp +++ b/src/Interpreters/ExpressionAnalyzer.cpp @@ -444,7 +444,7 @@ void ExpressionAnalyzer::initGlobalSubqueriesAndExternalTables(bool do_global, b if (do_global) { GlobalSubqueriesVisitor::Data subqueries_data( - getContext(), subquery_depth, isRemoteStorage(), is_explain, external_tables, prepared_sets, has_global_subqueries); + getContext(), subquery_depth, isRemoteStorage(), is_explain, external_tables, prepared_sets, has_global_subqueries, syntax->analyzed_join.get()); GlobalSubqueriesVisitor(subqueries_data).visit(query); } } @@ -1056,13 +1056,6 @@ JoinPtr SelectQueryExpressionAnalyzer::appendJoin( return join; } -static ActionsDAGPtr createJoinedBlockActions(ContextPtr context, const TableJoin & analyzed_join) -{ - ASTPtr expression_list = analyzed_join.rightKeysList(); - auto syntax_result = TreeRewriter(context).analyze(expression_list, analyzed_join.columnsFromJoinedTable()); - return ExpressionAnalyzer(expression_list, syntax_result, context).getActionsDAG(true, false); -} - std::shared_ptr tryKeyValueJoin(std::shared_ptr analyzed_join, const Block & right_sample_block); @@ -1144,7 +1137,7 @@ static std::unique_ptr buildJoinedPlan( SelectQueryOptions query_options) { /// Actions which need to be calculated on joined block. - auto joined_block_actions = createJoinedBlockActions(context, analyzed_join); + auto joined_block_actions = analyzed_join.createJoinedBlockActions(context); NamesWithAliases required_columns_with_aliases = analyzed_join.getRequiredColumns( Block(joined_block_actions->getResultColumns()), joined_block_actions->getRequiredColumns().getNames()); diff --git a/src/Interpreters/GlobalSubqueriesVisitor.h b/src/Interpreters/GlobalSubqueriesVisitor.h index 2901f2e23d0..f5b837fc7f7 100644 --- a/src/Interpreters/GlobalSubqueriesVisitor.h +++ b/src/Interpreters/GlobalSubqueriesVisitor.h @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -43,6 +44,7 @@ public: TemporaryTablesMapping & external_tables; PreparedSetsPtr prepared_sets; bool & has_global_subqueries; + TableJoin * table_join; Data( ContextPtr context_, @@ -51,7 +53,8 @@ public: bool is_explain_, TemporaryTablesMapping & tables, PreparedSetsPtr prepared_sets_, - bool & has_global_subqueries_) + bool & has_global_subqueries_, + TableJoin * table_join_) : WithContext(context_) , subquery_depth(subquery_depth_) , is_remote(is_remote_) @@ -59,10 +62,11 @@ public: , external_tables(tables) , prepared_sets(prepared_sets_) , has_global_subqueries(has_global_subqueries_) + , table_join(table_join_) { } - void addExternalStorage(ASTPtr & ast, bool set_alias = false) + void addExternalStorage(ASTPtr & ast, const Names & required_columns, bool set_alias = false) { /// With nondistributed queries, creating temporary tables does not make sense. if (!is_remote) @@ -145,7 +149,7 @@ public: if (external_tables.contains(external_table_name)) return; - auto interpreter = interpretSubquery(subquery_or_table_name, getContext(), subquery_depth, {}); + auto interpreter = interpretSubquery(subquery_or_table_name, getContext(), subquery_depth, required_columns); Block sample = interpreter->getSampleBlock(); NamesAndTypesList columns = sample.getNamesAndTypesList(); @@ -238,7 +242,7 @@ private: return; } - data.addExternalStorage(ast); + data.addExternalStorage(ast, {}); data.has_global_subqueries = true; } } @@ -249,7 +253,21 @@ private: if (table_elem.table_join && (table_elem.table_join->as().locality == JoinLocality::Global || shouldBeExecutedGlobally(data))) { - data.addExternalStorage(table_elem.table_expression, true); + Names required_columns; + + /// Fill required columns for GLOBAL JOIN. + /// This code is partial copy-paste from ExpressionAnalyzer. + if (data.table_join) + { + auto joined_block_actions = data.table_join->createJoinedBlockActions(data.getContext()); + NamesWithAliases required_columns_with_aliases = data.table_join->getRequiredColumns( + Block(joined_block_actions->getResultColumns()), joined_block_actions->getRequiredColumns().getNames()); + + for (auto & pr : required_columns_with_aliases) + required_columns.push_back(pr.first); + } + + data.addExternalStorage(table_elem.table_expression, required_columns, true); data.has_global_subqueries = true; } } diff --git a/src/Interpreters/TableJoin.cpp b/src/Interpreters/TableJoin.cpp index 5a23fbd00ff..c56d1e6039e 100644 --- a/src/Interpreters/TableJoin.cpp +++ b/src/Interpreters/TableJoin.cpp @@ -14,6 +14,8 @@ #include #include +#include +#include #include #include @@ -760,4 +762,11 @@ bool TableJoin::allowParallelHashJoin() const return true; } +ActionsDAGPtr TableJoin::createJoinedBlockActions(ContextPtr context) +{ + ASTPtr expression_list = rightKeysList(); + auto syntax_result = TreeRewriter(context).analyze(expression_list, columnsFromJoinedTable()); + return ExpressionAnalyzer(expression_list, syntax_result, context).getActionsDAG(true, false); +} + } diff --git a/src/Interpreters/TableJoin.h b/src/Interpreters/TableJoin.h index 6737cd8f13a..096e58d1292 100644 --- a/src/Interpreters/TableJoin.h +++ b/src/Interpreters/TableJoin.h @@ -217,6 +217,8 @@ public: const SizeLimits & sizeLimits() const { return size_limits; } VolumePtr getGlobalTemporaryVolume() { return tmp_volume; } + ActionsDAGPtr createJoinedBlockActions(ContextPtr context); + bool isEnabledAlgorithm(JoinAlgorithm val) const { /// When join_algorithm = 'default' (not specified by user) we use hash or direct algorithm. diff --git a/tests/queries/0_stateless/02785_global_join_too_many_columns.reference b/tests/queries/0_stateless/02785_global_join_too_many_columns.reference new file mode 100644 index 00000000000..425151f3a41 --- /dev/null +++ b/tests/queries/0_stateless/02785_global_join_too_many_columns.reference @@ -0,0 +1 @@ +40 diff --git a/tests/queries/0_stateless/02785_global_join_too_many_columns.sql b/tests/queries/0_stateless/02785_global_join_too_many_columns.sql new file mode 100644 index 00000000000..a49aae25f1c --- /dev/null +++ b/tests/queries/0_stateless/02785_global_join_too_many_columns.sql @@ -0,0 +1,14 @@ +drop table if exists local; +drop table if exists distr; + +create table local (a UInt64, b UInt64, c UInt64, d UInt64, e UInt64, f UInt64, g UInt64, h UInt64) engine = Log; +create table distr as local engine = Distributed('test_cluster_two_shards', currentDatabase(), local); + +insert into local (a) select number from numbers(10); + +set max_columns_to_read=1; +select count() from distr as l global all left join distr as r on l.a = r.a; + +drop table if exists local; +drop table if exists distr; + From 01873e9e6d9aeaaa8ae8dd2e78bd9f75ea5c70e6 Mon Sep 17 00:00:00 2001 From: Jordi Villar Date: Thu, 8 Jun 2023 14:09:01 +0200 Subject: [PATCH 0673/1072] Add async insert system tables documentation --- .../settings.md | 26 ++++++++ .../system-tables/asynchronous_insert_log.md | 63 +++++++++++++++++++ .../system-tables/asynchronous_inserts.md | 45 +++++++++++++ 3 files changed, 134 insertions(+) create mode 100644 docs/en/operations/system-tables/asynchronous_insert_log.md create mode 100644 docs/en/operations/system-tables/asynchronous_inserts.md diff --git a/docs/en/operations/server-configuration-parameters/settings.md b/docs/en/operations/server-configuration-parameters/settings.md index f93ab264511..3398c7afb67 100644 --- a/docs/en/operations/server-configuration-parameters/settings.md +++ b/docs/en/operations/server-configuration-parameters/settings.md @@ -1881,6 +1881,32 @@ The default server configuration file `config.xml` contains the following settin ``` +## asynchronous_insert_log {#server_configuration_parameters-asynchronous_insert_log} + +Settings for the [asynchronous_insert_log](../../operations/system-tables/asynchronous_insert_log.md#system_tables-asynchronous_insert_log) system table for logging async inserts. + +Parameters: + +- `database` — Database name. +- `table` — Table name. +- `partition_by` — [Custom partitioning key](../../engines/table-engines/mergetree-family/custom-partitioning-key.md) for a system table. Can't be used if `engine` defined. +- `engine` - [MergeTree Engine Definition](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-creating-a-table) for a system table. Can't be used if `partition_by` defined. +- `flush_interval_milliseconds` — Interval for flushing data from the buffer in memory to the table. +- `storage_policy` – Name of storage policy to use for the table (optional) + +**Example** +```xml + + + system +
asynchronous_insert_log
+ 7500 + toYYYYMM(event_date) + + + +``` + ## query_masking_rules {#query-masking-rules} Regexp-based rules, which will be applied to queries as well as all log messages before storing them in server logs, diff --git a/docs/en/operations/system-tables/asynchronous_insert_log.md b/docs/en/operations/system-tables/asynchronous_insert_log.md new file mode 100644 index 00000000000..8b0509d7000 --- /dev/null +++ b/docs/en/operations/system-tables/asynchronous_insert_log.md @@ -0,0 +1,63 @@ +--- +slug: /en/operations/system-tables/asynchronous_insert_log +--- +# asynchronous_insert_log + +Contains information about async inserts. Each entry represents an insert query buffered into an async insert query. + +To start logging configure parameters in the [asynchronous_insert_log](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-asynchronous_insert_log) section. + +The flushing period of data is set in `flush_interval_milliseconds` parameter of the [asynchronous_insert_log](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-asynchronous_insert_log) server settings section. To force flushing, use the [SYSTEM FLUSH LOGS](../../sql-reference/statements/system.md#query_language-system-flush_logs) query. + +ClickHouse does not delete data from the table automatically. See [Introduction](../../operations/system-tables/index.md#system-tables-introduction) for more details. + +Columns: + +- `event_date` ([Date](../../sql-reference/data-types/date.md)) — The date when the async insert happened. +- `event_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — The date and time when the async insert finished execution. +- `event_time_microseconds` ([DateTime64](../../sql-reference/data-types/datetime64.md)) — The date and time when the async insert finished execution with microseconds precision. +- `query` ([String](../../sql-reference/data-types/string.md)) — Query string. +- `database` ([String](../../sql-reference/data-types/string.md)) — The name of the database the table is in. +- `table` ([String](../../sql-reference/data-types/string.md)) — Table name. +- `format` ([String](/docs/en/sql-reference/data-types/string.md)) — Format name. +- `query_id` ([String](../../sql-reference/data-types/string.md)) — ID of the initial query. +- `bytes` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Number of inserted bytes. +- `exception` ([String](../../sql-reference/data-types/string.md)) — Exception message. +- `status` ([Enum8](../../sql-reference/data-types/enum.md)) — Status of the view. Values: + - `'Ok' = 1` — Successful insert. + - `'ParsingError' = 2` — Exception when parsing the data. + - `'FlushError' = 3` — Exception when flushing the data. +- `flush_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — The date and time when the flush happened. +- `flush_time_microseconds` ([DateTime64](../../sql-reference/data-types/datetime64.md)) — The date and time when the flush happened with microseconds precision. +- `flush_query_id` ([String](../../sql-reference/data-types/string.md)) — ID of the flush query. + +**Example** + +Query: + +``` sql +SELECT * FROM system.asynchronous_insert_log LIMIT 1 \G; +``` + +Result: + +``` text +event_date: 2023-06-08 +event_time: 2023-06-08 10:08:53 +event_time_microseconds: 2023-06-08 10:08:53.199516 +query: INSERT INTO public.data_guess (user_id, datasource_id, timestamp, path, type, num, str) FORMAT CSV +database: public +table: data_guess +format: CSV +query_id: b46cd4c4-0269-4d0b-99f5-d27668c6102e +bytes: 133223 +exception: +status: Ok +flush_time: 2023-06-08 10:08:55 +flush_time_microseconds: 2023-06-08 10:08:55.139676 +flush_query_id: cd2c1e43-83f5-49dc-92e4-2fbc7f8d3716 + +**See Also** + +- [system.query_log](../../operations/system-tables/query_log.md#system_tables-query_log) — Description of the `query_log` system table which contains common information about queries execution. +- [system.asynchronous_inserts](../../operations/system-tables/asynchronous_inserts.md#system_tables-asynchronous_inserts) — This table contains information about pending asynchronous inserts in queue. diff --git a/docs/en/operations/system-tables/asynchronous_inserts.md b/docs/en/operations/system-tables/asynchronous_inserts.md new file mode 100644 index 00000000000..8fd5f2bb520 --- /dev/null +++ b/docs/en/operations/system-tables/asynchronous_inserts.md @@ -0,0 +1,45 @@ +--- +slug: /en/operations/system-tables/asynchronous_inserts +--- +# asynchronous_inserts + +Contains information about pending asynchronous inserts in queue. + +Columns: + +- `query` ([String](../../sql-reference/data-types/string.md)) — Query string. +- `database` ([String](../../sql-reference/data-types/string.md)) — The name of the database the table is in. +- `table` ([String](../../sql-reference/data-types/string.md)) — Table name. +- `format` ([String](/docs/en/sql-reference/data-types/string.md)) — Format name. +- `first_update` ([DateTime64](../../sql-reference/data-types/datetime64.md)) — First insert time with microseconds resolution. +- `total_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Total number of bytes waiting in the queue. +- `entries.query_id` ([Array(String)](../../sql-reference/data-types/array.md)) - Array of query ids of the inserts waiting in the queue. +- `entries.bytes` ([Array(UInt64)](../../sql-reference/data-types/array.md)) - Array of bytes of each insert query waiting in the queue. + +**Example** + +Query: + +``` sql +SELECT * FROM system.asynchronous_inserts LIMIT 1 \G; +``` + +Result: + +``` text +Row 1: +────── +query: INSERT INTO public.data_guess (user_id, datasource_id, timestamp, path, type, num, str) FORMAT CSV +database: public +table: data_guess +format: CSV +first_update: 2023-06-08 10:08:54.199606 +total_bytes: 133223 +entries.query_id: ['b46cd4c4-0269-4d0b-99f5-d27668c6102e'] +entries.bytes: [133223] +``` + +**See Also** + +- [system.query_log](../../operations/system-tables/query_log.md#system_tables-query_log) — Description of the `query_log` system table which contains common information about queries execution. +- [system.asynchronous_insert_log](../../operations/system-tables/asynchronous_insert_log.md#system_tables-asynchronous_insert_log) — This table contains information about async inserts performed. From ea9d0f6c3c94ae6fef1bb552958455c7ab9b25e7 Mon Sep 17 00:00:00 2001 From: kssenii Date: Thu, 8 Jun 2023 18:18:12 +0200 Subject: [PATCH 0674/1072] Fix --- .../IO/CachedOnDiskReadBufferFromFile.cpp | 55 +++++++++---------- 1 file changed, 27 insertions(+), 28 deletions(-) diff --git a/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp b/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp index 6bf72434580..6317aba20e9 100644 --- a/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp +++ b/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp @@ -1038,34 +1038,6 @@ bool CachedOnDiskReadBufferFromFile::nextImplStep() current_file_segment_counters.increment(ProfileEvents::FileSegmentUsedBytes, available()); - // No necessary because of the SCOPE_EXIT above, but useful for logging below. - if (download_current_segment) - file_segment.completePartAndResetDownloader(); - - chassert(!file_segment.isDownloader()); - - LOG_TEST( - log, - "Key: {}. Returning with {} bytes, buffer position: {} (offset: {}, predownloaded: {}), " - "buffer available: {}, current range: {}, file offset of buffer end: {}, impl offset: {}, file segment state: {}, " - "current write offset: {}, read_type: {}, reading until position: {}, started with offset: {}, " - "remaining ranges: {}", - cache_key.toString(), - working_buffer.size(), - getPosition(), - offset(), - needed_to_predownload, - available(), - current_read_range.toString(), - file_offset_of_buffer_end, - implementation_buffer->getFileOffsetOfBufferEnd(), - FileSegment::stateToString(file_segment.state()), - file_segment.getCurrentWriteOffset(false), - toString(read_type), - read_until_position, - first_offset, - file_segments->toString()); - if (size == 0 && file_offset_of_buffer_end < read_until_position) { size_t cache_file_size = getFileSizeFromReadBuffer(*implementation_buffer); @@ -1086,6 +1058,33 @@ bool CachedOnDiskReadBufferFromFile::nextImplStep() file_segment.getInfoForLog()); } + // No necessary because of the SCOPE_EXIT above, but useful for logging below. + if (download_current_segment) + file_segment.completePartAndResetDownloader(); + + chassert(!file_segment.isDownloader()); + + LOG_TEST( + log, + "Key: {}. Returning with {} bytes, buffer position: {} (offset: {}, predownloaded: {}), " + "buffer available: {}, current range: {}, file offset of buffer end: {}, file segment state: {}, " + "current write offset: {}, read_type: {}, reading until position: {}, started with offset: {}, " + "remaining ranges: {}", + cache_key.toString(), + working_buffer.size(), + getPosition(), + offset(), + needed_to_predownload, + available(), + current_read_range.toString(), + file_offset_of_buffer_end, + FileSegment::stateToString(file_segment.state()), + file_segment.getCurrentWriteOffset(false), + toString(read_type), + read_until_position, + first_offset, + file_segments->toString()); + return result; } From 314df9a6da2ca03cfea6952b5aa47654d974666e Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Thu, 8 Jun 2023 16:25:06 +0000 Subject: [PATCH 0675/1072] Fixing test. --- .../0_stateless/02731_parallel_replicas_join_subquery.reference | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02731_parallel_replicas_join_subquery.reference b/tests/queries/0_stateless/02731_parallel_replicas_join_subquery.reference index df606679523..9b08e69b9d3 100644 --- a/tests/queries/0_stateless/02731_parallel_replicas_join_subquery.reference +++ b/tests/queries/0_stateless/02731_parallel_replicas_join_subquery.reference @@ -39,6 +39,6 @@ U c 10 UlI+1 10 bX?}ix [ Ny]2 G 10 t toUInt64(\'1610517366120\')) GROUP BY `key`, `value1`, `value2` +0 3 SELECT `key`, `value1`, `value2` FROM `default`.`join_inner_table` PREWHERE (`id` = \'833c9e22-c245-4eb5-8745-117a9a1f26b1\') AND (`number` > toUInt64(\'1610517366120\')) GROUP BY `key`, `value1`, `value2` 0 3 SELECT `value1`, `value2`, count() AS `count` FROM `default`.`join_outer_table` ALL INNER JOIN `_data_11888098645495698704_17868075224240210014` USING (`key`) GROUP BY `key`, `value1`, `value2` 1 1 -- Parallel full query\nSELECT\n value1,\n value2,\n avg(count) AS avg\nFROM\n (\n SELECT\n key,\n value1,\n value2,\n count() AS count\n FROM join_outer_table\n INNER JOIN\n (\n SELECT\n key,\n value1,\n value2,\n toUInt64(min(time)) AS start_ts\n FROM join_inner_table\n PREWHERE (id = \'833c9e22-c245-4eb5-8745-117a9a1f26b1\') AND (number > toUInt64(\'1610517366120\'))\n GROUP BY key, value1, value2\n ) USING (key)\n GROUP BY key, value1, value2\n )\nGROUP BY value1, value2\nORDER BY value1, value2\nSETTINGS allow_experimental_parallel_reading_from_replicas = 1; From 43cb2024a733fd5cbd7e0593ce317fad5e223130 Mon Sep 17 00:00:00 2001 From: Alexander Gololobov <440544+davenger@users.noreply.github.com> Date: Thu, 8 Jun 2023 19:20:32 +0200 Subject: [PATCH 0676/1072] Better comments Co-authored-by: Dmitry Novik --- src/Processors/QueryPlan/Optimizations/optimizeTree.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Processors/QueryPlan/Optimizations/optimizeTree.cpp b/src/Processors/QueryPlan/Optimizations/optimizeTree.cpp index 091eecf99e5..de4916797e9 100644 --- a/src/Processors/QueryPlan/Optimizations/optimizeTree.cpp +++ b/src/Processors/QueryPlan/Optimizations/optimizeTree.cpp @@ -163,6 +163,7 @@ void optimizeTreeSecondPass(const QueryPlanOptimizationSettings & optimization_s } } + /// NOTE: optimizePrewhere can modify the stack. optimizePrewhere(stack, nodes); optimizePrimaryKeyCondition(stack); enableMemoryBoundMerging(*stack.back().node, nodes); From dc08583ba492b7ec4b38c0e1fa4c38fc3310f630 Mon Sep 17 00:00:00 2001 From: Alexander Gololobov <440544+davenger@users.noreply.github.com> Date: Thu, 8 Jun 2023 19:20:40 +0200 Subject: [PATCH 0677/1072] Better comments Co-authored-by: Dmitry Novik --- src/Processors/QueryPlan/Optimizations/optimizeTree.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Processors/QueryPlan/Optimizations/optimizeTree.cpp b/src/Processors/QueryPlan/Optimizations/optimizeTree.cpp index de4916797e9..73632d34671 100644 --- a/src/Processors/QueryPlan/Optimizations/optimizeTree.cpp +++ b/src/Processors/QueryPlan/Optimizations/optimizeTree.cpp @@ -115,7 +115,7 @@ void optimizeTreeSecondPass(const QueryPlanOptimizationSettings & optimization_s while (!stack.empty()) { { - /// NOTE: frame cannot be safely used after adding new elements to stack + /// NOTE: frame cannot be safely used after stack was modified. auto & frame = stack.back(); if (frame.next_child == 0) From 350becba5dcabe9c2cea48dc5ec6ac07033c04e2 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Thu, 8 Jun 2023 17:56:35 +0000 Subject: [PATCH 0678/1072] Fixing build. --- src/Interpreters/TableJoin.cpp | 2 +- src/Interpreters/TableJoin.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Interpreters/TableJoin.cpp b/src/Interpreters/TableJoin.cpp index c56d1e6039e..cabd0be1aa3 100644 --- a/src/Interpreters/TableJoin.cpp +++ b/src/Interpreters/TableJoin.cpp @@ -762,7 +762,7 @@ bool TableJoin::allowParallelHashJoin() const return true; } -ActionsDAGPtr TableJoin::createJoinedBlockActions(ContextPtr context) +ActionsDAGPtr TableJoin::createJoinedBlockActions(ContextPtr context) const { ASTPtr expression_list = rightKeysList(); auto syntax_result = TreeRewriter(context).analyze(expression_list, columnsFromJoinedTable()); diff --git a/src/Interpreters/TableJoin.h b/src/Interpreters/TableJoin.h index 096e58d1292..ba3befab59b 100644 --- a/src/Interpreters/TableJoin.h +++ b/src/Interpreters/TableJoin.h @@ -217,7 +217,7 @@ public: const SizeLimits & sizeLimits() const { return size_limits; } VolumePtr getGlobalTemporaryVolume() { return tmp_volume; } - ActionsDAGPtr createJoinedBlockActions(ContextPtr context); + ActionsDAGPtr createJoinedBlockActions(ContextPtr context) const; bool isEnabledAlgorithm(JoinAlgorithm val) const { From f34937687e6316b11fb2d61b95f818fd4828a9ce Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Thu, 8 Jun 2023 18:00:54 +0000 Subject: [PATCH 0679/1072] enable settings for mutation throttling by default --- src/Storages/MergeTree/MergeTreeSettings.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeSettings.h b/src/Storages/MergeTree/MergeTreeSettings.h index 33aea358078..a3d475b74b2 100644 --- a/src/Storages/MergeTree/MergeTreeSettings.h +++ b/src/Storages/MergeTree/MergeTreeSettings.h @@ -67,8 +67,8 @@ struct Settings; M(Bool, remove_rolled_back_parts_immediately, 1, "Setting for an incomplete experimental feature.", 0) \ M(CleanDeletedRows, clean_deleted_rows, CleanDeletedRows::Never, "Is the Replicated Merge cleanup has to be done automatically at each merge or manually (possible values are 'Always'/'Never' (default))", 0) \ M(UInt64, replicated_max_mutations_in_one_entry, 10000, "Max number of mutation commands that can be merged together and executed in one MUTATE_PART entry (0 means unlimited)", 0) \ - M(UInt64, number_of_mutations_to_delay, 0, "If table has at least that many unfinished mutations, artificially slow down mutations of table. Disabled if set to 0", 0) \ - M(UInt64, number_of_mutations_to_throw, 0, "If table has at least that many unfinished mutations, throw 'Too many mutations' exception. Disabled if set to 0", 0) \ + M(UInt64, number_of_mutations_to_delay, 500, "If table has at least that many unfinished mutations, artificially slow down mutations of table. Disabled if set to 0", 0) \ + M(UInt64, number_of_mutations_to_throw, 1000, "If table has at least that many unfinished mutations, throw 'Too many mutations' exception. Disabled if set to 0", 0) \ M(UInt64, min_delay_to_mutate_ms, 10, "Min delay of mutating MergeTree table in milliseconds, if there are a lot of unfinished mutations", 0) \ M(UInt64, max_delay_to_mutate_ms, 1000, "Max delay of mutating MergeTree table in milliseconds, if there are a lot of unfinished mutations", 0) \ \ From 172dd4af730e5f592353728eed24535b1a5ea672 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 8 Jun 2023 21:04:00 +0000 Subject: [PATCH 0680/1072] Show correct staleness value in system.query_cache --- src/Storages/System/StorageSystemQueryCache.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/System/StorageSystemQueryCache.cpp b/src/Storages/System/StorageSystemQueryCache.cpp index 245f4b7fd26..3dfc5cf298a 100644 --- a/src/Storages/System/StorageSystemQueryCache.cpp +++ b/src/Storages/System/StorageSystemQueryCache.cpp @@ -47,7 +47,7 @@ void StorageSystemQueryCache::fillData(MutableColumns & res_columns, ContextPtr res_columns[0]->insert(key.queryStringFromAst()); /// approximates the original query string res_columns[1]->insert(QueryCache::QueryCacheEntryWeight()(*query_result)); res_columns[2]->insert(key.expires_at < std::chrono::system_clock::now()); - res_columns[3]->insert(!key.is_shared); + res_columns[3]->insert(key.is_shared); res_columns[4]->insert(key.is_compressed); res_columns[5]->insert(std::chrono::system_clock::to_time_t(key.expires_at)); res_columns[6]->insert(key.ast->getTreeHash().first); From a2355673d8349f4a689f46a354c335a7647a5c59 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Thu, 8 Jun 2023 21:18:29 +0000 Subject: [PATCH 0681/1072] fix tests --- tests/queries/0_stateless/02125_many_mutations.sh | 2 +- tests/queries/0_stateless/02125_many_mutations_2.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/02125_many_mutations.sh b/tests/queries/0_stateless/02125_many_mutations.sh index 7a89e5f7c4f..c3108df5ae3 100755 --- a/tests/queries/0_stateless/02125_many_mutations.sh +++ b/tests/queries/0_stateless/02125_many_mutations.sh @@ -5,7 +5,7 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . "$CURDIR"/../shell_config.sh -$CLICKHOUSE_CLIENT -q "create table many_mutations (x UInt32, y UInt32) engine = MergeTree order by x" +$CLICKHOUSE_CLIENT -q "create table many_mutations (x UInt32, y UInt32) engine = MergeTree order by x settings number_of_mutations_to_delay = 0, number_of_mutations_to_throw = 0" $CLICKHOUSE_CLIENT -q "insert into many_mutations values (0, 0), (1, 1)" $CLICKHOUSE_CLIENT -q "system stop merges many_mutations" diff --git a/tests/queries/0_stateless/02125_many_mutations_2.sh b/tests/queries/0_stateless/02125_many_mutations_2.sh index df170a402c6..52866a54974 100755 --- a/tests/queries/0_stateless/02125_many_mutations_2.sh +++ b/tests/queries/0_stateless/02125_many_mutations_2.sh @@ -5,7 +5,7 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . "$CURDIR"/../shell_config.sh -$CLICKHOUSE_CLIENT -q "create table many_mutations (x UInt32, y UInt32) engine = MergeTree order by x" +$CLICKHOUSE_CLIENT -q "create table many_mutations (x UInt32, y UInt32) engine = MergeTree order by x settings number_of_mutations_to_delay = 0, number_of_mutations_to_throw = 0" $CLICKHOUSE_CLIENT -q "insert into many_mutations select number, number + 1 from numbers(2000)" $CLICKHOUSE_CLIENT -q "system stop merges many_mutations" From 7578203b46b1657373fc27a5f05e56531581916b Mon Sep 17 00:00:00 2001 From: Alexey Gerasimchuck Date: Fri, 9 Jun 2023 03:44:43 +0000 Subject: [PATCH 0682/1072] Changes after review --- docs/en/interfaces/cli.md | 240 +++++++++++++++++--------------- docs/ru/interfaces/cli.md | 235 ++++++++++++++++--------------- programs/client/Client.cpp | 4 +- src/Client/ConnectionString.cpp | 49 ++++--- 4 files changed, 281 insertions(+), 247 deletions(-) diff --git a/docs/en/interfaces/cli.md b/docs/en/interfaces/cli.md index c36887672c7..2126c538c5d 100644 --- a/docs/en/interfaces/cli.md +++ b/docs/en/interfaces/cli.md @@ -158,116 +158,6 @@ $ clickhouse-client --param_tuple_in_tuple="(10, ('dt', 10))" -q "SELECT * FROM $ clickhouse-client --param_tbl="numbers" --param_db="system" --param_col="number" --query "SELECT {col:Identifier} FROM {db:Identifier}.{tbl:Identifier} LIMIT 10" ``` -## Connection string {#connection_string} - -The connection string for clickhouse-client is presented in URI format: - -```text -clickhouse://[user_info@][hosts_and_ports][/dbname][?query_parameters] -``` - -where user_info is: `user[:password]` -and hosts_and_ports is a list of values: `[host][:port],[host][:port]` Port is not mandatory. -and query_parameters is a list of parameter[=value]: `param_name[=value]¶m_name[=value]...` value may not be required for some of the parameters. Parameter names are case sensitive. - -Allowed query_parameters keys: - -- `secure` or shorthanded `s` - no value. If specified, client will connect to the server over a secure connection (TLS). See `secure` in [command-line-options](#command-line-options) - -These examples illustrate valid connection strings for clickhouse-client: - -```text -clickhouse: -clickhouse://localhost -clickhouse://localhost:9000 -clickhouse://localhost/default -clickhouse://default@localhost -clickhouse://user:password@localhost -clickhouse://user_name@localhost/some_database?secure -clickhouse://host1:9000,host2:5000/some_database -``` - -The host component can either be an IP address or a host name. Put an IPv6 address in square brackets to specify it: - -```text -clickhouse://[2001:db8::1234] -``` - -If user or/and password are not specified, default values will be used. -If host is not specified, the default host will be used (localhost). -If port is not specified, the default port will be used (9000). -If database is not specified, the default database will be used. - -User, password, and database can be specified in the connection string either in `--user`, `--password`, `--database` command line options. - -The connection string must be specified in the first argument of clickhouse-client. The connection string can be combined with other [command-line-options](#command-line-options) except `--host(h)` and `--port`. - -### Multiple hosts {#connection_string_multiple_hosts} - -URI allows multiple hosts to be connected to, and the client will try to connect to those hosts using the order from URI and command line options. The hosts and ports in the URI accept comma-separated lists of values. - -If more than one host is supplied, or if a single host name is translated to more than one address, each host and address will be attempted one at a time until one is successful. The remaining hosts after successful connection in the list are not tried. - -### Percent encoding {#connection_string_uri_percent_encoding} - -Hosts, user name, password, database, and query parameters should be [Percent-Encoded](https://en.wikipedia.org/wiki/URL_encoding) if values contain invalid URI characters. - -### Examples {#connection_string_examples} - -Connect to localhost using port 9000 and execute the query "SELECT 1". - -``` bash -clickhouse-client "clickhouse://localhost:9000" --query "SELECT 1" -``` - -Connect to localhost using port 9000 in interactive, multiline mode. - -``` bash -clickhouse-client "clickhouse://localhost:9000" -m -``` - -Connect to localhost using port 9000 in interactive mode with the user specified in `--user` option. - -``` bash -clickhouse-client "clickhouse://localhost:9000" --user default -``` - -Connect to localhost using port 9000 in interactive mode to `my_database` database specified in the command line option. - -``` bash -clickhouse-client "clickhouse://localhost:9000" --database my_database -``` - -Connect to localhost using port 9000 in interactive mode to `my_database` database specified in the connection string. - -``` bash -clickhouse-client "clickhouse://localhost:9000/my_database" -``` - -Connect to localhost using port 9000 in interactive mode to `my_database` database specified in the connection string and a secure connection using shorthanded 's' URI parameter. - -```bash -clickhouse-client "clickhouse://localhost/my_database?s" -``` - -Connect to default host using default port, default user, and default database. - -``` bash -clickhouse-client "clickhouse:" -``` - -Connect to the default host using the default port, using user user_name and no password. - -``` bash -clickhouse-client "clickhouse://user_name@" -``` - -Connect to localhost using email as the user name. `@` symbol is percent encoded to `%40`. - -``` bash -clickhouse-client "clickhouse://some_user%40some_mail.com@localhost:9000" -``` - ## Configuring {#interfaces_cli_configuration} You can pass parameters to `clickhouse-client` (all parameters have a default value) using: @@ -304,7 +194,135 @@ You can pass parameters to `clickhouse-client` (all parameters have a default va - `--print-profile-events` – Print `ProfileEvents` packets. - `--profile-events-delay-ms` – Delay between printing `ProfileEvents` packets (-1 - print only totals, 0 - print every single packet). -Since version 20.5, `clickhouse-client` has automatic syntax highlighting (always enabled). +Instead of --host, --port, --user and --password options, ClickHouse client also supports connection strings. + + +## Connection string {#connection_string} + +clickhouse-client alternatively supports connecting to clickhouse server using a connection string similar to [MongoDB](https://www.mongodb.com/docs/manual/reference/connection-string/), [PostreSQL](https://www.postgresql.org/docs/current/libpq-connect.html#LIBPQ-CONNSTRING), [MySQL](https://dev.mysql.com/doc/refman/8.0/en/connecting-using-uri-or-key-value-pairs.html#connecting-using-uri). It has the following syntax: + +```text +clickhouse:[//[user_info@][hosts_and_ports]][/database][?query_parameters] +``` + +Where + +- `user_spec` - (optional) is a user and an optional password, +- `hostspec` - (optional) is a list of hosts and optional ports `host[:port] [, host:[port]], ...`, +- `database` - (optional) is the database name, +- `paramspec` - (optional) is a list of key-value pairs `param1=value1[,¶m2=value2], ...`. For some parameters, no value is required. Parameter names and values are case-sensitive. + + + +The host component can either be an IP address or a host name. Put an IPv6 address in square brackets to specify it: + +```text +clickhouse://[2001:db8::1234] +``` + +If user is not specified, `default` user without password will be used. +If host is not specified, the `localhost` will be used (localhost). +If port is not specified, `9000` will be used as port. +If database is not specified, the `default` database will be used. + +If the user name, password or database was specified in the connection string, it cannot be specified using `--user`, `--password` or `--database` (and vice versa). + +The connection string must be specified in the first argument of clickhouse-client. The connection string can be combined with other [command-line-options](#command-line-options) except `--host(h)` and `--port`. + +### Multiple hosts {#connection_string_multiple_hosts} + +URI allows multiple hosts to be connected to. Connection strings can contain multiple hosts. ClickHouse-client will try to connect to these hosts in order (i.e. from left to right). After the connection is established, no attempt to connect to the remaining hosts is made. + +### Allowed query_parameters keys {#connection_string_query_parameters} + +- `secure` or shorthanded `s` - no value. If specified, client will connect to the server over a secure connection (TLS). See `secure` in [command-line-options](#command-line-options) + +### Percent encoding {#connection_string_uri_percent_encoding} + +Non-US ASCII characters in the user name, password, hosts, database or query parameters must be [percent-encoded](https://en.wikipedia.org/wiki/URL_encoding). + +### Examples {#connection_string_examples} + +Connect to localhost using port 9000 and execute the query "SELECT 1". + +``` bash +clickhouse-client clickhouse://localhost:9000 --query "SELECT 1" +``` + +Connect to localhost using user `john` with password `secret`, host `127.0.0.1` and port `9000` + +``` bash +clickhouse-client clickhouse://john:secret@127.0.0.1:9000 +``` + +Connect to localhost using default user, host with IPV6 address `[::1]` and port `9000`. + +``` bash +clickhouse-client clickhouse://[::1]:9000 +``` + +Connect to localhost using default user, host with IPV6 address `[2001:db8:3333:4444:5555:6666:7777:8888]` and port `9000`. + +``` bash +clickhouse-client clickhouse://[2001:db8:3333:4444:5555:6666:7777:8888]:9000 +``` + +Connect to localhost using port 9000 in multiline mode. + +``` bash +clickhouse-client clickhouse://localhost:9000 '-m' +``` + +Connect to localhost using port 9000 with the user `default`. + +``` bash +clickhouse-client clickhouse://default@localhost:9000 --user default + +# equivalent to: +clickhouse-client clickhouse://localhost:9000 --user default +``` + +Connect to localhost using port 9000 to `my_database` database. + +``` bash +clickhouse-client clickhouse://localhost:9000/my_database + +# equivalent to: +clickhouse-client clickhouse://localhost:9000 --database my_database +``` + +Connect to localhost using port 9000 to `my_database` database specified in the connection string and a secure connection using shorthanded 's' URI parameter. + +```bash +clickhouse-client clickhouse://localhost/my_database?s + +# equivalent to: +clickhouse-client clickhouse://localhost/my_database -s +``` + +Connect to default host using default port, default user, and default database. + +``` bash +clickhouse-client clickhouse: +``` + +Connect to the default host using the default port, using user user_name and no password. + +``` bash +clickhouse-client clickhouse://user_name@ +``` + +Connect to localhost using email as the user name. `@` symbol is percent encoded to `%40`. + +``` bash +clickhouse-client clickhouse://some_user%40some_mail.com@localhost:9000 +``` + +Connect to one of provides hosts: `192.168.1.15`, `192.168.1.25`. + +``` bash +clickhouse-client clickhouse://192.168.1.15,192.168.1.25 +``` ### Configuration Files {#configuration_files} diff --git a/docs/ru/interfaces/cli.md b/docs/ru/interfaces/cli.md index 801a72e48ec..f86ccb42356 100644 --- a/docs/ru/interfaces/cli.md +++ b/docs/ru/interfaces/cli.md @@ -110,117 +110,6 @@ $ clickhouse-client --param_tuple_in_tuple="(10, ('dt', 10))" -q "SELECT * FROM $ clickhouse-client --param_tbl="numbers" --param_db="system" --param_col="number" --query "SELECT {col:Identifier} FROM {db:Identifier}.{tbl:Identifier} LIMIT 10" ``` -## Строка подключения {#connection_string} - -Строка подключения для clickhouse-client представлена в формате URI: - -```text -clickhouse://[user_info@][hosts_and_ports][/dbname][?query_parameters] -``` - -где user_info - это: `user[:password]` -hosts_and_ports - это список значений: `[host][:port],[host][:port]`. Port может быть не задан. -query_parameters - это список пар ключ[=значение]: `param_name[=value]¶m_name[=value]...`. Значение может быть пустым. -Имена параметров чувствительны к регистру. - -Допустимые ключи query_parameters: - -- `secure` или сокращенно `s` - без значение. Если параметр указан, то соединение с сервером будет осуществляться по защищенному каналу (TLS). См. `secure` в [command-line-options](#command-line-options). - -Эти примеры иллюстрируют допустимые строки подключения для clickhouse-client: - -```text -clickhouse: -clickhouse://localhost -clickhouse://localhost:9000 -clickhouse://localhost/default -clickhouse://default@localhost -clickhouse://user:password@localhost -clickhouse://имя_пользователя@localhost/some_database?secure -clickhouse://host1:9000,host2:5000/some_database -``` - -Параметр host может быть либо IP-адресом, либо именем хоста. Для указания IPv6-адреса поместите его в квадратные скобки: - -```text -clickhouse://[2001:db8::1234] -``` - -Если пользователь или/и пароль не указаны, будут использоваться значения по умолчанию. -Если host не указан, будет использован хост по умолчанию (localhost). -Если port не указан, будет использоваться порт по умолчанию (9000). -Если база данных не указана, будет использоваться база данных по умолчанию (default). - -Пользователь, пароль и база данных могут быть указаны в строке подключения либо в опциях командной строки `--user`, `--password`, `--database`. - -Строка подключения должна быть указана в первом аргументе clickhouse-client. Строка подключения может комбинироваться с другими [параметрами командной строки] (#command-line-options) кроме `--host (h)` и `--port`. - -### Несколько хостов {#connection_string_multiple_hosts} - -URI позволяет подключаться к нескольким хостам, и клиент будет пытаться подключиться к этим хостам, используя порядок из URI и опций командной строки. Хосты и порты в URI принимают списки значений, разделенные запятыми. - -Если указано более одного хоста или если одно имя хоста транслируется в несколько адресов, Клиент будет будет пытаться подключится к каждому хосту и адресу в порядке в котором они встречаются в URI И опциях клиента, пока не будет установлено соединение. Соединение разрывается, если соединение установлено и аутентификация прошла успешно, остальные хосты в списке игнорируются. - -### Кодирование URI {#connection_string_uri_percent_encoding} - -Хосты, имя пользователя, пароль, имя базы данных, и параметры запроса должны быть [закодированы](https://ru.wikipedia.org/wiki/URL#%D0%9A%D0%BE%D0%B4%D0%B8%D1%80%D0%BE%D0%B2%D0%B0%D0%BD%D0%B8%D0%B5_URL), если значения содержат невалидные символы URI. - -### Примеры {#connection_string_examples} - -Подключиться к localhost через порт 9000 и выполнить запрос "SELECT 1" - -``` bash -clickhouse-client "clickhouse://localhost:9000" --query "SELECT 1" -``` - -Подключиться к localhost через порт 9000 в интерактивном, многострочном режиме. - -``` bash -clickhouse-client "clickhouse://localhost:9000" -m -``` - -Подключиться к localhost через порт 9000 в интерактивном режиме с пользователем default, указанным в опции --user. - -``` bash -clickhouse-client "clickhouse://localhost:9000" --user default -``` - -Подключиться к localhost, используя порт 9000 в интерактивном режиме с базой данных `my_database`, указанной в опции командной строки. - -``` bash -clickhouse-client "clickhouse://localhost:9000" --database my_database -``` - -Подключиться к localhost через порт 9000 в интерактивном режиме с базой данных `my_database`, указанной в строке подключения. - -``` bash -clickhouse-client "clickhouse://localhost:9000/my_database" -``` - -Подключиться к localhost через порт 9000 в интерактивном режиме с базой данных `my_database`, указанной в строке подключения, и безопасным соединением, используя короткий вариант команды URI 's'. - -``` bash -clickhouse-client "clickhouse://localhost/my_database?s" -``` - -Подключиться к хосту по умолчанию с использованием порта по умолчанию, пользователя по умолчанию, и базы данных по умолчанию. - -``` bash -clickhouse-client "clickhouse:" -``` - -Подключиться к хосту по умолчанию через порт по умолчанию, используя имя пользователя user_name без пароля. - -``` bash -clickhouse-client "clickhouse://user_name@" -``` - -Подключиться к localhost, используя электронную почту, как имя пользователя. Символ `@` закодирован как `%40`. - -``` bash -clickhouse-client "clickhouse://some_user%40some_mail.com@localhost:9000" -``` - ## Конфигурирование {#interfaces_cli_configuration} В `clickhouse-client` можно передавать различные параметры (все параметры имеют значения по умолчанию) с помощью: @@ -253,7 +142,129 @@ clickhouse-client "clickhouse://some_user%40some_mail.com@localhost:9000" - `--history_file` - путь к файлу с историей команд. - `--param_` — значение параметра для [запроса с параметрами](#cli-queries-with-parameters). -Начиная с версии 20.5, в `clickhouse-client` есть автоматическая подсветка синтаксиса (включена всегда). +## Строка подключения {#connection_string} + +clickhouse-client также поддерживает подключение к серверу clickhouse с помощью строки подключения, аналогичной [MongoDB](https://www.mongodb.com/docs/manual/reference/connection-string/), [PostreSQL](https://www.postgresql.org/docs/current/libpq-connect.html#LIBPQ-CONNSTRING), [MySQL](https://dev.mysql.com/doc/refman/8.0/en/connecting-using-uri-or-key-value-pairs.html#connecting-using-uri). Она имеет следующий синтаксис: + +```text +clickhouse:[//[user_info@][hosts_and_ports]][/database][?query_parameters] +``` + +Где + +- `user_spec` - (необязательно) - это пользователь и необязательный пароль, +- `hostspec` - (необязательно) - список хостов и необязательных портов. `host[:port] [, host:[port]], ...`, +- `database` - (необязательно) - это имя базы данных, +- `paramspec` - (опционально) список пар ключ-значение `param1=value1[,¶m2=value2], ...`. Для некоторых параметров значение не требуется. Имена и значения параметров чувствительны к регистру. + +Параметр host может быть либо IP-адресом, либо именем хоста. Для указания IPv6-адреса поместите его в квадратные скобки: + +```text +clickhouse://[2001:db8::1234] +``` + +Если user не указан, будут использоваться имя пользователя `default`. +Если host не указан, будет использован хост `localhost`. +Если port не указан, будет использоваться порт `9000`. +Если база данных не указана, будет использоваться база данных `default`. + +Если имя пользователя, пароль или база данных были указаны в строке подключения, их нельзя указать с помощью `--user`, `--password` или `--database` (и наоборот). + +Строка подключения должна быть указана в первом аргументе clickhouse-client. Строка подключения может комбинироваться с другими [параметрами командной строки] (#command-line-options) кроме `--host (h)` и `--port`. + +### Несколько хостов {#connection_string_multiple_hosts} + +URI позволяет подключаться к нескольким хостам. Строки подключения могут содержать несколько хостов. ClickHouse-client будет пытаться подключиться к этим хостам по порядку (т.е. слева направо). После установления соединения попытки подключения к оставшимся хостам не предпринимаются. + +### Допустимые ключи query_parameters {#connection_string_query_parameters} + +- `secure` или сокращенно `s` - без значение. Если параметр указан, то соединение с сервером будет осуществляться по защищенному каналу (TLS). См. `secure` в [command-line-options](#command-line-options). + +### Кодирование URI {#connection_string_uri_percent_encoding} + +Не US ASCII символы в имени пользователя, пароле, хостах, базе данных или параметрах запроса должны быть [закодированы](https://ru.wikipedia.org/wiki/URL#%D0%9A%D0%BE%D0%B4%D0%B8%D1%80%D0%BE%D0%B2%D0%B0%D0%BD%D0%B8%D0%B5_URL). + +### Примеры {#connection_string_examples} + +Подключиться к localhost через порт 9000 и выполнить запрос "SELECT 1" + +``` bash +clickhouse-client clickhouse://localhost:9000 --query "SELECT 1" +``` +Подключиться к localhost, используя пользователя `john` с паролем `secret`, хост `127.0.0.1` и порт `9000` + +``bash +clickhouse-client clickhouse://john:secret@127.0.0.1:9000 +``` + +Подключиться к localhost, используя пользователя по умолчанию, хост с IPV6 адресом `[::1]` и порт `9000`. + +``` bash +clickhouse-client clickhouse://[::1]:9000 +``` + +Подключиться к localhost, используя пользователя по умолчанию, хост с IPV6 адресом `[2001:db8:3333:4444:5555:6666:7777:8888]` и портом `9000`. + +`` bash +clickhouse-client clickhouse://[2001:db8:3333:4444:5555:6666:7777:8888]:9000 +``` + +Подключиться к localhost через порт 9000 многострочном режиме. + +``` bash +clickhouse-client clickhouse://localhost:9000 '-m' +``` + +Подключиться к localhost через порт 9000 с пользователем default. + +``` bash +clickhouse-client clickhouse://default@localhost:9000 + +# Эквивалетно: +clickhouse-client clickhouse://localhost:9000 --user default +``` + +Подключиться к localhost через порт 9000 с базой данных `my_database` + +``` bash +clickhouse-client clickhouse://localhost:9000/my_database + +# Эквивалетно: +clickhouse-client clickhouse://localhost:9000 --database my_database +``` + +Подключиться к localhost через порт 9000 с базой данных `my_database`, указанной в строке подключения, используя безопасным соединением при помощи короткого варианта параметра URI 's'. + +``` bash +clickhouse-client clickhouse://localhost/my_database?s + +# Эквивалетно: +clickhouse-client clickhouse://localhost/my_database -s +``` + +Подключиться к хосту по умолчанию с использованием порта по умолчанию, пользователя по умолчанию, и базы данных по умолчанию. + +``` bash +clickhouse-client clickhouse: +``` + +Подключиться к хосту по умолчанию через порт по умолчанию, используя имя пользователя user_name без пароля. + +``` bash +clickhouse-client clickhouse://user_name@ +``` + +Подключиться к localhost, используя электронную почту, как имя пользователя. Символ `@` закодирован как `%40`. + +``` bash +clickhouse-client clickhouse://some_user%40some_mail.com@localhost:9000 +``` + +Подключится к одному из хостов: `192.168.1.15`, `192.168.1.25`. + +``` bash +clickhouse-client clickhouse://192.168.1.15,192.168.1.25 +``` ### Конфигурационные файлы {#configuration_files} diff --git a/programs/client/Client.cpp b/programs/client/Client.cpp index e513314387f..1429853e333 100644 --- a/programs/client/Client.cpp +++ b/programs/client/Client.cpp @@ -1326,7 +1326,7 @@ void Client::readArguments( else if (arg.starts_with("--host") || arg.starts_with("-h")) { if (has_connection_string) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Mixing connection string and --host/--port client arguments is prohibited"); + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Mixing a connection string and --host argument is prohibited"); std::string host_arg; /// --host host @@ -1360,7 +1360,7 @@ void Client::readArguments( else if (arg.starts_with("--port")) { if (has_connection_string) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Mixing connection string and --host/--port client arguments is prohibited"); + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Mixing a connection string and --port argument is prohibited"); auto port_arg = String{arg}; /// --port port diff --git a/src/Client/ConnectionString.cpp b/src/Client/ConnectionString.cpp index 95fec5b52ee..2e475a1f49d 100644 --- a/src/Client/ConnectionString.cpp +++ b/src/Client/ConnectionString.cpp @@ -26,21 +26,20 @@ using namespace std::literals::string_view_literals; constexpr auto CONNECTION_URI_SCHEME = "clickhouse:"sv; -void uriDecode(std::string & uri_encoded_string, bool plus_as_space) +std::string uriDecode(const std::string & uri_encoded_string, bool plus_as_space) { - std::string temp; - Poco::URI::decode(uri_encoded_string, temp, plus_as_space); - std::swap(temp, uri_encoded_string); + std::string decoded_string; + Poco::URI::decode(uri_encoded_string, decoded_string, plus_as_space); + return decoded_string; } void getHostAndPort(const Poco::URI & uri, std::vector> & hosts_and_ports_arguments) { - auto host = uri.getHost(); std::vector host_and_port; + auto host = uri.getHost(); if (!host.empty()) { - uriDecode(host, false); - host_and_port.push_back("--host="s + host); + host_and_port.push_back("--host="s + uriDecode(host, false)); } // Port can be written without host (":9000"). Empty host name equals to default host. @@ -52,7 +51,7 @@ void getHostAndPort(const Poco::URI & uri, std::vector> hosts_and_ports_arguments.push_back(std::move(host_and_port)); } -void getHostAndPort( +void buildConnectionString( Poco::URI & uri, std::vector> & hosts_and_ports_arguments, std::string_view host_and_port, @@ -96,13 +95,13 @@ bool tryParseConnectionString( std::vector & common_arguments, std::vector> & hosts_and_ports_arguments) { + if (connection_string == CONNECTION_URI_SCHEME) + return true; + if (!connection_string.starts_with(CONNECTION_URI_SCHEME)) return false; - if (connection_string.size() == CONNECTION_URI_SCHEME.size()) - return true; - - auto offset = CONNECTION_URI_SCHEME.size(); + size_t offset = CONNECTION_URI_SCHEME.size(); if ((connection_string.substr(offset).starts_with("//"))) offset += 2; @@ -146,7 +145,7 @@ bool tryParseConnectionString( { if (*it == ',') { - getHostAndPort(uri, hosts_and_ports_arguments, {last_host_begin, it}, {hosts_end, connection_string.end()}); + buildConnectionString(uri, hosts_and_ports_arguments, {last_host_begin, it}, {hosts_end, connection_string.end()}); last_host_begin = it + 1; } } @@ -154,11 +153,11 @@ bool tryParseConnectionString( if (uri.empty()) { // URI has no host specified - uri = std::string{connection_string.begin(), connection_string.end()}; + uri = std::string(connection_string); getHostAndPort(uri, hosts_and_ports_arguments); } else - getHostAndPort(uri, hosts_and_ports_arguments, {last_host_begin, hosts_end}, {hosts_end, connection_string.end()}); + buildConnectionString(uri, hosts_and_ports_arguments, {last_host_begin, hosts_end}, {hosts_end, connection_string.end()}); Poco::URI::QueryParameters params = uri.getQueryParameters(); for (const auto & param : params) @@ -166,12 +165,12 @@ bool tryParseConnectionString( if (param.first == "secure" || param.first == "s") { if (!param.second.empty()) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "secure URI argument does not require value"); + throw Exception(ErrorCodes::BAD_ARGUMENTS, "secure URI query parameter does not require value"); common_arguments.push_back(makeArgument(param.first)); } else - throw Exception(ErrorCodes::BAD_ARGUMENTS, "URI argument {} is unknown", param.first); + throw Exception(ErrorCodes::BAD_ARGUMENTS, "URI query parameter {} is unknown", param.first); } auto user_info = uri.getUserInfo(); @@ -180,21 +179,27 @@ bool tryParseConnectionString( // Poco::URI doesn't decode user name/password by default. // But ClickHouse allows to have users with email user name like: 'john@some_mail.com' // john@some_mail.com should be percent-encoded: 'john%40some_mail.com' - uriDecode(user_info, true); std::string::size_type pos = user_info.find(':'); if (pos != std::string::npos) { common_arguments.push_back("--user"); - common_arguments.push_back(user_info.substr(0, pos)); + common_arguments.push_back(uriDecode(user_info.substr(0, pos), true)); ++pos; // Skip ':' common_arguments.push_back("--password"); - common_arguments.push_back(user_info.substr(pos)); + if (user_info.size() > pos + 1) + common_arguments.push_back(uriDecode(user_info.substr(pos), true)); + else + { + // in case of user_info == 'user:', ':' is specified, but password is empty + // then add password argument "\n" which means: Ask user for a password. + common_arguments.push_back("\n"); + } } else { common_arguments.push_back("--user"); - common_arguments.push_back(user_info); + common_arguments.push_back(uriDecode(user_info, true)); } } @@ -209,7 +214,7 @@ bool tryParseConnectionString( catch (const Poco::URISyntaxException & invalid_uri_exception) { throw DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, - "Invalid connection string {}: {}", connection_string, invalid_uri_exception.what()); + "Invalid connection string '{}': {}", connection_string, invalid_uri_exception.what()); } return true; From d15b7372942040863e6241f9d299b846c2513495 Mon Sep 17 00:00:00 2001 From: Alexey Gerasimchuck Date: Fri, 9 Jun 2023 03:54:29 +0000 Subject: [PATCH 0683/1072] Minor renaming --- docs/en/interfaces/cli.md | 2 +- docs/ru/interfaces/cli.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/en/interfaces/cli.md b/docs/en/interfaces/cli.md index 2126c538c5d..fc24bdcad68 100644 --- a/docs/en/interfaces/cli.md +++ b/docs/en/interfaces/cli.md @@ -199,7 +199,7 @@ Instead of --host, --port, --user and --password options, ClickHouse client also ## Connection string {#connection_string} -clickhouse-client alternatively supports connecting to clickhouse server using a connection string similar to [MongoDB](https://www.mongodb.com/docs/manual/reference/connection-string/), [PostreSQL](https://www.postgresql.org/docs/current/libpq-connect.html#LIBPQ-CONNSTRING), [MySQL](https://dev.mysql.com/doc/refman/8.0/en/connecting-using-uri-or-key-value-pairs.html#connecting-using-uri). It has the following syntax: +clickhouse-client alternatively supports connecting to clickhouse server using a connection string similar to [MongoDB](https://www.mongodb.com/docs/manual/reference/connection-string/), [PostgreSQL](https://www.postgresql.org/docs/current/libpq-connect.html#LIBPQ-CONNSTRING), [MySQL](https://dev.mysql.com/doc/refman/8.0/en/connecting-using-uri-or-key-value-pairs.html#connecting-using-uri). It has the following syntax: ```text clickhouse:[//[user_info@][hosts_and_ports]][/database][?query_parameters] diff --git a/docs/ru/interfaces/cli.md b/docs/ru/interfaces/cli.md index f86ccb42356..ee29b122afb 100644 --- a/docs/ru/interfaces/cli.md +++ b/docs/ru/interfaces/cli.md @@ -144,7 +144,7 @@ $ clickhouse-client --param_tbl="numbers" --param_db="system" --param_col="numbe ## Строка подключения {#connection_string} -clickhouse-client также поддерживает подключение к серверу clickhouse с помощью строки подключения, аналогичной [MongoDB](https://www.mongodb.com/docs/manual/reference/connection-string/), [PostreSQL](https://www.postgresql.org/docs/current/libpq-connect.html#LIBPQ-CONNSTRING), [MySQL](https://dev.mysql.com/doc/refman/8.0/en/connecting-using-uri-or-key-value-pairs.html#connecting-using-uri). Она имеет следующий синтаксис: +clickhouse-client также поддерживает подключение к серверу clickhouse с помощью строки подключения, аналогичной [MongoDB](https://www.mongodb.com/docs/manual/reference/connection-string/), [PostgreSQL](https://www.postgresql.org/docs/current/libpq-connect.html#LIBPQ-CONNSTRING), [MySQL](https://dev.mysql.com/doc/refman/8.0/en/connecting-using-uri-or-key-value-pairs.html#connecting-using-uri). Она имеет следующий синтаксис: ```text clickhouse:[//[user_info@][hosts_and_ports]][/database][?query_parameters] From 094d661701cc83d84ebcf4bbaffeff47e0f7c547 Mon Sep 17 00:00:00 2001 From: YalalovSM <39567223+YalalovSM@users.noreply.github.com> Date: Fri, 9 Jun 2023 09:26:00 +0500 Subject: [PATCH 0684/1072] Update projection.md Document using keywords IF EXISTS/IF NOT EXISTS with projections --- docs/en/sql-reference/statements/alter/projection.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/en/sql-reference/statements/alter/projection.md b/docs/en/sql-reference/statements/alter/projection.md index 030e9352a00..b7399442d41 100644 --- a/docs/en/sql-reference/statements/alter/projection.md +++ b/docs/en/sql-reference/statements/alter/projection.md @@ -142,11 +142,11 @@ The following operations with [projections](/docs/en/engines/table-engines/merge ## ADD PROJECTION -`ALTER TABLE [db].name ADD PROJECTION name ( SELECT [GROUP BY] [ORDER BY] )` - Adds projection description to tables metadata. +`ALTER TABLE [db].name ADD PROJECTION [IF NOT EXISTS] name ( SELECT [GROUP BY] [ORDER BY] )` - Adds projection description to tables metadata. ## DROP PROJECTION -`ALTER TABLE [db].name DROP PROJECTION name` - Removes projection description from tables metadata and deletes projection files from disk. Implemented as a [mutation](/docs/en/sql-reference/statements/alter/index.md#mutations). +`ALTER TABLE [db].name DROP PROJECTION [IF EXISTS] name` - Removes projection description from tables metadata and deletes projection files from disk. Implemented as a [mutation](/docs/en/sql-reference/statements/alter/index.md#mutations). ## MATERIALIZE PROJECTION @@ -154,7 +154,7 @@ The following operations with [projections](/docs/en/engines/table-engines/merge ## CLEAR PROJECTION -`ALTER TABLE [db.]table CLEAR PROJECTION name IN PARTITION partition_name` - Deletes projection files from disk without removing description. Implemented as a [mutation](/docs/en/sql-reference/statements/alter/index.md#mutations). +`ALTER TABLE [db.]table CLEAR PROJECTION [IF EXISTS] name IN PARTITION partition_name` - Deletes projection files from disk without removing description. Implemented as a [mutation](/docs/en/sql-reference/statements/alter/index.md#mutations). The commands `ADD`, `DROP` and `CLEAR` are lightweight in a sense that they only change metadata or remove files. From 95c0b942c141493f8997fb807b4ff72c34b8292b Mon Sep 17 00:00:00 2001 From: YalalovSM <39567223+YalalovSM@users.noreply.github.com> Date: Fri, 9 Jun 2023 10:37:20 +0500 Subject: [PATCH 0685/1072] Update projection.md --- docs/ru/sql-reference/statements/alter/projection.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/ru/sql-reference/statements/alter/projection.md b/docs/ru/sql-reference/statements/alter/projection.md index 63b068611ec..33e52b93add 100644 --- a/docs/ru/sql-reference/statements/alter/projection.md +++ b/docs/ru/sql-reference/statements/alter/projection.md @@ -8,13 +8,13 @@ sidebar_label: PROJECTION Доступны следующие операции с [проекциями](../../../engines/table-engines/mergetree-family/mergetree.md#projections): -- `ALTER TABLE [db].name ADD PROJECTION name ( SELECT [GROUP BY] [ORDER BY] )` — добавляет описание проекции в метаданные. +- `ALTER TABLE [db].name ADD PROJECTION [IF NOT EXISTS] name ( SELECT [GROUP BY] [ORDER BY] )` — добавляет описание проекции в метаданные. -- `ALTER TABLE [db].name DROP PROJECTION name` — удаляет описание проекции из метаданных и удаляет файлы проекции с диска. +- `ALTER TABLE [db].name DROP PROJECTION [IF EXISTS] name` — удаляет описание проекции из метаданных и удаляет файлы проекции с диска. - `ALTER TABLE [db.]table MATERIALIZE PROJECTION name IN PARTITION partition_name` — перестраивает проекцию в указанной партиции. Реализовано как [мутация](../../../sql-reference/statements/alter/index.md#mutations). -- `ALTER TABLE [db.]table CLEAR PROJECTION name IN PARTITION partition_name` — удаляет файлы проекции с диска без удаления описания. +- `ALTER TABLE [db.]table CLEAR PROJECTION [IF EXISTS] name IN PARTITION partition_name` — удаляет файлы проекции с диска без удаления описания. Команды `ADD`, `DROP` и `CLEAR` — легковесны, поскольку они только меняют метаданные или удаляют файлы. @@ -22,4 +22,4 @@ sidebar_label: PROJECTION :::note Манипуляции с проекциями поддерживаются только для таблиц с движком [`*MergeTree`](../../../engines/table-engines/mergetree-family/mergetree.md) (включая [replicated](../../../engines/table-engines/mergetree-family/replication.md) варианты). - ::: \ No newline at end of file + ::: From d0938e95e68551c51f54407abe30bbd35534bf2e Mon Sep 17 00:00:00 2001 From: Alexey Gerasimchuck Date: Fri, 9 Jun 2023 06:40:57 +0000 Subject: [PATCH 0686/1072] prohibited to use --connection --- programs/client/Client.cpp | 9 +++------ src/Client/ConnectionString.cpp | 19 +++++++++++++++++++ src/Client/ConnectionString.h | 5 +++++ 3 files changed, 27 insertions(+), 6 deletions(-) diff --git a/programs/client/Client.cpp b/programs/client/Client.cpp index 1429853e333..a49447dff69 100644 --- a/programs/client/Client.cpp +++ b/programs/client/Client.cpp @@ -1267,6 +1267,9 @@ void Client::readArguments( { std::string_view arg = argv[arg_num]; + if (has_connection_string) + validateConnectionStringClientOption(arg); + if (arg == "--external") { in_external_group = true; @@ -1325,9 +1328,6 @@ void Client::readArguments( } else if (arg.starts_with("--host") || arg.starts_with("-h")) { - if (has_connection_string) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Mixing a connection string and --host argument is prohibited"); - std::string host_arg; /// --host host if (arg == "--host" || arg == "-h") @@ -1359,9 +1359,6 @@ void Client::readArguments( } else if (arg.starts_with("--port")) { - if (has_connection_string) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Mixing a connection string and --port argument is prohibited"); - auto port_arg = String{arg}; /// --port port if (arg == "--port") diff --git a/src/Client/ConnectionString.cpp b/src/Client/ConnectionString.cpp index 2e475a1f49d..b9658772e2e 100644 --- a/src/Client/ConnectionString.cpp +++ b/src/Client/ConnectionString.cpp @@ -7,6 +7,7 @@ #include #include #include +#include namespace DB { @@ -26,6 +27,15 @@ using namespace std::literals::string_view_literals; constexpr auto CONNECTION_URI_SCHEME = "clickhouse:"sv; +const std::unordered_map PROHIBITED_CLIENT_OPTIONS = +{ + /// Client option, client option long name + {"-h", "--host"}, + {"--host", "--host"}, + {"--port", "--port"}, + {"--connection", "--connection"}, +}; + std::string uriDecode(const std::string & uri_encoded_string, bool plus_as_space) { std::string decoded_string; @@ -220,4 +230,13 @@ bool tryParseConnectionString( return true; } +void validateConnectionStringClientOption(std::string_view command_line_option) +{ + const auto prohibited_option_iter = PROHIBITED_CLIENT_OPTIONS.find(command_line_option); + if (prohibited_option_iter != PROHIBITED_CLIENT_OPTIONS.end()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Mixing a connection string and {} option is prohibited", + prohibited_option_iter->second); +} + } diff --git a/src/Client/ConnectionString.h b/src/Client/ConnectionString.h index aafb1139b00..ce72de9edf6 100644 --- a/src/Client/ConnectionString.h +++ b/src/Client/ConnectionString.h @@ -19,4 +19,9 @@ bool tryParseConnectionString( std::string_view connection_string, std::vector & common_arguments, std::vector> & hosts_and_ports_arguments); + +// throws DB::Exception with BAD_ARGUMENTS if the given command line argument is allowed +// to be used with the connection string +void validateConnectionStringClientOption(std::string_view command_line_option); + } From b8fc25ab239fbf8c82f589b175e386ceed737c26 Mon Sep 17 00:00:00 2001 From: Alexey Gerasimchuck Date: Fri, 9 Jun 2023 06:51:34 +0000 Subject: [PATCH 0687/1072] minor update --- src/Client/ConnectionString.cpp | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/src/Client/ConnectionString.cpp b/src/Client/ConnectionString.cpp index b9658772e2e..e1f39369b2a 100644 --- a/src/Client/ConnectionString.cpp +++ b/src/Client/ConnectionString.cpp @@ -27,13 +27,12 @@ using namespace std::literals::string_view_literals; constexpr auto CONNECTION_URI_SCHEME = "clickhouse:"sv; -const std::unordered_map PROHIBITED_CLIENT_OPTIONS = -{ - /// Client option, client option long name - {"-h", "--host"}, - {"--host", "--host"}, - {"--port", "--port"}, - {"--connection", "--connection"}, +const std::unordered_map PROHIBITED_CLIENT_OPTIONS = { + /// Client option, client option long name + {"-h", "--host"}, + {"--host", "--host"}, + {"--port", "--port"}, + {"--connection", "--connection"}, }; std::string uriDecode(const std::string & uri_encoded_string, bool plus_as_space) @@ -234,9 +233,8 @@ void validateConnectionStringClientOption(std::string_view command_line_option) { const auto prohibited_option_iter = PROHIBITED_CLIENT_OPTIONS.find(command_line_option); if (prohibited_option_iter != PROHIBITED_CLIENT_OPTIONS.end()) - throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Mixing a connection string and {} option is prohibited", - prohibited_option_iter->second); + throw Exception( + ErrorCodes::BAD_ARGUMENTS, "Mixing a connection string and {} option is prohibited", prohibited_option_iter->second); } } From 96d9b88a201a3ee1b79874e60becc0fc62a10ba3 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 9 Jun 2023 10:48:13 +0200 Subject: [PATCH 0688/1072] Fix build --- contrib/azure-cmake/CMakeLists.txt | 2 +- src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/contrib/azure-cmake/CMakeLists.txt b/contrib/azure-cmake/CMakeLists.txt index 1e2a4c97824..9c361db47ca 100644 --- a/contrib/azure-cmake/CMakeLists.txt +++ b/contrib/azure-cmake/CMakeLists.txt @@ -1,6 +1,6 @@ option (ENABLE_AZURE_BLOB_STORAGE "Enable Azure blob storage" ${ENABLE_LIBRARIES}) -if (NOT ENABLE_AZURE_BLOB_STORAGE OR BUILD_STANDALONE_KEEPER OR OS_FREEBSD) +if (NOT ENABLE_AZURE_BLOB_STORAGE OR BUILD_STANDALONE_KEEPER OR OS_FREEBSD OR ARCH_PPC64LE) message(STATUS "Not using Azure blob storage") return() endif() diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h index b3cda54e752..5a34adb384a 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h @@ -8,10 +8,7 @@ #include #include #include - -#if USE_AZURE_BLOB_STORAGE #include -#endif namespace Poco { From 5c76a8882ef644ca924c5653cdad14e7cfa94270 Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Fri, 9 Jun 2023 09:12:07 +0000 Subject: [PATCH 0689/1072] Fix docs by pull request comments --- .../functions/type-conversion-functions.md | 177 +++++++++++----- .../functions/type-conversion-functions.md | 199 ++++++++++++++---- 2 files changed, 288 insertions(+), 88 deletions(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index a021fed195d..c634a3da27e 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -33,7 +33,7 @@ SELECT toTypeName(toNullable('') AS val) AS source_type, toTypeName(toString(val)) AS to_type_result_type, toTypeName(CAST(val, 'String')) AS cast_result_type - + ┌─source_type──────┬─to_type_result_type─┬─cast_result_type─┐ │ Nullable(String) │ Nullable(String) │ String │ └──────────────────┴─────────────────────┴──────────────────┘ @@ -203,7 +203,7 @@ Result: ## toDate -Converts the argument to [Date](/docs/en/sql-reference/data-types/date.md) data type. +Converts the argument to [Date](/docs/en/sql-reference/data-types/date.md) data type. If the argument is [DateTime](/docs/en/sql-reference/data-types/datetime.md) or [DateTime64](/docs/en/sql-reference/data-types/datetime64.md), it truncates it and leaves the date component of the DateTime: @@ -232,7 +232,7 @@ SELECT │ 2022-12-30 │ Date │ └────────────┴──────────────────────────────────┘ -1 row in set. Elapsed: 0.001 sec. +1 row in set. Elapsed: 0.001 sec. ``` ```sql @@ -314,14 +314,52 @@ SELECT └─────────────────────┴───────────────┴─────────────┴─────────────────────┘ ``` + ## toDateOrZero +The same as [toDate](#todate) but returns lower boundery of [Date](/docs/en/sql-reference/data-types/date.md) if an invalid argument is received. Only [String](/docs/en/sql-reference/data-types/string.md) argument is supported. + +**Example** + +Query: + +``` sql +SELECT toDateOrZero('2022-12-30'), toDateOrZero(''); +``` + +Result: + +```response +┌─toDateOrZero('2022-12-30')─┬─toDateOrZero('')─┐ +│ 2022-12-30 │ 1970-01-01 │ +└────────────────────────────┴──────────────────┘ +``` + + ## toDateOrNull +The same as [toDate](#todate) but returns `NULL` if an invalid argument is received. Only [String](/docs/en/sql-reference/data-types/string.md) argument is supported. + +**Example** + +Query: + +``` sql +SELECT toDateOrNull('2022-12-30'), toDateOrNull(''); +``` + +Result: + +```response +┌─toDateOrNull('2022-12-30')─┬─toDateOrNull('')─┐ +│ 2022-12-30 │ ᴺᵁᴸᴸ │ +└────────────────────────────┴──────────────────┘ +``` + + ## toDateOrDefault -Converts an input value to [Date](/docs/en/sql-reference/data-types/date.md) data type. -If unsuccessful, returns the lower border value supported by [Date](/docs/en/sql-reference/data-types/date.md). The default value can be specified as a second argument. -Similar to [toDate](#todate). + +Like [toDate](#todate) but if unsuccessful, returns a default value which is either the second argument (if specified), or otherwise the lower boundery of [Date](/docs/en/sql-reference/data-types/date.md). **Syntax** @@ -329,62 +367,37 @@ Similar to [toDate](#todate). toDateOrDefault(expr [, default_value]) ``` -**Arguments** - -- `expr` — The value. [String](/docs/en/sql-reference/data-types/string.md), [Int](/docs/en/sql-reference/data-types/int-uint.md), [Date](/docs/en/sql-reference/data-types/date.md) or [DateTime](/docs/en/sql-reference/data-types/datetime.md). -- `default_value` — The default value. [Date](/docs/en/sql-reference/data-types/date.md) - -If `expr` is a number and looks like a UNIX timestamp (is greater than 65535), it is interpreted as a DateTime, then truncated to Date in the current timezone. If `expr` is a number and it is smaller than 65536, it is interpreted as the number of days since 1970-01-01. - -**Returned value** - -- A calendar date. [Date](/docs/en/sql-reference/data-types/date.md) - **Example** Query: ``` sql -SELECT - toDateOrDefault('2021-01-01', '2023-01-01'::Date), - toDateOrDefault('xx2021-01-01', '2023-01-01'::Date); +SELECT toDateOrDefault('2022-12-30'), toDateOrDefault('', '2023-01-01'::Date); ``` Result: ```response -┌─toDateOrDefault('2021-01-01', CAST('2023-01-01', 'Date'))─┬─toDateOrDefault('xx2021-01-01', CAST('2023-01-01', 'Date'))─┐ -│ 2021-01-01 │ 2023-01-01 │ -└───────────────────────────────────────────────────────────┴─────────────────────────────────────────────────────────────┘ +┌─toDateOrDefault('2022-12-30')─┬─toDateOrDefault('', CAST('2023-01-01', 'Date'))─┐ +│ 2022-12-30 │ 2023-01-01 │ +└───────────────────────────────┴─────────────────────────────────────────────────┘ ``` -**See Also** -- [toDate](#todate) -- [toDate32OrDefault](#todate32ordefault) - ## toDateTime -## toDateTimeOrZero - -## toDateTimeOrNull - -## toDateTimeOrDefault -Converts an input value to [DateTime](/docs/en/sql-reference/data-types/datetime.md) data type. -If unsuccessful, returns the lower border value supported by [DateTime](/docs/en/sql-reference/data-types/datetime.md). The default value can be specified as a third argument. -Similar to [toDateTime](#todatetime). +Converts an input value to [DateTime](/docs/en/sql-reference/data-types/datetime.md). **Syntax** ``` sql -toDateTimeOrDefault(expr, [, time_zone [, default_value]]) +toDateTime(expr[, time_zone ]) ``` **Arguments** -- `expr` — The value. [String](/docs/en/sql-reference/data-types/string.md), [Int](/docs/en/sql-reference/data-types/int-uint.md), [Date](/docs/en/sql-reference/data-types/date.md) or [DateTime](/docs/en/sql-reference/data-types/datetime.md). +- `expr` — The value. [String](/docs/en/sql-reference/data-types/string.md), [Int](/docs/en/sql-reference/data-types/int-uint.md), [Date](/docs/en/sql-reference/data-types/date.md) or [DateTime](/docs/en/sql-reference/data-types/datetime.md). - `time_zone` — Time zone. [String](/docs/en/sql-reference/data-types/string.md). -- `default_value` — The default value. [DateTime](/docs/en/sql-reference/data-types/datetime.md) If `expr` is a number, it is interpreted as the number of seconds since the beginning of the Unix Epoch (as Unix timestamp). @@ -397,21 +410,86 @@ If `expr` is a number, it is interpreted as the number of seconds since the begi Query: ``` sql -SELECT - toDateTimeOrDefault('2021-01-01', 'UTC', '2023-01-01'::DateTime('UTC')), - toDateTimeOrDefault('xx2021-01-01', 'UTC', '2023-01-01'::DateTime('UTC')); +SELECT toDateTime('2022-12-30 13:44:17'), toDateTime(1685457500, 'UTC'); ``` Result: ```response -┌─toDateTimeOrDefault('2021-01-01', 'UTC', CAST('2023-01-01', 'DateTime(\'UTC\')'))─┬─toDateTimeOrDefault('xx2021-01-01', 'UTC', CAST('2023-01-01', 'DateTime(\'UTC\')'))─┐ -│ 2021-01-01 00:00:00 │ 2023-01-01 00:00:00 │ -└───────────────────────────────────────────────────────────────────────────────────┴─────────────────────────────────────────────────────────────────────────────────────┘ +┌─toDateTime('2022-12-30 13:44:17')─┬─toDateTime(1685457500, 'UTC')─┐ +│ 2022-12-30 13:44:17 │ 2023-05-30 14:38:20 │ +└───────────────────────────────────┴───────────────────────────────┘ +``` + + +## toDateTimeOrZero + +The same as [toDateTime](#todate) but returns lower boundery of [Date](/docs/en/sql-reference/data-types/date.md) if an invalid argument is received. Only [String](/docs/en/sql-reference/data-types/string.md) argument is supported. + +**Example** + +Query: + +``` sql +SELECT toDateTimeOrZero('2022-12-30 13:44:17'), toDateTimeOrZero(''); +``` + +Result: + +```response +┌─toDateTimeOrZero('2022-12-30 13:44:17')─┬─toDateTimeOrZero('')─┐ +│ 2022-12-30 13:44:17 │ 1970-01-01 00:00:00 │ +└─────────────────────────────────────────┴──────────────────────┘ +``` + + +## toDateTimeOrNull + +The same as [toDateTime](#todatetime) but returns `NULL` if an invalid argument is received. Only [String](/docs/en/sql-reference/data-types/string.md) argument is supported. + +**Example** + +Query: + +``` sql +SELECT toDateTimeOrNull('2022-12-30 13:44:17'), toDateTimeOrNull(''); +``` + +Result: + +```response +┌─toDateTimeOrNull('2022-12-30 13:44:17')─┬─toDateTimeOrNull('')─┐ +│ 2022-12-30 13:44:17 │ ᴺᵁᴸᴸ │ +└─────────────────────────────────────────┴──────────────────────┘ +``` + + +## toDateTimeOrDefault + +Like [toDateTime](#todatetime) but if unsuccessful, returns a default value which is either the third argument (if specified), or otherwise the lower boundery of [DateTime](/docs/en/sql-reference/data-types/datetime.md). + +**Syntax** + +``` sql +toDateTimeOrDefault(expr [, time_zone [, default_value]]) +``` + +**Example** + +Query: + +``` sql +SELECT toDateTimeOrDefault('2022-12-30 13:44:17'), toDateTimeOrDefault('', 'UTC', '2023-01-01'::DateTime('UTC')); +``` + +Result: + +```response +┌─toDateTimeOrDefault('2022-12-30 13:44:17')─┬─toDateTimeOrDefault('', 'UTC', CAST('2023-01-01', 'DateTime(\'UTC\')'))─┐ +│ 2022-12-30 13:44:17 │ 2023-01-01 00:00:00 │ +└────────────────────────────────────────────┴─────────────────────────────────────────────────────────────────────────┘ ``` -**See Also** -- [toDateTime](#todatetime) ## toDate32 @@ -604,6 +682,11 @@ SELECT toDateTime64('2019-01-01 00:00:00', 3, 'Asia/Istanbul') AS value, toTypeN └─────────────────────────┴─────────────────────────────────────────────────────────────────────┘ ``` +## toDateTime64OrZero + +## toDateTime64OrNull + +## toDateTime64OrDefault ## toDecimal(32\|64\|128\|256) @@ -1332,7 +1415,7 @@ Returns DateTime values parsed from input string according to a MySQL style form **Supported format specifiers** All format specifiers listed in [formatDateTime](/docs/en/sql-reference/functions/date-time-functions.md#date_time_functions-formatDateTime) except: -- %Q: Quarter (1-4) +- %Q: Quarter (1-4) **Example** diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index 67d1732d34e..d43b5415114 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -165,28 +165,17 @@ SELECT toUInt64(nan), toUInt32(-32), toUInt16('16'), toUInt8(8.8); ## toDate {#todate} -Cиноним: `DATE`. - -## toDateOrZero {#todateorzero} - -## toDateOrNull {#todateornull} - -## toDateOrDefault {#todateordefault} - -Конвертирует аргумент в значение [Date](/docs/ru/sql-reference/data-types/date.md) data type. -Если получен недопустимый аргумент, то возвращает значение по умолчанию (нижняя граница [Date](/docs/ru/sql-reference/data-types/date.md). Значение по умолчанию может быть указано вторым аргументом. -Похожа на [toDate](#todate). +Конвертирует аргумент в значение [Date](/docs/ru/sql-reference/data-types/date.md). **Синтаксис** ``` sql -toDateOrDefault(expr [, default_value]) +toDate(expr) ``` **Аргументы** -- `expr` — Значение для преобразования. [String](/docs/ru/sql-reference/data-types/string.md), [Int](/docs/ru/sql-reference/data-types/int-uint.md), [Date](/docs/ru/sql-reference/data-types/date.md) или [DateTime](/docs/ru/sql-reference/data-types/datetime.md). -- `default_value` — Значение по умолчанию. [Date](/docs/ru/sql-reference/data-types/date.md) +- `expr` — Значение для преобразования. [String](/docs/ru/sql-reference/data-types/string.md), [Int](/docs/ru/sql-reference/data-types/int-uint.md), [Date](/docs/ru/sql-reference/data-types/date.md) или [DateTime](/docs/ru/sql-reference/data-types/datetime.md). Если `expr` является числом выглядит как UNIX timestamp (больше чем 65535), оно интерпретируется как DateTime, затем обрезается до Date учитывавая текущую часовой пояс. Если `expr` является числом и меньше чем 65536, оно интерпретируется как количество дней с 1970-01-01. @@ -199,46 +188,101 @@ toDateOrDefault(expr [, default_value]) Запрос: ``` sql -SELECT - toDateOrDefault('2021-01-01', '2023-01-01'::Date), - toDateOrDefault('xx2021-01-01', '2023-01-01'::Date); +SELECT toDate('2022-12-30'), toDate(1685457500); ``` Результат: ```response -┌─toDateOrDefault('2021-01-01', CAST('2023-01-01', 'Date'))─┬─toDateOrDefault('xx2021-01-01', CAST('2023-01-01', 'Date'))─┐ -│ 2021-01-01 │ 2023-01-01 │ -└───────────────────────────────────────────────────────────┴─────────────────────────────────────────────────────────────┘ +┌─toDate('2022-12-30')─┬─toDate(1685457500)─┐ +│ 2022-12-30 │ 2023-05-30 │ +└──────────────────────┴────────────────────┘ ``` -**Смотрите также** -- [toDate](#todate) -- [toDate32OrDefault](#todate32ordefault) -## toDateTime {#todatetime} +## toDateOrZero {#todateorzero} -## toDateTimeOrZero {#todatetimeorzero} +Как [toDate](#todate), но в случае неудачи возвращает нижнюю границу [Date](/docs/ru/sql-reference/data-types/date.md)). Поддерживается только аргумент типа [String](/docs/ru/sql-reference/data-types/string.md). -## toDateTimeOrNull {#todatetimeornull} +**Пример** -## toDateTimeOrDefault {#todatetimeordefault} +Запрос: -Конвертирует аргумент в значение [DateTime](/docs/ru/sql-reference/data-types/datetime.md). -Если получен недопустимый аргумент, то возвращает значение по умолчанию (нижняя граница [DateTime](/docs/ru/sql-reference/data-types/datetime.md)). Значение по умолчанию может быть указано третьим аргументом. -Похожа на [toDateTime](#todatetime). +``` sql +SELECT toDateOrZero('2022-12-30'), toDateOrZero(''); +``` + +Результат: + +```response +┌─toDateOrZero('2022-12-30')─┬─toDateOrZero('')─┐ +│ 2022-12-30 │ 1970-01-01 │ +└────────────────────────────┴──────────────────┘ +``` + + +## toDateOrNull {#todateornull} + +Как [toDate](#todate), но в случае неудачи возвращает `NULL`. Поддерживается только аргумент типа [String](/docs/ru/sql-reference/data-types/string.md). + +**Пример** + +Запрос: + +``` sql +SELECT toDateOrNull('2022-12-30'), toDateOrNull(''); +``` + +Результат: + +```response +┌─toDateOrNull('2022-12-30')─┬─toDateOrNull('')─┐ +│ 2022-12-30 │ ᴺᵁᴸᴸ │ +└────────────────────────────┴──────────────────┘ +``` + + +## toDateOrDefault {#todateordefault} + +Как [toDate](#todate), но в случае неудачи возвращает значение по умолчанию (или второй аргумент (если указан), или нижняя граница [Date](/docs/ru/sql-reference/data-types/date.md)). **Синтаксис** ``` sql -toDateTimeOrDefault(expr, [, time_zone [, default_value]]) +toDateOrDefault(expr [, default_value]) +``` + +**Пример** + +Запрос: + +``` sql +SELECT toDateOrDefault('2022-12-30'), toDateOrDefault('', '2023-01-01'::Date); +``` + +Результат: + +```response +┌─toDateOrDefault('2022-12-30')─┬─toDateOrDefault('', CAST('2023-01-01', 'Date'))─┐ +│ 2022-12-30 │ 2023-01-01 │ +└───────────────────────────────┴─────────────────────────────────────────────────┘ +``` + + +## toDateTime {#todatetime} + +Конвертирует аргумент в значение [DateTime](/docs/ru/sql-reference/data-types/datetime.md). + +**Синтаксис** + +``` sql +toDateTime(expr[, time_zone ]) ``` **Аргументы** -- `expr` — Значение для преобразования. [String](/docs/ru/sql-reference/data-types/string.md), [Int](/docs/ru/sql-reference/data-types/int-uint.md), [Date](/docs/ru/sql-reference/data-types/date.md) или [DateTime](/docs/ru/sql-reference/data-types/datetime.md). +- `expr` — Значение для преобразования. [String](/docs/ru/sql-reference/data-types/string.md), [Int](/docs/ru/sql-reference/data-types/int-uint.md), [Date](/docs/ru/sql-reference/data-types/date.md) или [DateTime](/docs/ru/sql-reference/data-types/datetime.md). - `time_zone` — Часовой пояс. [String](/docs/ru/sql-reference/data-types/string.md). -- `default_value` — Значение по умолчанию. [DateTime](/docs/ru/sql-reference/data-types/datetime.md) Если `expr` является числом, оно интерпретируется как количество секунд от начала unix эпохи. @@ -251,21 +295,86 @@ toDateTimeOrDefault(expr, [, time_zone [, default_value]]) Запрос: ``` sql -SELECT - toDateTimeOrDefault('2021-01-01', 'UTC', '2023-01-01'::DateTime('UTC')), - toDateTimeOrDefault('xx2021-01-01', 'UTC', '2023-01-01'::DateTime('UTC')); +SELECT toDateTime('2022-12-30 13:44:17'), toDateTime(1685457500, 'UTC'); ``` Результат: ```response -┌─toDateTimeOrDefault('2021-01-01', 'UTC', CAST('2023-01-01', 'DateTime(\'UTC\')'))─┬─toDateTimeOrDefault('xx2021-01-01', 'UTC', CAST('2023-01-01', 'DateTime(\'UTC\')'))─┐ -│ 2021-01-01 00:00:00 │ 2023-01-01 00:00:00 │ -└───────────────────────────────────────────────────────────────────────────────────┴─────────────────────────────────────────────────────────────────────────────────────┘ +┌─toDateTime('2022-12-30 13:44:17')─┬─toDateTime(1685457500, 'UTC')─┐ +│ 2022-12-30 13:44:17 │ 2023-05-30 14:38:20 │ +└───────────────────────────────────┴───────────────────────────────┘ +``` + + +## toDateTimeOrZero {#todatetimeorzero} + +Как [toDateTime](#todatetime), но в случае неудачи возвращает нижнюю границу [DateTime](/docs/ru/sql-reference/data-types/datetime.md)). Поддерживается только аргумент типа [String](/docs/ru/sql-reference/data-types/string.md). + +**Пример** + +Запрос: + +``` sql +SELECT toDateTimeOrZero('2022-12-30 13:44:17'), toDateTimeOrZero(''); +``` + +Результат: + +```response +┌─toDateTimeOrZero('2022-12-30 13:44:17')─┬─toDateTimeOrZero('')─┐ +│ 2022-12-30 13:44:17 │ 1970-01-01 00:00:00 │ +└─────────────────────────────────────────┴──────────────────────┘ +``` + + +## toDateTimeOrNull {#todatetimeornull} + +Как [toDateTime](#todatetime), но в случае неудачи возвращает `NULL`. Поддерживается только аргумент типа [String](/docs/ru/sql-reference/data-types/string.md). + +**Example** + +Query: + +``` sql +SELECT toDateTimeOrNull('2022-12-30 13:44:17'), toDateTimeOrNull(''); +``` + +Result: + +```response +┌─toDateTimeOrNull('2022-12-30 13:44:17')─┬─toDateTimeOrNull('')─┐ +│ 2022-12-30 13:44:17 │ ᴺᵁᴸᴸ │ +└─────────────────────────────────────────┴──────────────────────┘ +``` + + +## toDateTimeOrDefault {#todatetimeordefault} + +Как [toDateTime](#todatetime), но в случае неудачи возвращает значение по умолчанию (или третий аргумент (если указан), или нижняя граница [DateTime](/docs/ru/sql-reference/data-types/datetime.md)). + +**Синтаксис** + +``` sql +toDateTimeOrDefault(expr, [, time_zone [, default_value]]) +``` + +**Пример** + +Запрос: + +``` sql +SELECT toDateTimeOrDefault('2022-12-30 13:44:17'), toDateTimeOrDefault('', 'UTC', '2023-01-01'::DateTime('UTC')); +``` + +Результат: + +```response +┌─toDateTimeOrDefault('2022-12-30 13:44:17')─┬─toDateTimeOrDefault('', 'UTC', CAST('2023-01-01', 'DateTime(\'UTC\')'))─┐ +│ 2022-12-30 13:44:17 │ 2023-01-01 00:00:00 │ +└────────────────────────────────────────────┴─────────────────────────────────────────────────────────────────────────┘ ``` -**Смотрите также** -- [toDateTime](#todatetime) ## toDate32 {#todate32} @@ -387,6 +496,14 @@ SELECT └─────────────────────────────────────────────────────────┴───────────────────────────────────────────────────────────┘ ``` +## toDateTime64 + +## toDateTime64OrZero + +## toDateTime64OrNull + +## toDateTime64OrDefault + ## toDecimal(32\|64\|128\|256) {#todecimal3264128} Преобразует `value` к типу данных [Decimal](../../sql-reference/functions/type-conversion-functions.md) с точностью `S`. `value` может быть числом или строкой. Параметр `S` (scale) задаёт число десятичных знаков. From 7ac4349d969819b59555a47d42e92ae070c2faa7 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 9 Jun 2023 11:21:10 +0200 Subject: [PATCH 0690/1072] Hacking azure function --- .../table-functions/azure_blob_storage.md | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 docs/en/sql-reference/table-functions/azure_blob_storage.md diff --git a/docs/en/sql-reference/table-functions/azure_blob_storage.md b/docs/en/sql-reference/table-functions/azure_blob_storage.md new file mode 100644 index 00000000000..f86307b3b85 --- /dev/null +++ b/docs/en/sql-reference/table-functions/azure_blob_storage.md @@ -0,0 +1,11 @@ +--- +slug: /en/sql-reference/table-functions/azure_blob_storage +sidebar_position: 45 +sidebar_label: azure_blob_storage +keywords: [azure blob storage] +--- + +# azure\_blob\_storage Table Function + +Provides a table-like interface to select/insert files in [Azure Blob Storage](https://azure.microsoft.com/en-us/products/storage/blobs). This table function is similar to the [s3 function](../../sql-reference/table-functions/s3.md). + From ab8365630b3ee50120e67664ca4ecbab1afcc4c3 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Fri, 9 Jun 2023 09:19:21 +0000 Subject: [PATCH 0691/1072] Reject logs containing unknown operation --- contrib/NuRaft | 2 +- src/Coordination/Changelog.cpp | 3 +- src/Coordination/Changelog.h | 2 +- src/Coordination/KeeperServer.cpp | 39 +++++++++++++++++++++---- src/Coordination/KeeperStateMachine.cpp | 5 ++-- 5 files changed, 39 insertions(+), 12 deletions(-) diff --git a/contrib/NuRaft b/contrib/NuRaft index b56784be1ae..f43d10dbc97 160000 --- a/contrib/NuRaft +++ b/contrib/NuRaft @@ -1 +1 @@ -Subproject commit b56784be1aec568fb72aff47f281097c017623cb +Subproject commit f43d10dbc977a63f11dfb3afdd010fcf7ad89950 diff --git a/src/Coordination/Changelog.cpp b/src/Coordination/Changelog.cpp index 894fd93cfa7..c0dfbc2cbc3 100644 --- a/src/Coordination/Changelog.cpp +++ b/src/Coordination/Changelog.cpp @@ -13,6 +13,7 @@ #include #include #include +#include namespace DB @@ -479,7 +480,7 @@ public: continue; /// Create log entry for read data - auto log_entry = nuraft::cs_new(record.header.term, record.blob, record.header.value_type); + auto log_entry = nuraft::cs_new(record.header.term, record.blob, static_cast(record.header.value_type)); if (result.first_read_index == 0) result.first_read_index = record.header.index; diff --git a/src/Coordination/Changelog.h b/src/Coordination/Changelog.h index 56b0475ba8b..3c09370182d 100644 --- a/src/Coordination/Changelog.h +++ b/src/Coordination/Changelog.h @@ -39,7 +39,7 @@ struct ChangelogRecordHeader ChangelogVersion version = CURRENT_CHANGELOG_VERSION; uint64_t index = 0; /// entry log number uint64_t term = 0; - nuraft::log_val_type value_type{}; + int32_t value_type{}; uint64_t blob_size = 0; }; diff --git a/src/Coordination/KeeperServer.cpp b/src/Coordination/KeeperServer.cpp index 45db9e85fa5..6e47412cd3a 100644 --- a/src/Coordination/KeeperServer.cpp +++ b/src/Coordination/KeeperServer.cpp @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -607,12 +608,30 @@ nuraft::cb_func::ReturnCode KeeperServer::callbackFunc(nuraft::cb_func::Type typ } } + const auto follower_preappend = [&](const auto & entry) + { + if (entry->get_val_type() != nuraft::app_log) + return nuraft::cb_func::ReturnCode::Ok; + + try + { + state_machine->parseRequest(entry->get_buf(), /*final=*/false); + } + catch (...) + { + tryLogCurrentException(log, "Failed to parse request from log entry"); + throw; + } + return nuraft::cb_func::ReturnCode::Ok; + + }; + if (initialized_flag) { switch (type) { // This event is called before a single log is appended to the entry on the leader node - case nuraft::cb_func::PreAppendLog: + case nuraft::cb_func::PreAppendLogLeader: { // we are relying on the fact that request are being processed under a mutex // and not a RW lock @@ -665,7 +684,12 @@ nuraft::cb_func::ReturnCode KeeperServer::callbackFunc(nuraft::cb_func::Type typ if (request_for_session->digest->version != KeeperStorage::NO_DIGEST) writeIntBinary(request_for_session->digest->value, write_buf); - break; + return nuraft::cb_func::ReturnCode::Ok; + } + case nuraft::cb_func::PreAppendLogFollower: + { + const auto & entry = *static_cast(param->ctx); + return follower_preappend(entry); } case nuraft::cb_func::AppendLogFailed: { @@ -678,13 +702,11 @@ nuraft::cb_func::ReturnCode KeeperServer::callbackFunc(nuraft::cb_func::Type typ auto & entry_buf = entry->get_buf(); auto request_for_session = state_machine->parseRequest(entry_buf, true); state_machine->rollbackRequest(*request_for_session, true); - break; + return nuraft::cb_func::ReturnCode::Ok; } default: - break; + return nuraft::cb_func::ReturnCode::Ok; } - - return nuraft::cb_func::ReturnCode::Ok; } size_t last_commited = state_machine->last_commit_index(); @@ -737,6 +759,11 @@ nuraft::cb_func::ReturnCode KeeperServer::callbackFunc(nuraft::cb_func::Type typ initial_batch_committed = true; return nuraft::cb_func::ReturnCode::Ok; } + case nuraft::cb_func::PreAppendLogFollower: + { + const auto & entry = *static_cast(param->ctx); + return follower_preappend(entry); + } default: /// ignore other events return nuraft::cb_func::ReturnCode::Ok; } diff --git a/src/Coordination/KeeperStateMachine.cpp b/src/Coordination/KeeperStateMachine.cpp index 65abee44050..7d251ad48b9 100644 --- a/src/Coordination/KeeperStateMachine.cpp +++ b/src/Coordination/KeeperStateMachine.cpp @@ -272,9 +272,8 @@ bool KeeperStateMachine::preprocess(const KeeperStorage::RequestForSession & req } catch (...) { - tryLogCurrentException(__PRETTY_FUNCTION__); - rollbackRequestNoLock(request_for_session, true); - throw; + tryLogCurrentException(__PRETTY_FUNCTION__, "Failed to preprocess stored log, aborting to avoid inconsistent state"); + std::abort(); } if (keeper_context->digest_enabled && request_for_session.digest) From de70e322cf93d5f10a01bbc7d7aa8f4798755214 Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Fri, 9 Jun 2023 10:29:44 +0000 Subject: [PATCH 0692/1072] Fix by pull request comments --- src/Functions/DateTimeTransforms.h | 35 ++++++++--------- src/Functions/FunctionsConversion.h | 2 +- .../01556_accurate_cast_or_null.reference | 2 + .../01556_accurate_cast_or_null.sql | 2 + .../0_stateless/01601_accurate_cast.sql | 38 +++++++++---------- 5 files changed, 40 insertions(+), 39 deletions(-) diff --git a/src/Functions/DateTimeTransforms.h b/src/Functions/DateTimeTransforms.h index 09b0d71daf8..1d3ec1bd368 100644 --- a/src/Functions/DateTimeTransforms.h +++ b/src/Functions/DateTimeTransforms.h @@ -1436,7 +1436,7 @@ struct Transformer { template static void vector(const FromTypeVector & vec_from, ToTypeVector & vec_to, const DateLUTImpl & time_zone, const Transform & transform, - ColumnUInt8::Container * vec_null_map_to [[maybe_unused]]) + [[maybe_unused]] ColumnUInt8::Container * vec_null_map_to) { using ValueType = typename ToTypeVector::value_type; size_t size = vec_from.size(); @@ -1444,29 +1444,26 @@ struct Transformer for (size_t i = 0; i < size; ++i) { - if constexpr (std::is_same_v - || std::is_same_v) + if constexpr (std::is_same_v || std::is_same_v) { - bool check_range_result = true; - - if constexpr (std::is_same_v || std::is_same_v) + if constexpr (std::is_same_v + || std::is_same_v) { - check_range_result = vec_from[i] >= 0 && vec_from[i] <= 0xFFFFFFFFL; - } + bool is_valid_input = vec_from[i] >= 0 && vec_from[i] <= 0xFFFFFFFFL; - if (!check_range_result) - { - if constexpr (std::is_same_v) + if (!is_valid_input) { - vec_to[i] = 0; - if (vec_null_map_to) + if constexpr (std::is_same_v) + { + vec_to[i] = 0; (*vec_null_map_to)[i] = true; - continue; - } - else - { - throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, "Value in column {} cannot be safely converted into type {}", - TypeName, TypeName); + continue; + } + else + { + throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, "Value in column {} cannot be safely converted into type {}", + TypeName, TypeName); + } } } } diff --git a/src/Functions/FunctionsConversion.h b/src/Functions/FunctionsConversion.h index 3a8ddcc9094..ea8efada21d 100644 --- a/src/Functions/FunctionsConversion.h +++ b/src/Functions/FunctionsConversion.h @@ -2885,7 +2885,7 @@ private: if constexpr (IsDataTypeNumber && (std::is_same_v || std::is_same_v)) { - if (wrapper_cast_type == CastType::accurate) + if (wrapper_cast_type == CastType::accurate) { result_column = ConvertImpl::template execute( arguments, result_type, input_rows_count); diff --git a/tests/queries/0_stateless/01556_accurate_cast_or_null.reference b/tests/queries/0_stateless/01556_accurate_cast_or_null.reference index 21faa830636..3bff125068a 100644 --- a/tests/queries/0_stateless/01556_accurate_cast_or_null.reference +++ b/tests/queries/0_stateless/01556_accurate_cast_or_null.reference @@ -35,6 +35,8 @@ \N 2023-05-30 14:38:20 1970-01-01 00:00:19 +1970-01-01 19:26:40 +\N \N \N 2023-05-30 diff --git a/tests/queries/0_stateless/01556_accurate_cast_or_null.sql b/tests/queries/0_stateless/01556_accurate_cast_or_null.sql index 3f57358576e..3266198d930 100644 --- a/tests/queries/0_stateless/01556_accurate_cast_or_null.sql +++ b/tests/queries/0_stateless/01556_accurate_cast_or_null.sql @@ -41,8 +41,10 @@ SELECT accurateCastOrNull(5000000000, 'DateTime'); SELECT accurateCastOrNull('1xxx', 'DateTime'); select toString(accurateCastOrNull('2023-05-30 14:38:20', 'DateTime'), timezone()); SELECT toString(accurateCastOrNull(19, 'DateTime'), 'UTC'); +SELECT toString(accurateCastOrNull(70000, 'DateTime'), 'UTC'); SELECT accurateCastOrNull(-1, 'Date'); +SELECT accurateCastOrNull(5000000000, 'Date'); SELECT accurateCastOrNull('1xxx', 'Date'); SELECT accurateCastOrNull('2023-05-30', 'Date'); SELECT accurateCastOrNull(19, 'Date'); diff --git a/tests/queries/0_stateless/01601_accurate_cast.sql b/tests/queries/0_stateless/01601_accurate_cast.sql index 5555129f0ad..2108e42df05 100644 --- a/tests/queries/0_stateless/01601_accurate_cast.sql +++ b/tests/queries/0_stateless/01601_accurate_cast.sql @@ -1,36 +1,36 @@ -SELECT accurateCast(-1, 'UInt8'); -- { serverError 70 } +SELECT accurateCast(-1, 'UInt8'); -- { serverError CANNOT_CONVERT_TYPE } SELECT accurateCast(5, 'UInt8'); -SELECT accurateCast(257, 'UInt8'); -- { serverError 70 } -SELECT accurateCast(-1, 'UInt16'); -- { serverError 70 } +SELECT accurateCast(257, 'UInt8'); -- { serverError CANNOT_CONVERT_TYPE } +SELECT accurateCast(-1, 'UInt16'); -- { serverError CANNOT_CONVERT_TYPE } SELECT accurateCast(5, 'UInt16'); -SELECT accurateCast(65536, 'UInt16'); -- { serverError 70 } -SELECT accurateCast(-1, 'UInt32'); -- { serverError 70 } +SELECT accurateCast(65536, 'UInt16'); -- { serverError CANNOT_CONVERT_TYPE } +SELECT accurateCast(-1, 'UInt32'); -- { serverError CANNOT_CONVERT_TYPE } SELECT accurateCast(5, 'UInt32'); -SELECT accurateCast(4294967296, 'UInt32'); -- { serverError 70 } -SELECT accurateCast(-1, 'UInt64'); -- { serverError 70 } +SELECT accurateCast(4294967296, 'UInt32'); -- { serverError CANNOT_CONVERT_TYPE } +SELECT accurateCast(-1, 'UInt64'); -- { serverError CANNOT_CONVERT_TYPE } SELECT accurateCast(5, 'UInt64'); -SELECT accurateCast(-1, 'UInt256'); -- { serverError 70 } +SELECT accurateCast(-1, 'UInt256'); -- { serverError CANNOT_CONVERT_TYPE } SELECT accurateCast(5, 'UInt256'); -SELECT accurateCast(-129, 'Int8'); -- { serverError 70 } +SELECT accurateCast(-129, 'Int8'); -- { serverError CANNOT_CONVERT_TYPE } SELECT accurateCast(5, 'Int8'); -SELECT accurateCast(128, 'Int8'); -- { serverError 70 } +SELECT accurateCast(128, 'Int8'); -- { serverError CANNOT_CONVERT_TYPE } -SELECT accurateCast(10, 'Decimal32(9)'); -- { serverError 407 } +SELECT accurateCast(10, 'Decimal32(9)'); -- { serverError DECIMAL_OVERFLOW } SELECT accurateCast(1, 'Decimal32(9)'); -SELECT accurateCast(-10, 'Decimal32(9)'); -- { serverError 407 } +SELECT accurateCast(-10, 'Decimal32(9)'); -- { serverError DECIMAL_OVERFLOW } -SELECT accurateCast('123', 'FixedString(2)'); -- { serverError 131 } +SELECT accurateCast('123', 'FixedString(2)'); -- { serverError TOO_LARGE_STRING_SIZE } SELECT accurateCast('12', 'FixedString(2)'); -SELECT accurateCast(-1, 'DateTime'); -- { serverError 70 } -SELECT accurateCast(0xFFFFFFFF + 1, 'DateTime'); -- { serverError 70 } -SELECT accurateCast('1xxx', 'DateTime'); -- { serverError 41 } +SELECT accurateCast(-1, 'DateTime'); -- { serverError CANNOT_CONVERT_TYPE } +SELECT accurateCast(0xFFFFFFFF + 1, 'DateTime'); -- { serverError CANNOT_CONVERT_TYPE } +SELECT accurateCast('1xxx', 'DateTime'); -- { serverError CANNOT_PARSE_DATETIME } SELECT accurateCast('2023-05-30 14:38:20', 'DateTime'); SELECT toString(accurateCast(19, 'DateTime'), 'UTC'); -SELECT accurateCast(-1, 'Date'); -- { serverError 70 } -SELECT accurateCast(0xFFFFFFFF + 1, 'Date'); -- { serverError 70 } -SELECT accurateCast('1xxx', 'Date'); -- { serverError 38 } +SELECT accurateCast(-1, 'Date'); -- { serverError CANNOT_CONVERT_TYPE } +SELECT accurateCast(0xFFFFFFFF + 1, 'Date'); -- { serverError CANNOT_CONVERT_TYPE } +SELECT accurateCast('1xxx', 'Date'); -- { serverError CANNOT_PARSE_DATE } SELECT accurateCast('2023-05-30', 'Date'); SELECT accurateCast(19, 'Date'); From 7a02a70ad4239e920c6e23b9bc2bcc0a5c5db58b Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Fri, 9 Jun 2023 11:11:49 +0000 Subject: [PATCH 0693/1072] Add value to exceptions text --- src/Functions/DateTimeTransforms.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Functions/DateTimeTransforms.h b/src/Functions/DateTimeTransforms.h index 1d3ec1bd368..019e0c42cde 100644 --- a/src/Functions/DateTimeTransforms.h +++ b/src/Functions/DateTimeTransforms.h @@ -1461,8 +1461,8 @@ struct Transformer } else { - throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, "Value in column {} cannot be safely converted into type {}", - TypeName, TypeName); + throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, "Value {} cannot be safely converted into type {}", + vec_from[i], TypeName); } } } From 16a6190446e7cfc35bfc13f1077c6f32ba430184 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 9 Jun 2023 13:25:50 +0200 Subject: [PATCH 0694/1072] More strict build Aarch64 --- contrib/azure-cmake/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/azure-cmake/CMakeLists.txt b/contrib/azure-cmake/CMakeLists.txt index 9c361db47ca..968882dbfcd 100644 --- a/contrib/azure-cmake/CMakeLists.txt +++ b/contrib/azure-cmake/CMakeLists.txt @@ -1,6 +1,6 @@ option (ENABLE_AZURE_BLOB_STORAGE "Enable Azure blob storage" ${ENABLE_LIBRARIES}) -if (NOT ENABLE_AZURE_BLOB_STORAGE OR BUILD_STANDALONE_KEEPER OR OS_FREEBSD OR ARCH_PPC64LE) +if (NOT ENABLE_AZURE_BLOB_STORAGE OR BUILD_STANDALONE_KEEPER OR OS_FREEBSD OR NOT ARCH_AMD64) message(STATUS "Not using Azure blob storage") return() endif() From 430a19bb80e497d8c4f5d02af604d78a3d8016c5 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 9 Jun 2023 13:26:53 +0200 Subject: [PATCH 0695/1072] Better --- contrib/azure-cmake/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/azure-cmake/CMakeLists.txt b/contrib/azure-cmake/CMakeLists.txt index 968882dbfcd..887122e7653 100644 --- a/contrib/azure-cmake/CMakeLists.txt +++ b/contrib/azure-cmake/CMakeLists.txt @@ -1,6 +1,6 @@ option (ENABLE_AZURE_BLOB_STORAGE "Enable Azure blob storage" ${ENABLE_LIBRARIES}) -if (NOT ENABLE_AZURE_BLOB_STORAGE OR BUILD_STANDALONE_KEEPER OR OS_FREEBSD OR NOT ARCH_AMD64) +if (NOT ENABLE_AZURE_BLOB_STORAGE OR BUILD_STANDALONE_KEEPER OR OS_FREEBSD OR (NOT ARCH_AMD64)) message(STATUS "Not using Azure blob storage") return() endif() From f437d5d8b51b69347eecb79a0af878bfd2707d9d Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Fri, 9 Jun 2023 11:27:27 +0000 Subject: [PATCH 0696/1072] Fix toDateTimeOrZero description --- docs/en/sql-reference/functions/type-conversion-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index c634a3da27e..28db7e6e677 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -424,7 +424,7 @@ Result: ## toDateTimeOrZero -The same as [toDateTime](#todate) but returns lower boundery of [Date](/docs/en/sql-reference/data-types/date.md) if an invalid argument is received. Only [String](/docs/en/sql-reference/data-types/string.md) argument is supported. +The same as [toDateTime](#todatetime) but returns lower boundery of [DateTime](/docs/en/sql-reference/data-types/datetime.md) if an invalid argument is received. Only [String](/docs/en/sql-reference/data-types/string.md) argument is supported. **Example** From 8f9c74debb002b74180ad534e828714b49bba44a Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Fri, 9 Jun 2023 14:31:49 +0300 Subject: [PATCH 0697/1072] Support OPTIMIZE for temporary tables (#50710) * Update InterpreterOptimizeQuery.cpp * Update 02525_different_engines_in_temporary_tables.sql * Update MergeTreeData.cpp --- src/Interpreters/InterpreterOptimizeQuery.cpp | 2 +- src/Storages/MergeTree/MergeTreeData.cpp | 2 +- .../02525_different_engines_in_temporary_tables.sql | 4 +++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/Interpreters/InterpreterOptimizeQuery.cpp b/src/Interpreters/InterpreterOptimizeQuery.cpp index a4507391c4e..ae456e8b31d 100644 --- a/src/Interpreters/InterpreterOptimizeQuery.cpp +++ b/src/Interpreters/InterpreterOptimizeQuery.cpp @@ -34,7 +34,7 @@ BlockIO InterpreterOptimizeQuery::execute() getContext()->checkAccess(getRequiredAccess()); - auto table_id = getContext()->resolveStorageID(ast, Context::ResolveOrdinary); + auto table_id = getContext()->resolveStorageID(ast); StoragePtr table = DatabaseCatalog::instance().getTable(table_id, getContext()); checkStorageSupportsTransactionsIfNeeded(table, getContext()); auto metadata_snapshot = table->getInMemoryMetadataPtr(); diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 2f6870f8b41..9cca471fddb 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -4789,7 +4789,7 @@ void MergeTreeData::checkAlterPartitionIsPossible( if (partition_ast && partition_ast->all) { if (command.type != PartitionCommand::DROP_PARTITION) - throw DB::Exception(ErrorCodes::SUPPORT_IS_DISABLED, "Only support DETACH PARTITION ALL currently"); + throw DB::Exception(ErrorCodes::SUPPORT_IS_DISABLED, "Only support DROP/DETACH PARTITION ALL currently"); } else getPartitionIDFromQuery(command.partition, getContext()); diff --git a/tests/queries/0_stateless/02525_different_engines_in_temporary_tables.sql b/tests/queries/0_stateless/02525_different_engines_in_temporary_tables.sql index 7ebc05dfece..58e9ecab30c 100644 --- a/tests/queries/0_stateless/02525_different_engines_in_temporary_tables.sql +++ b/tests/queries/0_stateless/02525_different_engines_in_temporary_tables.sql @@ -7,7 +7,9 @@ CREATE TEMPORARY TABLE table_merge_tree_02525 ENGINE = MergeTree ORDER BY id PRIMARY KEY id; -INSERT INTO table_merge_tree_02525 VALUES (1, 'a'), (2, 'b'), (3, 'c'); +INSERT INTO table_merge_tree_02525 VALUES (1, 'a'), (2, 'b'); +INSERT INTO table_merge_tree_02525 VALUES (3, 'c'); +OPTIMIZE TABLE table_merge_tree_02525 FINAL; SELECT * FROM table_merge_tree_02525; -- Check that temporary table with MergeTree is not sent to remote servers -- The query with remote() should not fail From ad74189bc2ed4039b0cf129928141e13f6db435b Mon Sep 17 00:00:00 2001 From: flynn Date: Fri, 9 Jun 2023 19:32:45 +0800 Subject: [PATCH 0698/1072] Fix Log family table return wrong rows count after truncate (#50585) * Fix Log family table return wrong rows count after truncate * fix test * update test * update test --- src/Storages/StorageLog.cpp | 2 ++ src/Storages/StorageStripeLog.cpp | 2 ++ .../02771_log_faminy_truncate_count.reference | 2 ++ .../02771_log_faminy_truncate_count.sql | 26 +++++++++++++++++++ 4 files changed, 32 insertions(+) create mode 100644 tests/queries/0_stateless/02771_log_faminy_truncate_count.reference create mode 100644 tests/queries/0_stateless/02771_log_faminy_truncate_count.sql diff --git a/src/Storages/StorageLog.cpp b/src/Storages/StorageLog.cpp index ac68de43332..d8065b8bb3c 100644 --- a/src/Storages/StorageLog.cpp +++ b/src/Storages/StorageLog.cpp @@ -775,6 +775,8 @@ void StorageLog::truncate(const ASTPtr &, const StorageMetadataPtr &, ContextPtr marks_loaded = true; num_marks_saved = 0; + total_rows = 0; + total_bytes = 0; getContext()->dropMMappedFileCache(); } diff --git a/src/Storages/StorageStripeLog.cpp b/src/Storages/StorageStripeLog.cpp index 5c704d877d1..d8bbd523cbf 100644 --- a/src/Storages/StorageStripeLog.cpp +++ b/src/Storages/StorageStripeLog.cpp @@ -424,6 +424,8 @@ void StorageStripeLog::truncate(const ASTPtr &, const StorageMetadataPtr &, Cont indices_loaded = true; num_indices_saved = 0; + total_rows = 0; + total_bytes = 0; getContext()->dropMMappedFileCache(); } diff --git a/tests/queries/0_stateless/02771_log_faminy_truncate_count.reference b/tests/queries/0_stateless/02771_log_faminy_truncate_count.reference new file mode 100644 index 00000000000..aa47d0d46d4 --- /dev/null +++ b/tests/queries/0_stateless/02771_log_faminy_truncate_count.reference @@ -0,0 +1,2 @@ +0 +0 diff --git a/tests/queries/0_stateless/02771_log_faminy_truncate_count.sql b/tests/queries/0_stateless/02771_log_faminy_truncate_count.sql new file mode 100644 index 00000000000..3fb22837f5b --- /dev/null +++ b/tests/queries/0_stateless/02771_log_faminy_truncate_count.sql @@ -0,0 +1,26 @@ +DROP TABLE IF EXISTS test_log; +CREATE TABLE test_log +( + `crypto_name` String, + `trade_date` Date +) +ENGINE = Log; + +INSERT INTO test_log (crypto_name, trade_date) VALUES ('abc', '2021-01-01'), ('def', '2022-02-02'); + +TRUNCATE TABLE test_log; +SELECT count() FROM test_log; + +DROP TABLE IF EXISTS test_log; +CREATE TABLE test_log +( + `crypto_name` String, + `trade_date` Date +) +ENGINE = StripeLog; + +INSERT INTO test_log (crypto_name, trade_date) VALUES ('abc', '2021-01-01'), ('def', '2022-02-02'); + +TRUNCATE TABLE test_log; +SELECT count() FROM test_log; +DROP TABLE test_log; From bc2af59278bdd0addeda2bdaedff411117f23f04 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 9 Jun 2023 13:38:30 +0200 Subject: [PATCH 0699/1072] Update autogenerated version to 23.6.1.1 and contributors --- cmake/autogenerated_versions.txt | 10 ++-- .../StorageSystemContributors.generated.cpp | 49 +++++++++++++++++++ 2 files changed, 54 insertions(+), 5 deletions(-) diff --git a/cmake/autogenerated_versions.txt b/cmake/autogenerated_versions.txt index 462529fbc13..015037b2de6 100644 --- a/cmake/autogenerated_versions.txt +++ b/cmake/autogenerated_versions.txt @@ -2,11 +2,11 @@ # NOTE: has nothing common with DBMS_TCP_PROTOCOL_VERSION, # only DBMS_TCP_PROTOCOL_VERSION should be incremented on protocol changes. -SET(VERSION_REVISION 54474) +SET(VERSION_REVISION 54475) SET(VERSION_MAJOR 23) -SET(VERSION_MINOR 5) +SET(VERSION_MINOR 6) SET(VERSION_PATCH 1) -SET(VERSION_GITHASH 3920eb987f7ed837ada5de8907284adf123f0583) -SET(VERSION_DESCRIBE v23.5.1.1-testing) -SET(VERSION_STRING 23.5.1.1) +SET(VERSION_GITHASH 2fec796e73efda10a538a03af3205ce8ffa1b2de) +SET(VERSION_DESCRIBE v23.6.1.1-testing) +SET(VERSION_STRING 23.6.1.1) # end of autochange diff --git a/src/Storages/System/StorageSystemContributors.generated.cpp b/src/Storages/System/StorageSystemContributors.generated.cpp index 0f307650c9c..f83ee3197fe 100644 --- a/src/Storages/System/StorageSystemContributors.generated.cpp +++ b/src/Storages/System/StorageSystemContributors.generated.cpp @@ -83,6 +83,8 @@ const char * auto_contributors[] { "Alexey Boykov", "Alexey Dushechkin", "Alexey Elymanov", + "Alexey Gerasimchuck", + "Alexey Gerasimchuk", "Alexey Gusev", "Alexey Ilyukhov", "Alexey Ivanov", @@ -143,6 +145,7 @@ const char * auto_contributors[] { "Anmol Arora", "Anna", "Anna Shakhova", + "AnneClickHouse", "Anselmo D. Adams", "Anthony N. Simon", "Anton Ivashkin", @@ -305,6 +308,7 @@ const char * auto_contributors[] { "Dr. Strange Looker", "Duc Canh Le", "DuckSoft", + "Duyet Le", "Egor O'Sten", "Egor Savin", "Eirik", @@ -321,6 +325,7 @@ const char * auto_contributors[] { "Eric", "Eric Daniel", "Eric Thomas", + "Eridanus", "Erixonich", "Ernest Poletaev", "Eugene Galkin", @@ -347,6 +352,7 @@ const char * auto_contributors[] { "Federico Ceratto", "Federico Rodriguez", "FeehanG", + "Feng Kaiyu", "FgoDt", "Filatenkov Artur", "Filipe Caixeta", @@ -444,6 +450,7 @@ const char * auto_contributors[] { "Ivan Milov", "Ivan Remen", "Ivan Starkov", + "Ivan Takarlikov", "Ivan Zhukov", "Jachen Duschletta", "Jack Song", @@ -477,6 +484,7 @@ const char * auto_contributors[] { "John Skopis", "Jonatas Freitas", "Jonathan-Ackerman", + "Jordi", "Jordi Villar", "Joris Giovannangeli", "Jose", @@ -489,6 +497,7 @@ const char * auto_contributors[] { "Jus", "Justin Hilliard", "Justin de Guzman", + "János Benjamin Antal", "Kang Liu", "Karl Pietrzak", "Keiji Yoshida", @@ -523,6 +532,7 @@ const char * auto_contributors[] { "Kruglov Pavel", "Krzysztof Góralski", "Kseniia Sumarokova", + "Kuba Kaflik", "Kunal Gurnani", "Kuz Le", "Ky Li", @@ -540,6 +550,7 @@ const char * auto_contributors[] { "Leopold Schabel", "Lev Borodin", "Lewinma", + "Li Shuai", "Li Yin", "Liu Cong", "LiuCong", @@ -549,12 +560,14 @@ const char * auto_contributors[] { "Lopatin Konstantin", "Lorenzo Mangani", "Loud_Scream", + "Lucas Chang", "Lucid Dreams", "Luck-Chang", "Luis Bosque", "Lv Feng", "Léo Ercolanelli", "M0r64n", + "M1eyu2018", "MEX7", "MaceWindu", "MagiaGroz", @@ -564,8 +577,10 @@ const char * auto_contributors[] { "Maksim Fedotov", "Maksim Kita", "Maksym Sobolyev", + "Mal Curtis", "Mallik Hassan", "Malte", + "Manas Alekar", "Manuel de la Peña", "Marat IDRISOV", "Marcelo Rodriguez", @@ -650,7 +665,9 @@ const char * auto_contributors[] { "Milad Arabi", "Mingliang Pan", "Misko Lee", + "Misz606", "Mohamad Fadhil", + "Mohammad Arab Anvari", "Mohammad Hossein Sekhavat", "Mojtaba Yaghoobzadeh", "Mostafa Dahab", @@ -764,6 +781,7 @@ const char * auto_contributors[] { "Rajkumar Varada", "Ramazan Polat", "Rami Dridi", + "Raqbit", "Ravengg", "Raúl Marín", "Realist007", @@ -791,6 +809,7 @@ const char * auto_contributors[] { "Roman Peshkurov", "Roman Tsisyk", "Roman Vasin", + "Roman Vlasenko", "Roman Zhukov", "Roy Bellingan", "Ruslan", @@ -825,6 +844,8 @@ const char * auto_contributors[] { "Sergey Demurin", "Sergey Elantsev", "Sergey Fedorov", + "Sergey Kazmin", + "Sergey Kislov", "Sergey Kononenko", "Sergey Lazarev", "Sergey Magidovich", @@ -840,6 +861,7 @@ const char * auto_contributors[] { "Sergio Tulentsev", "SevaCode", "Seyed Mehrshad Hosseini", + "Shane Andrade", "Sherry Wang", "Shoh Jahon", "Sichen Zhao", @@ -857,6 +879,8 @@ const char * auto_contributors[] { "SmitaRKulkarni", "Snow", "Sofia Antipushina", + "Sorck", + "Stanislav Dobrovolschii", "Stanislav Pavlovichev", "Stas Kelvich", "Stas Pavlovichev", @@ -922,6 +946,8 @@ const char * auto_contributors[] { "Vadym Chekan", "Vage Ogannisian", "Val", + "Val Doroshchuk", + "Valentin Alexeev", "Valera Ryaboshapko", "Varinara", "Vasily Kozhukhovskiy", @@ -935,8 +961,10 @@ const char * auto_contributors[] { "Veselkov Konstantin", "Viachaslau Boben", "Victor", + "Victor Krasnov", "Victor Tarnavsky", "Viktor Taranenko", + "Vincent", "Vincent Bernat", "Vitalii S", "Vitaliy", @@ -1018,12 +1046,14 @@ const char * auto_contributors[] { "Yuriy Korzhenevskiy", "Yury Karpovich", "Yury Stankevich", + "Yusuke Tanaka", "ZhiYong Wang", "Zhichang Yu", "Zhichun Wu", "Zhiguo Zhou", "Zhipeng", "Zijie Lu", + "Ziy1-Tan", "Zoran Pandovski", "[데이터플랫폼팀] 이호선", "a.palagashvili", @@ -1039,6 +1069,7 @@ const char * auto_contributors[] { "akazz", "akonyaev", "akuzm", + "alekar", "alekseik1", "alekseygolub", "alesapin", @@ -1072,6 +1103,7 @@ const char * auto_contributors[] { "asiana21", "atereh", "attack204", + "auxten", "avasiliev", "avogar", "avoiderboi", @@ -1094,6 +1126,7 @@ const char * auto_contributors[] { "caipengxiang", "candiduslynx", "canenoneko", + "cangyin", "caspian", "cekc", "centos7", @@ -1131,6 +1164,7 @@ const char * auto_contributors[] { "damozhaeva", "dankondr", "daoready", + "darkkeks", "dasmfm", "davydovska", "decaseal", @@ -1189,6 +1223,7 @@ const char * auto_contributors[] { "franklee", "fredchenbj", "freedomDR", + "frinkr", "fuqi", "fuwhu", "fuzhe1989", @@ -1236,6 +1271,7 @@ const char * auto_contributors[] { "ikopylov", "imgbot[bot]", "ip", + "ismailakpolat", "it1804", "ivan-klass", "ivan-kush", @@ -1255,6 +1291,7 @@ const char * auto_contributors[] { "jianmei zhang", "jinjunzh", "jkuklis", + "johanngan", "jthmath", "jun won", "jus1096", @@ -1280,6 +1317,7 @@ const char * auto_contributors[] { "kst-morozov", "l", "l1tsolaiki", + "laimuxi", "lalex", "lanfz", "larryluogit", @@ -1298,8 +1336,10 @@ const char * auto_contributors[] { "liang.huang", "liangqian", "libenwang", + "libin", "lichengxiang", "liding1992", + "lihaibo42", "linceyou", "lincion", "lingo-xp", @@ -1341,6 +1381,7 @@ const char * auto_contributors[] { "mastertheknife", "mateng0915", "mateng915", + "mauidude", "maxim", "maxim-babenko", "maxkuzn", @@ -1352,6 +1393,7 @@ const char * auto_contributors[] { "meo", "meoww-bot", "mergify[bot]", + "merlllle", "metahys", "mf5137", "mfridental", @@ -1398,6 +1440,7 @@ const char * auto_contributors[] { "olevino", "olevino999", "olgarev", + "ongkong", "orantius", "p0ny", "palasonicq", @@ -1473,6 +1516,7 @@ const char * auto_contributors[] { "spongedc", "spume", "spyros87", + "sslouis", "stan", "stavrolia", "stepenhu", @@ -1518,6 +1562,7 @@ const char * auto_contributors[] { "vicdashkov", "vicgao", "vinity", + "vitac", "vitstn", "vivarum", "vladimir golovchenko", @@ -1527,6 +1572,7 @@ const char * auto_contributors[] { "vzakaznikov", "wangchao", "wangdh15", + "wangxiaobo", "weeds085490", "whysage", "wineternity", @@ -1537,6 +1583,7 @@ const char * auto_contributors[] { "xieyichen", "xinhuitian", "xlwh", + "xmy", "yakkomajuri", "yakov-olkhovskiy", "yandd", @@ -1589,6 +1636,7 @@ const char * auto_contributors[] { "zvvr", "zxc111", "zxealous", + "zy-kkk", "zzsmdfj", "Šimon Podlipský", "Александр", @@ -1605,6 +1653,7 @@ const char * auto_contributors[] { "Сундуков Алексей", "万康", "何李夫", + "你不要过来啊", "凌涛", "刘浩林", "刘陶峰", From 2d220bd8411607521fb6a2767b568352a2cae127 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Fri, 9 Jun 2023 11:38:50 +0000 Subject: [PATCH 0700/1072] Fix NuRaft --- contrib/NuRaft | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/NuRaft b/contrib/NuRaft index f43d10dbc97..8f267da1a91 160000 --- a/contrib/NuRaft +++ b/contrib/NuRaft @@ -1 +1 @@ -Subproject commit f43d10dbc977a63f11dfb3afdd010fcf7ad89950 +Subproject commit 8f267da1a91310bd152af755b0178cfd38c646c7 From 32d781c058321b3c2ee0b21b17a2cbb87a9a3e23 Mon Sep 17 00:00:00 2001 From: Victor Krasnov Date: Fri, 9 Jun 2023 14:51:26 +0300 Subject: [PATCH 0701/1072] Align the documentation with the new feature --- docs/en/sql-reference/functions/type-conversion-functions.md | 4 +++- docs/ru/sql-reference/functions/type-conversion-functions.md | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index f6c99b168ac..dad3cfb4cc5 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -1343,6 +1343,7 @@ parseDateTimeBestEffort(time_string [, time_zone]) - A string with a date, but no time component: `YYYY`, `YYYYMM`, `YYYY*MM`, `DD/MM/YYYY`, `DD-MM-YY` etc. - A string with a day and time: `DD`, `DD hh`, `DD hh:mm`. In this case `YYYY-MM` are substituted as `2000-01`. - A string that includes the date and time along with time zone offset information: `YYYY-MM-DD hh:mm:ss ±h:mm`, etc. For example, `2020-12-12 17:36:00 -5:00`. +- A string that includes the date and time in a [syslog](https://datatracker.ietf.org/doc/html/rfc3164) datetime format. For example, `Jun 9 14:20:32`. For all of the formats with separator the function parses months names expressed by their full name or by the first three letters of a month name. Examples: `24/DEC/18`, `24-Dec-18`, `01-September-2018`. @@ -1428,10 +1429,11 @@ Result: **See Also** -- [RFC 1123](https://tools.ietf.org/html/rfc1123) +- [RFC 1123](https://datatracker.ietf.org/doc/html/rfc1123) - [toDate](#todate) - [toDateTime](#todatetime) - [ISO 8601 announcement by @xkcd](https://xkcd.com/1179/) +- [RFC 3164](https://datatracker.ietf.org/doc/html/rfc3164) ## parseDateTimeBestEffortUS diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index 298b7bbc93e..03e3adfbdca 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -1022,6 +1022,7 @@ parseDateTimeBestEffort(time_string[, time_zone]) - Строка с датой, но без времени: `YYYY`, `YYYYMM`, `YYYY*MM`, `DD/MM/YYYY`, `DD-MM-YY` и т.д. - Строка с временем, и с днём: `DD`, `DD hh`, `DD hh:mm`. В этом случае `YYYY-MM` принимается равным `2000-01`. - Строка, содержащая дату и время вместе с информацией о часовом поясе: `YYYY-MM-DD hh:mm:ss ±h:mm`, и т.д. Например, `2020-12-12 17:36:00 -5:00`. +- Строка, содержащая дату и время в формате [syslog](https://datatracker.ietf.org/doc/html/rfc3164). Например, `Jun 9 14:20:32`. Для всех форматов с разделителями функция распознаёт названия месяцев, выраженных в виде полного англоязычного имени месяца или в виде первых трёх символов имени месяца. Примеры: `24/DEC/18`, `24-Dec-18`, `01-September-2018`. @@ -1108,9 +1109,10 @@ SELECT parseDateTimeBestEffort('10 20:19'); **Смотрите также** - [Информация о формате ISO 8601 от @xkcd](https://xkcd.com/1179/) -- [RFC 1123](https://tools.ietf.org/html/rfc1123) +- [RFC 1123](https://datatracker.ietf.org/doc/html/rfc1123) - [toDate](#todate) - [toDateTime](#todatetime) +- [RFC 3164](https://datatracker.ietf.org/doc/html/rfc3164) ## parseDateTimeBestEffortUS {#parsedatetimebesteffortUS} From 61a20468f7429866c3125bd7c55627072de7ea5d Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Fri, 9 Jun 2023 11:53:29 +0000 Subject: [PATCH 0702/1072] Update version_date.tsv and changelogs after v23.5.1.3174-stable --- SECURITY.md | 3 +- docker/keeper/Dockerfile | 2 +- docker/server/Dockerfile.alpine | 2 +- docker/server/Dockerfile.ubuntu | 2 +- docs/changelogs/v23.5.1.3174-stable.md | 599 +++++++++++++++++++++++++ utils/list-versions/version_date.tsv | 1 + 6 files changed, 605 insertions(+), 4 deletions(-) create mode 100644 docs/changelogs/v23.5.1.3174-stable.md diff --git a/SECURITY.md b/SECURITY.md index 75c1a9d7d6a..1864eb6e9e5 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -13,9 +13,10 @@ The following versions of ClickHouse server are currently being supported with s | Version | Supported | |:-|:-| +| 23.5 | ✔️ | | 23.4 | ✔️ | | 23.3 | ✔️ | -| 23.2 | ✔️ | +| 23.2 | ❌ | | 23.1 | ❌ | | 22.12 | ❌ | | 22.11 | ❌ | diff --git a/docker/keeper/Dockerfile b/docker/keeper/Dockerfile index 73da4515ff4..7190ef4d649 100644 --- a/docker/keeper/Dockerfile +++ b/docker/keeper/Dockerfile @@ -32,7 +32,7 @@ RUN arch=${TARGETARCH:-amd64} \ esac ARG REPOSITORY="https://s3.amazonaws.com/clickhouse-builds/22.4/31c367d3cd3aefd316778601ff6565119fe36682/package_release" -ARG VERSION="23.4.2.11" +ARG VERSION="23.5.1.3174" ARG PACKAGES="clickhouse-keeper" # user/group precreated explicitly with fixed uid/gid on purpose. diff --git a/docker/server/Dockerfile.alpine b/docker/server/Dockerfile.alpine index 1a5d2071f6b..ca966b16a2d 100644 --- a/docker/server/Dockerfile.alpine +++ b/docker/server/Dockerfile.alpine @@ -33,7 +33,7 @@ RUN arch=${TARGETARCH:-amd64} \ # lts / testing / prestable / etc ARG REPO_CHANNEL="stable" ARG REPOSITORY="https://packages.clickhouse.com/tgz/${REPO_CHANNEL}" -ARG VERSION="23.4.2.11" +ARG VERSION="23.5.1.3174" ARG PACKAGES="clickhouse-client clickhouse-server clickhouse-common-static" # user/group precreated explicitly with fixed uid/gid on purpose. diff --git a/docker/server/Dockerfile.ubuntu b/docker/server/Dockerfile.ubuntu index 8792d419a16..c82ac592120 100644 --- a/docker/server/Dockerfile.ubuntu +++ b/docker/server/Dockerfile.ubuntu @@ -22,7 +22,7 @@ RUN sed -i "s|http://archive.ubuntu.com|${apt_archive}|g" /etc/apt/sources.list ARG REPO_CHANNEL="stable" ARG REPOSITORY="deb https://packages.clickhouse.com/deb ${REPO_CHANNEL} main" -ARG VERSION="23.4.2.11" +ARG VERSION="23.5.1.3174" ARG PACKAGES="clickhouse-client clickhouse-server clickhouse-common-static" # set non-empty deb_location_url url to create a docker image diff --git a/docs/changelogs/v23.5.1.3174-stable.md b/docs/changelogs/v23.5.1.3174-stable.md new file mode 100644 index 00000000000..01e5425de71 --- /dev/null +++ b/docs/changelogs/v23.5.1.3174-stable.md @@ -0,0 +1,599 @@ +--- +sidebar_position: 1 +sidebar_label: 2023 +--- + +# 2023 Changelog + +### ClickHouse release v23.5.1.3174-stable (2fec796e73e) FIXME as compared to v23.4.1.1943-stable (3920eb987f7) + +#### Backward Incompatible Change +* Make local object storage work consistently with s3 object storage, fix problem with append (closes [#48465](https://github.com/ClickHouse/ClickHouse/issues/48465)), make it configurable as independent storage. The change is backward incompatible because cache on top of local object storage is not incompatible to previous versions. [#48791](https://github.com/ClickHouse/ClickHouse/pull/48791) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Date_trunc function to always return datetime type. [#48851](https://github.com/ClickHouse/ClickHouse/pull/48851) ([Shane Andrade](https://github.com/mauidude)). +* Remove the experimental feature "in-memory data parts". The data format is still supported, but the settings are no-op, and compact or wide parts will be used instead. This closes [#45409](https://github.com/ClickHouse/ClickHouse/issues/45409). [#49429](https://github.com/ClickHouse/ClickHouse/pull/49429) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Changed default values of settings parallelize_output_from_storages and input_format_parquet_preserve_order. This allows ClickHouse to reorder rows when reading from files (e.g. CSV or Parquet), greatly improving performance in many cases. To restore the old behavior of preserving order, use `parallelize_output_from_storages = 0`, `input_format_parquet_preserve_order = 1`. [#49479](https://github.com/ClickHouse/ClickHouse/pull/49479) ([Michael Kolupaev](https://github.com/al13n321)). +* Make projections production-ready. Add the `optimize_use_projections` setting to control whether the projections will be selected for SELECT queries. The setting `allow_experimental_projection_optimization` is obsolete and does nothing. [#49719](https://github.com/ClickHouse/ClickHouse/pull/49719) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Mark joinGet() as non deterministic (so as dictGet). [#49843](https://github.com/ClickHouse/ClickHouse/pull/49843) ([Azat Khuzhin](https://github.com/azat)). +* Revert "`groupArray` returns cannot be nullable" (due to binary compatibility breakage for `groupArray`/`groupArrayLast`/`groupArraySample` over `Nullable` types, which likely will lead to `TOO_LARGE_ARRAY_SIZE` or `CANNOT_READ_ALL_DATA`). [#49971](https://github.com/ClickHouse/ClickHouse/pull/49971) ([Azat Khuzhin](https://github.com/azat)). + +#### New Feature +* Password type in queries like `CREATE USER u IDENTIFIED BY 'p'` will be automatically set according to the setting `default_password_type` in the `config.xml` on the server. Closes [#42915](https://github.com/ClickHouse/ClickHouse/issues/42915). [#44674](https://github.com/ClickHouse/ClickHouse/pull/44674) ([Nikolay Degterinsky](https://github.com/evillique)). +* Add bcrypt password authentication type. Closes [#34599](https://github.com/ClickHouse/ClickHouse/issues/34599). [#44905](https://github.com/ClickHouse/ClickHouse/pull/44905) ([Nikolay Degterinsky](https://github.com/evillique)). +* Added `system.zookeeper_connection` table that shows information about ZooKeeper connections. [#45245](https://github.com/ClickHouse/ClickHouse/pull/45245) ([mateng915](https://github.com/mateng0915)). +* Add urlCluster table function. Refactor all *Cluster table functions to reduce code duplication. Make schema inference work for all possible *Cluster function signatures and for named collections. Closes [#38499](https://github.com/ClickHouse/ClickHouse/issues/38499). [#45427](https://github.com/ClickHouse/ClickHouse/pull/45427) ([attack204](https://github.com/attack204)). +* Extend `first_value` and `last_value` to accept null. [#46467](https://github.com/ClickHouse/ClickHouse/pull/46467) ([lgbo](https://github.com/lgbo-ustc)). +* Add server and format settings `display_secrets_in_show_and_select` for displaying secrets of tables, databases, table functions, and dictionaries. Add privilege `displaySecretsInShowAndSelect` controlling which users can view secrets. [#46528](https://github.com/ClickHouse/ClickHouse/pull/46528) ([Mike Kot](https://github.com/myrrc)). +* Add new function `generateRandomStructure` that generates random table structure. It can be used in combination with table function `generateRandom`. [#47409](https://github.com/ClickHouse/ClickHouse/pull/47409) ([Kruglov Pavel](https://github.com/Avogar)). +* Added native ClickHouse Keeper CLI Client. [#47414](https://github.com/ClickHouse/ClickHouse/pull/47414) ([pufit](https://github.com/pufit)). +* The query cache can now be used for production workloads. [#47977](https://github.com/ClickHouse/ClickHouse/pull/47977) ([Robert Schulze](https://github.com/rschu1ze)). +* Fix a bug that prevented the use of `CASE` without an `ELSE` branch and extended `transform` to deal with more types. Also fix some bugs that made transform() return incorrect results when decimal types were mixed with other numeric types. [#48300](https://github.com/ClickHouse/ClickHouse/pull/48300) ([Salvatore Mesoraca](https://github.com/aiven-sal)). +* Added [server-side encryption using KMS keys](https://docs.aws.amazon.com/AmazonS3/latest/userguide/UsingKMSEncryption.html) with S3 tables, and the `header` setting with S3 disks. Closes [#48723](https://github.com/ClickHouse/ClickHouse/issues/48723). [#48724](https://github.com/ClickHouse/ClickHouse/pull/48724) ([Johann Gan](https://github.com/johanngan)). +* Add MemoryTracker for the background tasks (merges and mutation). Introduces `merges_mutations_memory_usage_soft_limit` and `merges_mutations_memory_usage_to_ram_ratio` settings that represent the soft memory limit for merges and mutations. If this limit is reached ClickHouse won't schedule new merge or mutation tasks. Also `MergesMutationsMemoryTracking` metric is introduced to allow observing current memory usage of background tasks. Resubmit [#46089](https://github.com/ClickHouse/ClickHouse/issues/46089). Closes [#48774](https://github.com/ClickHouse/ClickHouse/issues/48774). [#48787](https://github.com/ClickHouse/ClickHouse/pull/48787) ([Dmitry Novik](https://github.com/novikd)). +* Function `dotProduct` work for array. [#49050](https://github.com/ClickHouse/ClickHouse/pull/49050) ([FFFFFFFHHHHHHH](https://github.com/FFFFFFFHHHHHHH)). +* Support statement `SHOW INDEX` to improve compatibility with MySQL. [#49158](https://github.com/ClickHouse/ClickHouse/pull/49158) ([Robert Schulze](https://github.com/rschu1ze)). +* Add virtual column `_file` and `_path` support to table function `url`. - Impove error message for table function `url`. - resolves [#49231](https://github.com/ClickHouse/ClickHouse/issues/49231) - resolves [#49232](https://github.com/ClickHouse/ClickHouse/issues/49232). [#49356](https://github.com/ClickHouse/ClickHouse/pull/49356) ([Ziyi Tan](https://github.com/Ziy1-Tan)). +* Adding the `grants` field in the users.xml file, which allows specifying grants for users. [#49381](https://github.com/ClickHouse/ClickHouse/pull/49381) ([pufit](https://github.com/pufit)). +* Add alias `str_to_map` and `mapfromstring` for `extractkeyvaluepairs`. closes [#47185](https://github.com/ClickHouse/ClickHouse/issues/47185). [#49466](https://github.com/ClickHouse/ClickHouse/pull/49466) ([flynn](https://github.com/ucasfl)). +* Support full/right join by using grace hash join algorithm. [#49483](https://github.com/ClickHouse/ClickHouse/pull/49483) ([lgbo](https://github.com/lgbo-ustc)). +* `WITH FILL` modifier groups filling by sorting prefix. Controlled by `use_with_fill_by_sorting_prefix` setting (enabled by default). Related to [#33203](https://github.com/ClickHouse/ClickHouse/issues/33203)#issuecomment-1418736794. [#49503](https://github.com/ClickHouse/ClickHouse/pull/49503) ([Igor Nikonov](https://github.com/devcrafter)). +* Add SQL functions for entropy-learned hashing. [#49656](https://github.com/ClickHouse/ClickHouse/pull/49656) ([Robert Schulze](https://github.com/rschu1ze)). +* Clickhouse-client now accepts queries after "--multiquery" when "--query" (or "-q") is absent. example: clickhouse-client --multiquery "select 1; select 2;". [#49870](https://github.com/ClickHouse/ClickHouse/pull/49870) ([Alexey Gerasimchuck](https://github.com/Demilivor)). +* Add separate `handshake_timeout` for receiving Hello packet from replica. Closes [#48854](https://github.com/ClickHouse/ClickHouse/issues/48854). [#49948](https://github.com/ClickHouse/ClickHouse/pull/49948) ([Kruglov Pavel](https://github.com/Avogar)). +* New setting s3_max_inflight_parts_for_one_file sets the limit of concurrently loaded parts with multipart upload request in scope of one file. [#49961](https://github.com/ClickHouse/ClickHouse/pull/49961) ([Sema Checherinda](https://github.com/CheSema)). +* Geographical data types (`Point`, `Ring`, `Polygon`, and `MultiPolygon`) are production-ready. [#50022](https://github.com/ClickHouse/ClickHouse/pull/50022) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Added a function "space()" which repeats a space as many times as specified. [#50103](https://github.com/ClickHouse/ClickHouse/pull/50103) ([Robert Schulze](https://github.com/rschu1ze)). +* Added --input_format_csv_trim_whitespaces option. [#50215](https://github.com/ClickHouse/ClickHouse/pull/50215) ([Alexey Gerasimchuck](https://github.com/Demilivor)). +* Added the dictGetAll function for regexp tree dictionaries to return values from multiple matches as arrays. Closes [#50254](https://github.com/ClickHouse/ClickHouse/issues/50254). [#50255](https://github.com/ClickHouse/ClickHouse/pull/50255) ([Johann Gan](https://github.com/johanngan)). +* Added toLastDayOfWeek() function to round a date or a date with time up to the nearest Saturday or Sunday. [#50315](https://github.com/ClickHouse/ClickHouse/pull/50315) ([Victor Krasnov](https://github.com/sirvickr)). +* Ability to ignore a skip index by specifying `ignore_data_skipping_indices`. [#50329](https://github.com/ClickHouse/ClickHouse/pull/50329) ([Boris Kuschel](https://github.com/bkuschel)). +* Revert 'Add SQL functions for entropy-learned hashing'. [#50416](https://github.com/ClickHouse/ClickHouse/pull/50416) ([Robert Schulze](https://github.com/rschu1ze)). +* Add `system.user_processes` table and `SHOW USER PROCESSES` query to show memory info and ProfileEvents on user level. [#50492](https://github.com/ClickHouse/ClickHouse/pull/50492) ([János Benjamin Antal](https://github.com/antaljanosbenjamin)). +* Added storage engine `AzureBlobStorage` and `azure_blob_storage` table function. The supported set of features is very similar to storage/table function `S3`. Implements [#19307](https://github.com/ClickHouse/ClickHouse/issues/19307). [#50604](https://github.com/ClickHouse/ClickHouse/pull/50604) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). + +#### Performance Improvement +* Compress marks and primary key by default. It significantly reduces the cold query time. Upgrade notes: the support for compressed marks and primary key has been added in version 22.9. If you turned on compressed marks or primary key or installed version 23.5 or newer, which has compressed marks or primary key on by default, you will not be able to downgrade to version 22.8 or earlier. You can also explicitly disable compressed marks or primary keys by specifying the `compress_marks` and `compress_primary_key` settings in the `` section of the server configuration file. **Upgrade notes:** If you upgrade from versions prior to 22.9, you should either upgrade all replicas at once or disable the compression before upgrade, or upgrade through an intermediate version, where the compressed marks are supported but not enabled by default, such as 23.3. [#42587](https://github.com/ClickHouse/ClickHouse/pull/42587) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* When reading from multiple files reduce parallel parsing threads for each file resolves [#42192](https://github.com/ClickHouse/ClickHouse/issues/42192). [#46661](https://github.com/ClickHouse/ClickHouse/pull/46661) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). +* Do not store blocks in `ANY` hash join if nothing is inserted. [#48633](https://github.com/ClickHouse/ClickHouse/pull/48633) ([vdimir](https://github.com/vdimir)). +* Fixes aggregate combinator `-If` when JIT compiled. Closes [#48120](https://github.com/ClickHouse/ClickHouse/issues/48120). [#49083](https://github.com/ClickHouse/ClickHouse/pull/49083) ([Igor Nikonov](https://github.com/devcrafter)). +* For reading from remote tables we use smaller tasks (instead of reading the whole part) to make tasks stealing work * task size is determined by size of columns to read * always use 1mb buffers for reading from s3 * boundaries of cache segments aligned to 1mb so they have decent size even with small tasks. it also should prevent fragmentation. [#49287](https://github.com/ClickHouse/ClickHouse/pull/49287) ([Nikita Taranov](https://github.com/nickitat)). +* Default size of a read buffer for reading from local filesystem changed to a slightly better value. Also two new settings are introduced: `max_read_buffer_size_local_fs` and `max_read_buffer_size_remote_fs`. [#49321](https://github.com/ClickHouse/ClickHouse/pull/49321) ([Nikita Taranov](https://github.com/nickitat)). +* Improve memory usage and speed of `SPARSE_HASHED`/`HASHED` dictionaries (e.g. `SPARSE_HASHED` now eats 2.6x less memory, and is ~2x faster). [#49380](https://github.com/ClickHouse/ClickHouse/pull/49380) ([Azat Khuzhin](https://github.com/azat)). +* Use aggregate projection only if it reads fewer granules than normal reading. It should help in case if query hits the PK of the table, but not the projection. Fixes [#49150](https://github.com/ClickHouse/ClickHouse/issues/49150). [#49417](https://github.com/ClickHouse/ClickHouse/pull/49417) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Optimize PODArray::resize_fill() callers. [#49459](https://github.com/ClickHouse/ClickHouse/pull/49459) ([Azat Khuzhin](https://github.com/azat)). +* Optimize the system.query_log and system.query_thread_log tables by applying LowCardinality when appropriate. The queries over these tables will be faster. [#49530](https://github.com/ClickHouse/ClickHouse/pull/49530) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Better performance when reading local Parquet files (through parallel reading). [#49539](https://github.com/ClickHouse/ClickHouse/pull/49539) ([Michael Kolupaev](https://github.com/al13n321)). +* Improve the performance of `RIGHT/FULL JOIN` by up to 2 times in certain scenarios, especially when joining a small left table with a large right table. [#49585](https://github.com/ClickHouse/ClickHouse/pull/49585) ([lgbo](https://github.com/lgbo-ustc)). +* Improve performance of BLAKE3 by 11% by enabling LTO for Rust. [#49600](https://github.com/ClickHouse/ClickHouse/pull/49600) ([Azat Khuzhin](https://github.com/azat)). +* Optimize the structure of the `system.opentelemetry_span_log`. Use `LowCardinality` where appropriate. Although this table is generally stupid (it is using the Map data type even for common attributes), it will be slightly better. [#49647](https://github.com/ClickHouse/ClickHouse/pull/49647) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Try to reserve hash table's size in `grace_hash` join. [#49816](https://github.com/ClickHouse/ClickHouse/pull/49816) ([lgbo](https://github.com/lgbo-ustc)). +* As is addresed in issue [#49748](https://github.com/ClickHouse/ClickHouse/issues/49748), the predicates with date converters, such as **toYear, toYYYYMM**, could be rewritten with the equivalent date (YYYY-MM-DD) comparisons at the AST level. And this transformation could bring performance improvement as it is free from the expensive date converter and the comparison between dates (or integers in the low level representation) is quite low-cost. The [prototype](https://github.com/ZhiguoZh/ClickHouse/commit/c7f1753f0c9363a19d95fa46f1cfed1d9f505ee0) shows that, with all identified date converters optimized, the overall QPS of the 13 queries is enhanced by **~11%** on the ICX server (Intel Xeon Platinum 8380 CPU, 80 cores, 160 threads). [#50062](https://github.com/ClickHouse/ClickHouse/pull/50062) ([Zhiguo Zhou](https://github.com/ZhiguoZh)). +* Parallel merge of `uniqExactIf` states. Closes [#49885](https://github.com/ClickHouse/ClickHouse/issues/49885). [#50285](https://github.com/ClickHouse/ClickHouse/pull/50285) ([flynn](https://github.com/ucasfl)). +* As is addresed in issue [#49748](https://github.com/ClickHouse/ClickHouse/issues/49748), the predicates with date converters, such as toYear, toYYYYMM, could be rewritten with the equivalent date (YYYY-MM-DD) comparisons at the AST level. And this transformation could bring performance improvement as it is free from the expensive date converter and the comparison between dates (or integers in the low level representation) is quite low-cost. [#50307](https://github.com/ClickHouse/ClickHouse/pull/50307) ([Zhiguo Zhou](https://github.com/ZhiguoZh)). +* Parallel merging supported for `uniqExact` with modifiers `-Array`, `-Merge`, `-OrNull`, `-State`. [#50413](https://github.com/ClickHouse/ClickHouse/pull/50413) ([flynn](https://github.com/ucasfl)). +* Enable LZ4_FAST_DEC_LOOP for Arm LZ4 to get 5% of decompression speed. [#50588](https://github.com/ClickHouse/ClickHouse/pull/50588) ([Daniel Kutenin](https://github.com/danlark1)). + +#### Improvement +* Add support for CGroup version 2 for asynchronous metrics about the memory usage and availability. This closes [#37983](https://github.com/ClickHouse/ClickHouse/issues/37983). [#45999](https://github.com/ClickHouse/ClickHouse/pull/45999) ([sichenzhao](https://github.com/sichenzhao)). +* Cluster table functions should always skip unavailable shards. close [#46314](https://github.com/ClickHouse/ClickHouse/issues/46314). [#46765](https://github.com/ClickHouse/ClickHouse/pull/46765) ([zk_kiger](https://github.com/zk-kiger)). +* When your csv file contains empty columns, like: ```. [#47496](https://github.com/ClickHouse/ClickHouse/pull/47496) ([你不要过来啊](https://github.com/iiiuwioajdks)). +* ROW POLICY for all tables that belong to a DATABASE. [#47640](https://github.com/ClickHouse/ClickHouse/pull/47640) ([Ilya Golshtein](https://github.com/ilejn)). +* Add Google Cloud Storage S3 compatible table function `gcs`. Like the `oss` and `cosn` functions, it is just an alias over the `s3` table function, and it does not bring any new features. [#47815](https://github.com/ClickHouse/ClickHouse/pull/47815) ([Kuba Kaflik](https://github.com/jkaflik)). +* Add ability to use strict parts size for S3 (compatibility with CloudFlare R2 S3 Storage). [#48492](https://github.com/ClickHouse/ClickHouse/pull/48492) ([Azat Khuzhin](https://github.com/azat)). +* Added new columns with info about `Replicated` database replicas to `system.clusters`: `database_shard_name`, `database_replica_name`, `is_active`. Added an optional `FROM SHARD` clause to `SYSTEM DROP DATABASE REPLICA` query. [#48548](https://github.com/ClickHouse/ClickHouse/pull/48548) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Add a new column `zookeeper_name` in system.replicas, to indicate on which (auxiliary) zookeeper cluster the replicated table's metadata is stored. [#48549](https://github.com/ClickHouse/ClickHouse/pull/48549) ([cangyin](https://github.com/cangyin)). +* `IN` operator support compare `Date` and `Date32`. Closes [#48736](https://github.com/ClickHouse/ClickHouse/issues/48736). [#48806](https://github.com/ClickHouse/ClickHouse/pull/48806) ([flynn](https://github.com/ucasfl)). +* Support for erasure codes in HDFS, author: @M1eyu2018, @tomscut. [#48833](https://github.com/ClickHouse/ClickHouse/pull/48833) ([M1eyu](https://github.com/M1eyu2018)). +* The query cache can now supports queries with totals and extremes modifier. [#48853](https://github.com/ClickHouse/ClickHouse/pull/48853) ([Robert Schulze](https://github.com/rschu1ze)). +* Introduces new keyword `INTO OUTFILE 'file.txt' APPEND`. [#48880](https://github.com/ClickHouse/ClickHouse/pull/48880) ([alekar](https://github.com/alekar)). +* The `BACKUP` command will not decrypt data from encrypted disks while making a backup. Instead the data will be stored in a backup in encrypted form. Such backups can be restored only to an encrypted disk with the same (or extended) list of encryption keys. [#48896](https://github.com/ClickHouse/ClickHouse/pull/48896) ([Vitaly Baranov](https://github.com/vitlibar)). +* Keeper improvement: add `CheckNotExists` request to Keeper. [#48897](https://github.com/ClickHouse/ClickHouse/pull/48897) ([Antonio Andelic](https://github.com/antonio2368)). +* Implement SYSTEM DROP REPLICA from auxillary ZooKeeper clusters, may be close [#48931](https://github.com/ClickHouse/ClickHouse/issues/48931). [#48932](https://github.com/ClickHouse/ClickHouse/pull/48932) ([wangxiaobo](https://github.com/wzb5212)). +* Add Array data type to MongoDB. Closes [#48598](https://github.com/ClickHouse/ClickHouse/issues/48598). [#48983](https://github.com/ClickHouse/ClickHouse/pull/48983) ([Nikolay Degterinsky](https://github.com/evillique)). +* Keeper performance improvements: avoid serializing same request twice while processing. Cache deserialization results of large requests. Controlled by new coordination setting `min_request_size_for_cache`. [#49004](https://github.com/ClickHouse/ClickHouse/pull/49004) ([Antonio Andelic](https://github.com/antonio2368)). +* Support storing `Interval` data types in tables. [#49085](https://github.com/ClickHouse/ClickHouse/pull/49085) ([larryluogit](https://github.com/larryluogit)). +* Add support for size suffixes in quota creation statement parameters. [#49087](https://github.com/ClickHouse/ClickHouse/pull/49087) ([Eridanus](https://github.com/Eridanus117)). +* Allow using `ntile` window function without explicit window frame definition: `ntile(3) OVER (ORDER BY a)`, close [#46763](https://github.com/ClickHouse/ClickHouse/issues/46763). [#49093](https://github.com/ClickHouse/ClickHouse/pull/49093) ([vdimir](https://github.com/vdimir)). +* Added settings (`number_of_mutations_to_delay`, `number_of_mutations_to_throw`) to delay or throw `ALTER` queries that create mutations (`ALTER UPDATE`, `ALTER DELETE`, `ALTER MODIFY COLUMN`, ...) in case when table already has a lot of unfinished mutations. [#49117](https://github.com/ClickHouse/ClickHouse/pull/49117) ([Anton Popov](https://github.com/CurtizJ)). +* Added setting `async_insert` for `MergeTables`. It has the same meaning as query-level setting `async_insert` and enables asynchronous inserts for specific table. Note: it doesn't take effect for insert queries from `clickhouse-client`, use query-level setting in that case. [#49122](https://github.com/ClickHouse/ClickHouse/pull/49122) ([Anton Popov](https://github.com/CurtizJ)). +* Catch exception from `create_directories` in filesystem cache. [#49203](https://github.com/ClickHouse/ClickHouse/pull/49203) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Copies embedded examples to a new field `example` in `system.functions` to supplement the field `description`. [#49222](https://github.com/ClickHouse/ClickHouse/pull/49222) ([Dan Roscigno](https://github.com/DanRoscigno)). +* Enable connection options for the MongoDB dictionary. Example: ``` xml localhost 27017 test dictionary_source ssl=true ``` ### Documentation entry for user-facing changes. [#49225](https://github.com/ClickHouse/ClickHouse/pull/49225) ([MikhailBurdukov](https://github.com/MikhailBurdukov)). +* Added an alias `asymptotic` for `asymp` computational method for `kolmogorovSmirnovTest`. Improved documentation. [#49286](https://github.com/ClickHouse/ClickHouse/pull/49286) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). +* Aggregation function groupBitAnd/Or/Xor now work on signed integer data. This makes them consistent with the behavior of scalar functions bitAnd/Or/Xor. [#49292](https://github.com/ClickHouse/ClickHouse/pull/49292) ([exmy](https://github.com/exmy)). +* Split function-documentation into more fine-granular fields. [#49300](https://github.com/ClickHouse/ClickHouse/pull/49300) ([Robert Schulze](https://github.com/rschu1ze)). +* Introduced settings: - `merge_max_block_size_bytes` to limit the amount of memory used for background operations. - `vertical_merge_algorithm_min_bytes_to_activate` to add another condition to activate vertical merges. [#49313](https://github.com/ClickHouse/ClickHouse/pull/49313) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). +* Use multiple threads shared between all tables within a server to load outdated data parts. The the size of the pool and its queue is controlled by `max_outdated_parts_loading_thread_pool_size` and `outdated_part_loading_thread_pool_queue_size` settings. [#49317](https://github.com/ClickHouse/ClickHouse/pull/49317) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). +* Don't overestimate the size of processed data for `LowCardinality` columns when they share dictionaries between blocks. This closes [#49322](https://github.com/ClickHouse/ClickHouse/issues/49322). See also [#48745](https://github.com/ClickHouse/ClickHouse/issues/48745). [#49323](https://github.com/ClickHouse/ClickHouse/pull/49323) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Parquet writer now uses reasonable row group size when invoked through OUTFILE. [#49325](https://github.com/ClickHouse/ClickHouse/pull/49325) ([Michael Kolupaev](https://github.com/al13n321)). +* Allow restricted keywords like `ARRAY` as an alias if the alias is quoted. Closes [#49324](https://github.com/ClickHouse/ClickHouse/issues/49324). [#49360](https://github.com/ClickHouse/ClickHouse/pull/49360) ([Nikolay Degterinsky](https://github.com/evillique)). +* Added possibility to use temporary tables in FROM part of ATTACH PARTITION FROM and REPLACE PARTITION FROM. [#49436](https://github.com/ClickHouse/ClickHouse/pull/49436) ([Roman Vasin](https://github.com/rvasin)). +* Data parts loading and deletion jobs were moved to shared server-wide pools instead of per-table pools. Pools sizes are controlled via settings `max_active_parts_loading_thread_pool_size`, `max_outdated_parts_loading_thread_pool_size` and `max_parts_cleaning_thread_pool_size` in top-level config. Table-level settings `max_part_loading_threads` and `max_part_removal_threads` became obsolete. [#49474](https://github.com/ClickHouse/ClickHouse/pull/49474) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). +* Allow `?password=pass` in URL. Password is replaced in browser history. [#49505](https://github.com/ClickHouse/ClickHouse/pull/49505) ([Mike Kot](https://github.com/myrrc)). +* Allow zero objects in ReadBufferFromRemoteFSGather (because empty files are not backuped, so we might end up with zero blobs in metadata file). Closes [#49480](https://github.com/ClickHouse/ClickHouse/issues/49480). [#49519](https://github.com/ClickHouse/ClickHouse/pull/49519) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Attach thread MemoryTracker to `total_memory_tracker` after `ThreadGroup` detached. [#49527](https://github.com/ClickHouse/ClickHouse/pull/49527) ([Dmitry Novik](https://github.com/novikd)). +* Make `Pretty` formats prettier: squash blocks if not much time passed since the output of the previous block. This is controlled by a new setting `output_format_pretty_squash_ms` (100ms by default). This closes [#49153](https://github.com/ClickHouse/ClickHouse/issues/49153). [#49537](https://github.com/ClickHouse/ClickHouse/pull/49537) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Add initial support to do JOINs with pure parallel replicas. [#49544](https://github.com/ClickHouse/ClickHouse/pull/49544) ([Raúl Marín](https://github.com/Algunenano)). +* Fix parameterized views when query parameter used multiple times in the query. [#49556](https://github.com/ClickHouse/ClickHouse/pull/49556) ([Azat Khuzhin](https://github.com/azat)). +* Release memory allocated for the last sent ProfileEvents snapshot in the context of a query. Followup [#47564](https://github.com/ClickHouse/ClickHouse/issues/47564). [#49561](https://github.com/ClickHouse/ClickHouse/pull/49561) ([Dmitry Novik](https://github.com/novikd)). +* Function "makeDate" now provides a MySQL-compatible overload (year & day of the year argument). [#49603](https://github.com/ClickHouse/ClickHouse/pull/49603) ([Robert Schulze](https://github.com/rschu1ze)). +* More parallelism on `Outdated` parts removal with "zero-copy replication". [#49630](https://github.com/ClickHouse/ClickHouse/pull/49630) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Reduced number of `List` ZooKeeper requests when selecting parts to merge and a lot of partitions do not have anything to merge. [#49637](https://github.com/ClickHouse/ClickHouse/pull/49637) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Support `dictionary` table function for `RegExpTreeDictionary`. [#49666](https://github.com/ClickHouse/ClickHouse/pull/49666) ([Han Fei](https://github.com/hanfei1991)). +* Added weighted fair IO scheduling policy. Added dynamic resource manager, which allows IO scheduling hierarchy to be updated in runtime w/o server restarts. [#49671](https://github.com/ClickHouse/ClickHouse/pull/49671) ([Sergei Trifonov](https://github.com/serxa)). +* Add compose request after multipart upload to GCS. This enables the usage of copy operation on objects uploaded with the multipart upload. It's recommended to set `s3_strict_upload_part_size` to some value because compose request can fail on objects created with parts of different sizes. [#49693](https://github.com/ClickHouse/ClickHouse/pull/49693) ([Antonio Andelic](https://github.com/antonio2368)). +* Improve the "best-effort" parsing logic to accept `key_value_delimiter` as a valid part of the value. This also simplifies branching and might even speed up things a bit. [#49760](https://github.com/ClickHouse/ClickHouse/pull/49760) ([Arthur Passos](https://github.com/arthurpassos)). +* Facilitate profile data association and aggregation for the same query. [#49777](https://github.com/ClickHouse/ClickHouse/pull/49777) ([helifu](https://github.com/helifu)). +* System log tables can now have custom sorting keys. [#49778](https://github.com/ClickHouse/ClickHouse/pull/49778) ([helifu](https://github.com/helifu)). +* A new field 'partitions' is used to indicate which partitions are participating in the calculation. [#49779](https://github.com/ClickHouse/ClickHouse/pull/49779) ([helifu](https://github.com/helifu)). +* Added `enable_the_endpoint_id_with_zookeeper_name_prefix` setting for `ReplicatedMergeTree` (disabled by default). When enabled, it adds ZooKeeper cluster name to table's interserver communication endpoint. It avoids `Duplicate interserver IO endpoint` errors when having replicated tables with the same path, but different auxiliary ZooKeepers. [#49780](https://github.com/ClickHouse/ClickHouse/pull/49780) ([helifu](https://github.com/helifu)). +* Add query parameters to clickhouse-local. Closes [#46561](https://github.com/ClickHouse/ClickHouse/issues/46561). [#49785](https://github.com/ClickHouse/ClickHouse/pull/49785) ([Nikolay Degterinsky](https://github.com/evillique)). +* Qpl_deflate codec lower the minimum simd version to sse 4.2. [doc change in qpl](https://github.com/intel/qpl/commit/3f8f5cea27739f5261e8fd577dc233ffe88bf679) - intel® qpl relies on a run-time kernels dispatcher and cpuid check to choose the best available implementation(sse/avx2/avx512) - restructured cmakefile for qpl build in clickhouse to align with latest upstream qpl. [#49811](https://github.com/ClickHouse/ClickHouse/pull/49811) ([jasperzhu](https://github.com/jinjunzh)). +* Allow loading dictionaries and functions from YAML by default. In previous versions, it required editing the `dictionaries_config` or `user_defined_executable_functions_config` in the configuration file, as they expected `*.xml` files. [#49812](https://github.com/ClickHouse/ClickHouse/pull/49812) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* The Kafka table engine now allows to use alias columns. [#49824](https://github.com/ClickHouse/ClickHouse/pull/49824) ([Aleksandr Musorin](https://github.com/AVMusorin)). +* Add setting to limit the max number of pairs produced by extractKeyValuePairs, safeguard to avoid using way too much memory. [#49836](https://github.com/ClickHouse/ClickHouse/pull/49836) ([Arthur Passos](https://github.com/arthurpassos)). +* Add support for (an unusual) case where the arguments in the `IN` operator are single-element tuples. [#49844](https://github.com/ClickHouse/ClickHouse/pull/49844) ([MikhailBurdukov](https://github.com/MikhailBurdukov)). +* `bitHammingDistance` function support `String` and `FixedString` data type. Closes [#48827](https://github.com/ClickHouse/ClickHouse/issues/48827). [#49858](https://github.com/ClickHouse/ClickHouse/pull/49858) ([flynn](https://github.com/ucasfl)). +* Fix timeout resetting errors in the client on OS X. [#49863](https://github.com/ClickHouse/ClickHouse/pull/49863) ([alekar](https://github.com/alekar)). +* Add support for big integers, such as UInt128, Int128, UInt256, and Int256 in the function `bitCount`. This enables Hamming distance over large bit masks for AI applications. [#49867](https://github.com/ClickHouse/ClickHouse/pull/49867) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* This PR makes fingerprints to be used instead of key IDs in encrypted disks. [#49882](https://github.com/ClickHouse/ClickHouse/pull/49882) ([Vitaly Baranov](https://github.com/vitlibar)). +* Add UUID data type to PostgreSQL. Closes [#49739](https://github.com/ClickHouse/ClickHouse/issues/49739). [#49894](https://github.com/ClickHouse/ClickHouse/pull/49894) ([Nikolay Degterinsky](https://github.com/evillique)). +* Make `allow_experimental_query_cache` setting as obsolete for backward-compatibility. It was removed in https://github.com/ClickHouse/ClickHouse/pull/47977. [#49934](https://github.com/ClickHouse/ClickHouse/pull/49934) ([Timur Solodovnikov](https://github.com/tsolodov)). +* Function toUnixTimestamp() now accepts Date and Date32 arguments. [#49989](https://github.com/ClickHouse/ClickHouse/pull/49989) ([Victor Krasnov](https://github.com/sirvickr)). +* Charge only server memory for dictionaries. [#49995](https://github.com/ClickHouse/ClickHouse/pull/49995) ([Azat Khuzhin](https://github.com/azat)). +* Add schema inference to PostgreSQL, MySQL, MeiliSearch, and SQLite table engines. Closes [#49972](https://github.com/ClickHouse/ClickHouse/issues/49972). [#50000](https://github.com/ClickHouse/ClickHouse/pull/50000) ([Nikolay Degterinsky](https://github.com/evillique)). +* The server will allow using the `SQL_*` settings such as `SQL_AUTO_IS_NULL` as no-ops for MySQL compatibility. This closes [#49927](https://github.com/ClickHouse/ClickHouse/issues/49927). [#50013](https://github.com/ClickHouse/ClickHouse/pull/50013) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Preserve initial_query_id for ON CLUSTER queries, which is useful for introspection (under `distributed_ddl_entry_format_version=5`). [#50015](https://github.com/ClickHouse/ClickHouse/pull/50015) ([Azat Khuzhin](https://github.com/azat)). +* Preserve backward incompatibility for renamed settings by using aliases (`allow_experimental_projection_optimization` for `optimize_use_projections`, `allow_experimental_lightweight_delete` for `enable_lightweight_delete`). [#50044](https://github.com/ClickHouse/ClickHouse/pull/50044) ([Azat Khuzhin](https://github.com/azat)). +* Support cross-replication in distributed queries using the new infrastructure. [#50097](https://github.com/ClickHouse/ClickHouse/pull/50097) ([Dmitry Novik](https://github.com/novikd)). +* Support passing fqdn through setting my_hostname to register cluster node in keeper. Add setting of invisible to support multi compute groups. A compute group as a cluster, is invisible to other compute groups. [#50186](https://github.com/ClickHouse/ClickHouse/pull/50186) ([Yangkuan Liu](https://github.com/LiuYangkuan)). +* Fix PostgreSQL reading all the data even though `LIMIT n` could be specified. [#50187](https://github.com/ClickHouse/ClickHouse/pull/50187) ([Kseniia Sumarokova](https://github.com/kssenii)). +* 1) Fixed an error `NOT_FOUND_COLUMN_IN_BLOCK` in case of using parallel replicas with non-replicated storage with disabled setting `parallel_replicas_for_non_replicated_merge_tree` 2) Now `allow_experimental_parallel_reading_from_replicas` have 3 possible values - 0, 1 and 2. 0 - disabled, 1 - enabled, silently disable them in case of failure (in case of FINAL or JOIN), 2 - enabled, throw an expection in case of failure. 3) If FINAL modifier is used in SELECT query and parallel replicas are enabled, ClickHouse will try to disable them if `allow_experimental_parallel_reading_from_replicas` is set to 1 and throw an exception otherwise. [#50195](https://github.com/ClickHouse/ClickHouse/pull/50195) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). +* Don't send head request for all keys in Iceberg schema inference, only for keys that are used for reaing data. [#50203](https://github.com/ClickHouse/ClickHouse/pull/50203) ([Kruglov Pavel](https://github.com/Avogar)). +* Add new profile events for queries with subqueries (`QueriesWithSubqueries`/`SelectQueriesWithSubqueries`/`InsertQueriesWithSubqueries`). [#50204](https://github.com/ClickHouse/ClickHouse/pull/50204) ([Azat Khuzhin](https://github.com/azat)). +* Adding the roles field in the users.xml file, which allows specifying roles with grants via a config file. [#50278](https://github.com/ClickHouse/ClickHouse/pull/50278) ([pufit](https://github.com/pufit)). +* When parallel replicas are enabled they will always skip unavailable servers (the behavior is controlled by the setting `skip_unavailable_shards`, enabled by default and can be only disabled). This closes: [#48565](https://github.com/ClickHouse/ClickHouse/issues/48565). [#50293](https://github.com/ClickHouse/ClickHouse/pull/50293) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). +* Fix a typo. [#50306](https://github.com/ClickHouse/ClickHouse/pull/50306) ([helifu](https://github.com/helifu)). +* Setting `enable_memory_bound_merging_of_aggregation_results` is enabled by default. If you update from version prior to 22.12, we recommend to set this flag to `false` until update is finished. [#50319](https://github.com/ClickHouse/ClickHouse/pull/50319) ([Nikita Taranov](https://github.com/nickitat)). +* Report `CGroupCpuCfsPeriod` and `CGroupCpuCfsQuota` in AsynchronousMetrics. - Respect cgroup v2 memory limits during server startup. [#50379](https://github.com/ClickHouse/ClickHouse/pull/50379) ([alekar](https://github.com/alekar)). +* Bump internal protobuf to v3.18 (fixes CVE-2022-1941). [#50400](https://github.com/ClickHouse/ClickHouse/pull/50400) ([Robert Schulze](https://github.com/rschu1ze)). +* Bump internal libxml2 to v2.10.4 (fixes CVE-2023-28484 and CVE-2023-29469). [#50402](https://github.com/ClickHouse/ClickHouse/pull/50402) ([Robert Schulze](https://github.com/rschu1ze)). +* Bump c-ares to v1.19.1 (CVE-2023-32067, CVE-2023-31130, CVE-2023-31147). [#50403](https://github.com/ClickHouse/ClickHouse/pull/50403) ([Robert Schulze](https://github.com/rschu1ze)). +* Fix CVE-2022-2469 in libgsasl. [#50404](https://github.com/ClickHouse/ClickHouse/pull/50404) ([Robert Schulze](https://github.com/rschu1ze)). +* Make filter push down through cross join. [#50430](https://github.com/ClickHouse/ClickHouse/pull/50430) ([Han Fei](https://github.com/hanfei1991)). +* Add a signal handler for SIGQUIT to work the same way as SIGINT. Closes [#50298](https://github.com/ClickHouse/ClickHouse/issues/50298). [#50435](https://github.com/ClickHouse/ClickHouse/pull/50435) ([Nikolay Degterinsky](https://github.com/evillique)). +* In case JSON parse fails due to the large size of the object output the last position to allow debugging. [#50474](https://github.com/ClickHouse/ClickHouse/pull/50474) ([Valentin Alexeev](https://github.com/valentinalexeev)). +* Support decimals with not fixed size. Closes [#49130](https://github.com/ClickHouse/ClickHouse/issues/49130). [#50586](https://github.com/ClickHouse/ClickHouse/pull/50586) ([Kruglov Pavel](https://github.com/Avogar)). +* Disable pure parallel replicas if trivial count optimization is possible. [#50594](https://github.com/ClickHouse/ClickHouse/pull/50594) ([Raúl Marín](https://github.com/Algunenano)). +* Added support of TRUNCATE db.table additional to TRUNCATE TABLE db.table in MaterializedMySQL. [#50624](https://github.com/ClickHouse/ClickHouse/pull/50624) ([Val Doroshchuk](https://github.com/valbok)). +* Disable parallel replicas automatically when the estimated number of granules is less than threshold. The behavior is controlled by a setting `parallel_replicas_min_number_of_granules_to_enable`. [#50639](https://github.com/ClickHouse/ClickHouse/pull/50639) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). +* When creating skipping indexes via "ALTER TABLE table ADD INDEX", the "GRANULARITY" clause can now be omitted. In that case, GRANULARITY is assumed to be 1. [#50658](https://github.com/ClickHouse/ClickHouse/pull/50658) ([Robert Schulze](https://github.com/rschu1ze)). +* Fix slow cache in presence of big inserts. [#50680](https://github.com/ClickHouse/ClickHouse/pull/50680) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Set default max_elements limit in filesystem cache to 10000000. [#50682](https://github.com/ClickHouse/ClickHouse/pull/50682) ([Kseniia Sumarokova](https://github.com/kssenii)). +* SHOW INDICES is now an alias of statement SHOW INDEX/INDEXES/KEYS. [#50713](https://github.com/ClickHouse/ClickHouse/pull/50713) ([Robert Schulze](https://github.com/rschu1ze)). + +#### Build/Testing/Packaging Improvement +* New and improved keeper-bench. Everything can be customized from yaml/XML file: - request generator - each type of request generator can have a specific set of fields - multi requests can be generated just by doing the same under `multi` key - for each request or subrequest in multi a `weight` field can be defined to control distribution - define trees that need to be setup for a test run - hosts can be defined with all timeouts customizable and it's possible to control how many sessions to generate for each host - integers defined with `min_value` and `max_value` fields are random number generators. [#48547](https://github.com/ClickHouse/ClickHouse/pull/48547) ([Antonio Andelic](https://github.com/antonio2368)). +* ... Add a test to check max_rows_to_read_leaf behaviour. [#48950](https://github.com/ClickHouse/ClickHouse/pull/48950) ([Sean Haynes](https://github.com/seandhaynes)). +* Io_uring is not supported on macos, don't choose it when running tests on local to avoid occassional failures. [#49250](https://github.com/ClickHouse/ClickHouse/pull/49250) ([Frank Chen](https://github.com/FrankChen021)). +* Support named fault injection for testing. [#49361](https://github.com/ClickHouse/ClickHouse/pull/49361) ([Han Fei](https://github.com/hanfei1991)). +* Fix the 01193_metadata_loading test to match the query execution time specific to s390x. [#49455](https://github.com/ClickHouse/ClickHouse/pull/49455) ([MeenaRenganathan22](https://github.com/MeenaRenganathan22)). +* Use the RapidJSONParser library to parse the JSON float values in case of s390x. [#49457](https://github.com/ClickHouse/ClickHouse/pull/49457) ([MeenaRenganathan22](https://github.com/MeenaRenganathan22)). +* Allow running ClickHouse in the OS where the `prctl` (process control) syscall is not available, such as AWS Lambda. [#49538](https://github.com/ClickHouse/ClickHouse/pull/49538) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Improve CI check with an enabled analyzer. Now it should be green if only tests from `tests/broken_tests.txt` are broken. [#49562](https://github.com/ClickHouse/ClickHouse/pull/49562) ([Dmitry Novik](https://github.com/novikd)). +* Fixed the issue of build conflict between contrib/isa-l and isa-l in qpl [49296](https://github.com/ClickHouse/ClickHouse/issues/49296). [#49584](https://github.com/ClickHouse/ClickHouse/pull/49584) ([jasperzhu](https://github.com/jinjunzh)). +* Utilities are now only build if explicitly requested ("-DENABLE_UTILS=1") instead of by default, this reduces link times in typical development builds. [#49620](https://github.com/ClickHouse/ClickHouse/pull/49620) ([Robert Schulze](https://github.com/rschu1ze)). +* Pull build description of idxd-config into a separate CMake file to avoid accidental removal in future. [#49651](https://github.com/ClickHouse/ClickHouse/pull/49651) ([jasperzhu](https://github.com/jinjunzh)). +* Add CI check with an enabled analyzer in the master. Followup [#49562](https://github.com/ClickHouse/ClickHouse/issues/49562). [#49668](https://github.com/ClickHouse/ClickHouse/pull/49668) ([Dmitry Novik](https://github.com/novikd)). +* Switch to LLVM/clang 16. [#49678](https://github.com/ClickHouse/ClickHouse/pull/49678) ([Azat Khuzhin](https://github.com/azat)). +* Fixed DefaultHash64 for non-64 bit integers on s390x. [#49833](https://github.com/ClickHouse/ClickHouse/pull/49833) ([Harry Lee](https://github.com/HarryLeeIBM)). +* Allow building ClickHouse with clang-17. [#49851](https://github.com/ClickHouse/ClickHouse/pull/49851) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* ClickHouse is now easier to be integrated into other cmake projects. [#49991](https://github.com/ClickHouse/ClickHouse/pull/49991) ([Amos Bird](https://github.com/amosbird)). +* Link `boost::context` library to `clickhouse_common_io`. This closes: [#50381](https://github.com/ClickHouse/ClickHouse/issues/50381). [#50385](https://github.com/ClickHouse/ClickHouse/pull/50385) ([HaiBo Li](https://github.com/marising)). +* Add support for building with clang-17. [#50410](https://github.com/ClickHouse/ClickHouse/pull/50410) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fix strange additional QEMU logging after [#47151](https://github.com/ClickHouse/ClickHouse/issues/47151), see https://s3.amazonaws.com/clickhouse-test-reports/50078/a4743996ee4f3583884d07bcd6501df0cfdaa346/stateless_tests__release__databasereplicated__[3_4].html. [#50442](https://github.com/ClickHouse/ClickHouse/pull/50442) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* ClickHouse can work on Linux RISC-V 6.1.22. This closes [#50456](https://github.com/ClickHouse/ClickHouse/issues/50456). [#50457](https://github.com/ClickHouse/ClickHouse/pull/50457) ([Alexey Milovidov](https://github.com/alexey-milovidov)). + +#### Bug Fix (user-visible misbehavior in an official stable release) + +* ActionsDAG: fix wrong optimization [#47584](https://github.com/ClickHouse/ClickHouse/pull/47584) ([Salvatore Mesoraca](https://github.com/aiven-sal)). +* Correctly handle concurrent snapshots in Keeper [#48466](https://github.com/ClickHouse/ClickHouse/pull/48466) ([Antonio Andelic](https://github.com/antonio2368)). +* MergeTreeMarksLoader holds DataPart instead of DataPartStorage [#48515](https://github.com/ClickHouse/ClickHouse/pull/48515) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). +* sequence state fix [#48603](https://github.com/ClickHouse/ClickHouse/pull/48603) ([Ilya Golshtein](https://github.com/ilejn)). +* Back/Restore concurrency check on previous fails [#48726](https://github.com/ClickHouse/ClickHouse/pull/48726) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). +* Fix Attaching a table with non-existent ZK path does not increase the ReadonlyReplica metric [#48954](https://github.com/ClickHouse/ClickHouse/pull/48954) ([wangxiaobo](https://github.com/wzb5212)). +* Fix possible terminate called for uncaught exception in some places [#49112](https://github.com/ClickHouse/ClickHouse/pull/49112) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix key not found error for queries with multiple StorageJoin [#49137](https://github.com/ClickHouse/ClickHouse/pull/49137) ([vdimir](https://github.com/vdimir)). +* Fix wrong query result when using nullable primary key [#49172](https://github.com/ClickHouse/ClickHouse/pull/49172) ([Duc Canh Le](https://github.com/canhld94)). +* Revert "Fix GCS native copy ([#48981](https://github.com/ClickHouse/ClickHouse/issues/48981))" [#49194](https://github.com/ClickHouse/ClickHouse/pull/49194) ([Raúl Marín](https://github.com/Algunenano)). +* Fix reinterpretAs*() on big endian machines [#49198](https://github.com/ClickHouse/ClickHouse/pull/49198) ([Suzy Wang](https://github.com/SuzyWangIBMer)). +* Lock zero copy parts more atomically [#49211](https://github.com/ClickHouse/ClickHouse/pull/49211) ([alesapin](https://github.com/alesapin)). +* Fix race on Outdated parts loading [#49223](https://github.com/ClickHouse/ClickHouse/pull/49223) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Fix all key value is null and group use rollup return wrong answer [#49282](https://github.com/ClickHouse/ClickHouse/pull/49282) ([Shuai li](https://github.com/loneylee)). +* Fix calculating load_factor for HASHED dictionaries with SHARDS [#49319](https://github.com/ClickHouse/ClickHouse/pull/49319) ([Azat Khuzhin](https://github.com/azat)). +* Disallow configuring compression CODECs for alias columns [#49363](https://github.com/ClickHouse/ClickHouse/pull/49363) ([Timur Solodovnikov](https://github.com/tsolodov)). +* Fix bug in removal of existing part directory [#49365](https://github.com/ClickHouse/ClickHouse/pull/49365) ([alesapin](https://github.com/alesapin)). +* Properly fix GCS when HMAC is used [#49390](https://github.com/ClickHouse/ClickHouse/pull/49390) ([Antonio Andelic](https://github.com/antonio2368)). +* Fix fuzz bug when subquery set is not built when reading from remote() [#49425](https://github.com/ClickHouse/ClickHouse/pull/49425) ([Alexander Gololobov](https://github.com/davenger)). +* Invert `shutdown_wait_unfinished_queries` [#49427](https://github.com/ClickHouse/ClickHouse/pull/49427) ([Konstantin Bogdanov](https://github.com/thevar1able)). +* Fix another zero copy bug [#49473](https://github.com/ClickHouse/ClickHouse/pull/49473) ([alesapin](https://github.com/alesapin)). +* Fix postgres database setting [#49481](https://github.com/ClickHouse/ClickHouse/pull/49481) ([Mal Curtis](https://github.com/snikch)). +* Correctly handle s3Cluster arguments [#49490](https://github.com/ClickHouse/ClickHouse/pull/49490) ([Antonio Andelic](https://github.com/antonio2368)). +* Fix bug in TraceCollector destructor. [#49508](https://github.com/ClickHouse/ClickHouse/pull/49508) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). +* Fix AsynchronousReadIndirectBufferFromRemoteFS breaking on short seeks [#49525](https://github.com/ClickHouse/ClickHouse/pull/49525) ([Michael Kolupaev](https://github.com/al13n321)). +* Fix dictionaries loading order [#49560](https://github.com/ClickHouse/ClickHouse/pull/49560) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Forbid the change of data type of Object('json') column [#49563](https://github.com/ClickHouse/ClickHouse/pull/49563) ([Nikolay Degterinsky](https://github.com/evillique)). +* Fix stress test (Logical error: Expected 7134 >= 11030) [#49623](https://github.com/ClickHouse/ClickHouse/pull/49623) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix bug in DISTINCT [#49628](https://github.com/ClickHouse/ClickHouse/pull/49628) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fix: DISTINCT in order with zero values in non-sorted columns [#49636](https://github.com/ClickHouse/ClickHouse/pull/49636) ([Igor Nikonov](https://github.com/devcrafter)). +* Fix one-off error in big integers found by UBSan with fuzzer [#49645](https://github.com/ClickHouse/ClickHouse/pull/49645) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fix reading from sparse columns after restart [#49660](https://github.com/ClickHouse/ClickHouse/pull/49660) ([Anton Popov](https://github.com/CurtizJ)). +* Fix assert in SpanHolder::finish() with fibers [#49673](https://github.com/ClickHouse/ClickHouse/pull/49673) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix short circuit functions and mutations with sparse arguments [#49716](https://github.com/ClickHouse/ClickHouse/pull/49716) ([Anton Popov](https://github.com/CurtizJ)). +* Fix writing appended files to incremental backups [#49725](https://github.com/ClickHouse/ClickHouse/pull/49725) ([Vitaly Baranov](https://github.com/vitlibar)). +* Ignore LWD column in checkPartDynamicColumns [#49737](https://github.com/ClickHouse/ClickHouse/pull/49737) ([Alexander Gololobov](https://github.com/davenger)). +* Fix msan issue in randomStringUTF8() [#49750](https://github.com/ClickHouse/ClickHouse/pull/49750) ([Robert Schulze](https://github.com/rschu1ze)). +* Fix aggregate function kolmogorovSmirnovTest [#49768](https://github.com/ClickHouse/ClickHouse/pull/49768) ([FFFFFFFHHHHHHH](https://github.com/FFFFFFFHHHHHHH)). +* Fix settings aliases in native protocol [#49776](https://github.com/ClickHouse/ClickHouse/pull/49776) ([Azat Khuzhin](https://github.com/azat)). +* Fix `arrayMap` with array of tuples with single argument [#49789](https://github.com/ClickHouse/ClickHouse/pull/49789) ([Anton Popov](https://github.com/CurtizJ)). +* Fix per-query IO/BACKUPs throttling settings [#49797](https://github.com/ClickHouse/ClickHouse/pull/49797) ([Azat Khuzhin](https://github.com/azat)). +* Fix setting NULL in profile definition [#49831](https://github.com/ClickHouse/ClickHouse/pull/49831) ([Vitaly Baranov](https://github.com/vitlibar)). +* Fix a bug with projections and the aggregate_functions_null_for_empty setting (for query_plan_optimize_projection) [#49873](https://github.com/ClickHouse/ClickHouse/pull/49873) ([Amos Bird](https://github.com/amosbird)). +* Fix processing pending batch for Distributed async INSERT after restart [#49884](https://github.com/ClickHouse/ClickHouse/pull/49884) ([Azat Khuzhin](https://github.com/azat)). +* Fix assertion in CacheMetadata::doCleanup [#49914](https://github.com/ClickHouse/ClickHouse/pull/49914) ([Kseniia Sumarokova](https://github.com/kssenii)). +* fix `is_prefix` in OptimizeRegularExpression [#49919](https://github.com/ClickHouse/ClickHouse/pull/49919) ([Han Fei](https://github.com/hanfei1991)). +* Fix metrics `WriteBufferFromS3Bytes`, `WriteBufferFromS3Microseconds` and `WriteBufferFromS3RequestsErrors` [#49930](https://github.com/ClickHouse/ClickHouse/pull/49930) ([Aleksandr Musorin](https://github.com/AVMusorin)). +* Fix IPv6 encoding in protobuf [#49933](https://github.com/ClickHouse/ClickHouse/pull/49933) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). +* Fix possible Logical error on bad Nullable parsing for text formats [#49960](https://github.com/ClickHouse/ClickHouse/pull/49960) ([Kruglov Pavel](https://github.com/Avogar)). +* Add setting output_format_parquet_compliant_nested_types to produce more compatible Parquet files [#50001](https://github.com/ClickHouse/ClickHouse/pull/50001) ([Michael Kolupaev](https://github.com/al13n321)). +* Fix logical error in stress test "Not enough space to add ..." [#50021](https://github.com/ClickHouse/ClickHouse/pull/50021) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Avoid deadlock when starting table in attach thread of `ReplicatedMergeTree` [#50026](https://github.com/ClickHouse/ClickHouse/pull/50026) ([Antonio Andelic](https://github.com/antonio2368)). +* Fix assert in SpanHolder::finish() with fibers attempt 2 [#50034](https://github.com/ClickHouse/ClickHouse/pull/50034) ([Kruglov Pavel](https://github.com/Avogar)). +* Add proper escaping for DDL OpenTelemetry context serialization [#50045](https://github.com/ClickHouse/ClickHouse/pull/50045) ([Azat Khuzhin](https://github.com/azat)). +* Fix reporting broken projection parts [#50052](https://github.com/ClickHouse/ClickHouse/pull/50052) ([Amos Bird](https://github.com/amosbird)). +* JIT compilation not equals NaN fix [#50056](https://github.com/ClickHouse/ClickHouse/pull/50056) ([Maksim Kita](https://github.com/kitaisreal)). +* Fix crashing in case of Replicated database without arguments [#50058](https://github.com/ClickHouse/ClickHouse/pull/50058) ([Azat Khuzhin](https://github.com/azat)). +* Fix crash with `multiIf` and constant condition and nullable arguments [#50123](https://github.com/ClickHouse/ClickHouse/pull/50123) ([Anton Popov](https://github.com/CurtizJ)). +* Fix invalid index analysis for date related keys [#50153](https://github.com/ClickHouse/ClickHouse/pull/50153) ([Amos Bird](https://github.com/amosbird)). +* do not allow modify order by when there are no order by cols [#50154](https://github.com/ClickHouse/ClickHouse/pull/50154) ([Han Fei](https://github.com/hanfei1991)). +* Fix broken index analysis when binary operator contains a null constant argument [#50177](https://github.com/ClickHouse/ClickHouse/pull/50177) ([Amos Bird](https://github.com/amosbird)). +* clickhouse-client: disallow usage of `--query` and `--queries-file` at the same time [#50210](https://github.com/ClickHouse/ClickHouse/pull/50210) ([Alexey Gerasimchuck](https://github.com/Demilivor)). +* Fix UB for INTO OUTFILE extensions (APPEND / AND STDOUT) and WATCH EVENTS [#50216](https://github.com/ClickHouse/ClickHouse/pull/50216) ([Azat Khuzhin](https://github.com/azat)). +* Fix skipping spaces at end of row in CustomSeparatedIgnoreSpaces format [#50224](https://github.com/ClickHouse/ClickHouse/pull/50224) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix iceberg metadata parsing [#50232](https://github.com/ClickHouse/ClickHouse/pull/50232) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix nested distributed SELECT in WITH clause [#50234](https://github.com/ClickHouse/ClickHouse/pull/50234) ([Azat Khuzhin](https://github.com/azat)). +* Fix reconnecting of HTTPS session when target host IP was changed [#50240](https://github.com/ClickHouse/ClickHouse/pull/50240) ([Aleksei Filatov](https://github.com/aalexfvk)). +* Fix msan issue in keyed siphash [#50245](https://github.com/ClickHouse/ClickHouse/pull/50245) ([Robert Schulze](https://github.com/rschu1ze)). +* Fix bugs in Poco sockets in non-blocking mode, use true non-blocking sockets [#50252](https://github.com/ClickHouse/ClickHouse/pull/50252) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix checksum calculation for backup entries [#50264](https://github.com/ClickHouse/ClickHouse/pull/50264) ([Vitaly Baranov](https://github.com/vitlibar)). +* Fixed type conversion from Date/Date32 to DateTime64 when querying with DateTime64 index [#50280](https://github.com/ClickHouse/ClickHouse/pull/50280) ([Lucas Chang](https://github.com/lucas-tubi)). +* Comparison functions NaN fix [#50287](https://github.com/ClickHouse/ClickHouse/pull/50287) ([Maksim Kita](https://github.com/kitaisreal)). +* JIT aggregation nullable key fix [#50291](https://github.com/ClickHouse/ClickHouse/pull/50291) ([Maksim Kita](https://github.com/kitaisreal)). +* Fix clickhouse-local crashing when writing empty Arrow or Parquet output [#50328](https://github.com/ClickHouse/ClickHouse/pull/50328) ([Michael Kolupaev](https://github.com/al13n321)). +* Fix crash when Pool::Entry::disconnect() is called [#50334](https://github.com/ClickHouse/ClickHouse/pull/50334) ([Val Doroshchuk](https://github.com/valbok)). +* Improved fetch part by holding directory lock longer [#50339](https://github.com/ClickHouse/ClickHouse/pull/50339) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). +* Fix bitShift* functions with both constant arguments [#50343](https://github.com/ClickHouse/ClickHouse/pull/50343) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix Keeper deadlock on exception when preprocessing requests. [#50387](https://github.com/ClickHouse/ClickHouse/pull/50387) ([frinkr](https://github.com/frinkr)). +* Fix hashing of const integer values [#50421](https://github.com/ClickHouse/ClickHouse/pull/50421) ([Robert Schulze](https://github.com/rschu1ze)). +* Fix excessive memory usage for FINAL (due to too much streams usage) [#50429](https://github.com/ClickHouse/ClickHouse/pull/50429) ([Azat Khuzhin](https://github.com/azat)). +* Fix merge_tree_min_rows_for_seek/merge_tree_min_bytes_for_seek for data skipping indexes [#50432](https://github.com/ClickHouse/ClickHouse/pull/50432) ([Azat Khuzhin](https://github.com/azat)). +* Limit the number of in-flight tasks for loading outdated parts [#50450](https://github.com/ClickHouse/ClickHouse/pull/50450) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). +* Keeper fix: apply uncommitted state after snapshot install [#50483](https://github.com/ClickHouse/ClickHouse/pull/50483) ([Antonio Andelic](https://github.com/antonio2368)). +* Fix incorrect constant folding [#50536](https://github.com/ClickHouse/ClickHouse/pull/50536) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fix logical error in stress test (Not enough space to add ...) [#50583](https://github.com/ClickHouse/ClickHouse/pull/50583) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix converting Null to LowCardinality(Nullable) in values table function [#50637](https://github.com/ClickHouse/ClickHouse/pull/50637) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix crash in anti/semi join [#50638](https://github.com/ClickHouse/ClickHouse/pull/50638) ([vdimir](https://github.com/vdimir)). +* Revert invalid RegExpTreeDictionary optimization [#50642](https://github.com/ClickHouse/ClickHouse/pull/50642) ([Johann Gan](https://github.com/johanngan)). +* Correctly disable async insert with deduplication when it's not needed [#50663](https://github.com/ClickHouse/ClickHouse/pull/50663) ([Antonio Andelic](https://github.com/antonio2368)). + +#### Build Improvement + +* Fixed Functional Test 00870_t64_codec, 00871_t64_codec_signed, 00872_t64_bit_codec. [#49658](https://github.com/ClickHouse/ClickHouse/pull/49658) ([Sanjam Panda](https://github.com/saitama951)). + +#### NO CL ENTRY + +* NO CL ENTRY: 'Fix user MemoryTracker counter in async inserts'. [#47630](https://github.com/ClickHouse/ClickHouse/pull/47630) ([Dmitry Novik](https://github.com/novikd)). +* NO CL ENTRY: 'Revert "Make `Pretty` formats even prettier."'. [#49850](https://github.com/ClickHouse/ClickHouse/pull/49850) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* NO CL ENTRY: 'Update first_value.md:remove redundant 's''. [#50331](https://github.com/ClickHouse/ClickHouse/pull/50331) ([sslouis](https://github.com/savezed)). +* NO CL ENTRY: 'Revert "less logs in WriteBufferFromS3"'. [#50390](https://github.com/ClickHouse/ClickHouse/pull/50390) ([Alexander Tokmakov](https://github.com/tavplubix)). + +#### NOT FOR CHANGELOG / INSIGNIFICANT + +* Attempt to fix the "system.stack_trace" test [#44627](https://github.com/ClickHouse/ClickHouse/pull/44627) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* rework WriteBufferFromS3, add tests, add abortion [#44869](https://github.com/ClickHouse/ClickHouse/pull/44869) ([Sema Checherinda](https://github.com/CheSema)). +* Rework locking in fs cache [#44985](https://github.com/ClickHouse/ClickHouse/pull/44985) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Update ubuntu_ami_for_ci.sh [#47151](https://github.com/ClickHouse/ClickHouse/pull/47151) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Implement status comment [#48468](https://github.com/ClickHouse/ClickHouse/pull/48468) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Update curl to 8.0.1 (for CVEs) [#48765](https://github.com/ClickHouse/ClickHouse/pull/48765) ([Boris Kuschel](https://github.com/bkuschel)). +* Fix some tests [#48792](https://github.com/ClickHouse/ClickHouse/pull/48792) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Bug Fix for 02432_s3_parallel_parts_cleanup.sql with zero copy replication [#48865](https://github.com/ClickHouse/ClickHouse/pull/48865) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). +* Add AsyncLoader with dependency tracking and runtime prioritization [#48923](https://github.com/ClickHouse/ClickHouse/pull/48923) ([Sergei Trifonov](https://github.com/serxa)). +* Fix incorrect createColumn call on join clause [#48998](https://github.com/ClickHouse/ClickHouse/pull/48998) ([Ongkong](https://github.com/ongkong)). +* Try fix flaky 01346_alter_enum_partition_key_replicated_zookeeper_long [#49099](https://github.com/ClickHouse/ClickHouse/pull/49099) ([Sergei Trifonov](https://github.com/serxa)). +* Fix possible logical error "Cannot cancel. Either no query sent or already cancelled" [#49106](https://github.com/ClickHouse/ClickHouse/pull/49106) ([Kruglov Pavel](https://github.com/Avogar)). +* Refactor ColumnLowCardinality::cutAndCompact [#49111](https://github.com/ClickHouse/ClickHouse/pull/49111) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Fix tests with enabled analyzer [#49116](https://github.com/ClickHouse/ClickHouse/pull/49116) ([Dmitry Novik](https://github.com/novikd)). +* Use `SharedMutex` instead of `UpgradableMutex` [#49139](https://github.com/ClickHouse/ClickHouse/pull/49139) ([Sergei Trifonov](https://github.com/serxa)). +* Don't add metadata_version file if it doesn't exist [#49146](https://github.com/ClickHouse/ClickHouse/pull/49146) ([alesapin](https://github.com/alesapin)). +* clearing s3 between tests in a robust way [#49157](https://github.com/ClickHouse/ClickHouse/pull/49157) ([Sema Checherinda](https://github.com/CheSema)). +* Align connect timeout with aws sdk default [#49161](https://github.com/ClickHouse/ClickHouse/pull/49161) ([Nikita Taranov](https://github.com/nickitat)). +* Fix test_encrypted_disk_replication [#49193](https://github.com/ClickHouse/ClickHouse/pull/49193) ([Vitaly Baranov](https://github.com/vitlibar)). +* Allow using function `concat` with `Map` type [#49200](https://github.com/ClickHouse/ClickHouse/pull/49200) ([Anton Popov](https://github.com/CurtizJ)). +* Slight improvements to coordinator logging [#49204](https://github.com/ClickHouse/ClickHouse/pull/49204) ([Raúl Marín](https://github.com/Algunenano)). +* Fix some typos in conversion functions [#49221](https://github.com/ClickHouse/ClickHouse/pull/49221) ([Raúl Marín](https://github.com/Algunenano)). +* CMake: Remove some GCC-specific code [#49224](https://github.com/ClickHouse/ClickHouse/pull/49224) ([Robert Schulze](https://github.com/rschu1ze)). +* Fix oss-fuzz build errors [#49236](https://github.com/ClickHouse/ClickHouse/pull/49236) ([Nikita Taranov](https://github.com/nickitat)). +* Update version after release [#49237](https://github.com/ClickHouse/ClickHouse/pull/49237) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Update version_date.tsv and changelogs after v23.4.1.1943-stable [#49239](https://github.com/ClickHouse/ClickHouse/pull/49239) ([robot-clickhouse](https://github.com/robot-clickhouse)). +* Merge [#24050](https://github.com/ClickHouse/ClickHouse/issues/24050) [#49240](https://github.com/ClickHouse/ClickHouse/pull/49240) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Add file name to exception raised during decompression [#49241](https://github.com/ClickHouse/ClickHouse/pull/49241) ([Nikolay Degterinsky](https://github.com/evillique)). +* Disable ISA-L on aarch64 architectures [#49256](https://github.com/ClickHouse/ClickHouse/pull/49256) ([Jordi Villar](https://github.com/jrdi)). +* Add a comment in FileCache.cpp [#49260](https://github.com/ClickHouse/ClickHouse/pull/49260) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix garbage [#48719](https://github.com/ClickHouse/ClickHouse/issues/48719) [#49263](https://github.com/ClickHouse/ClickHouse/pull/49263) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Update build for nasm [#49288](https://github.com/ClickHouse/ClickHouse/pull/49288) ([Robert Schulze](https://github.com/rschu1ze)). +* Fix race in `waitForProcessingQueue` [#49302](https://github.com/ClickHouse/ClickHouse/pull/49302) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Fix stress test [#49309](https://github.com/ClickHouse/ClickHouse/pull/49309) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix 02516_join_with_totals_and_subquery_bug with new analyzer [#49310](https://github.com/ClickHouse/ClickHouse/pull/49310) ([Dmitry Novik](https://github.com/novikd)). +* Fallback auth gh api [#49314](https://github.com/ClickHouse/ClickHouse/pull/49314) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Unpoison stack frame ptrs from libunwind for msan [#49316](https://github.com/ClickHouse/ClickHouse/pull/49316) ([Robert Schulze](https://github.com/rschu1ze)). +* Respect projections in 01600_parts [#49318](https://github.com/ClickHouse/ClickHouse/pull/49318) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* move pipe compute into initializePipeline [#49326](https://github.com/ClickHouse/ClickHouse/pull/49326) ([Konstantin Morozov](https://github.com/k-morozov)). +* Fix compiling average example (suppress -Wframe-larger-than) [#49358](https://github.com/ClickHouse/ClickHouse/pull/49358) ([Azat Khuzhin](https://github.com/azat)). +* Fix join_use_nulls in analyzer [#49359](https://github.com/ClickHouse/ClickHouse/pull/49359) ([vdimir](https://github.com/vdimir)). +* Fix 02680_mysql_ast_logical_err in analyzer [#49362](https://github.com/ClickHouse/ClickHouse/pull/49362) ([vdimir](https://github.com/vdimir)). +* Remove wrong assertion in cache [#49376](https://github.com/ClickHouse/ClickHouse/pull/49376) ([Kseniia Sumarokova](https://github.com/kssenii)). +* A better way of excluding ISA-L on non-x86 [#49378](https://github.com/ClickHouse/ClickHouse/pull/49378) ([Robert Schulze](https://github.com/rschu1ze)). +* Fix decimal aggregates test for s390x [#49382](https://github.com/ClickHouse/ClickHouse/pull/49382) ([Harry Lee](https://github.com/HarryLeeIBM)). +* Move logging one line higher [#49387](https://github.com/ClickHouse/ClickHouse/pull/49387) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Improve CI: status commit, auth for get_gh_api [#49388](https://github.com/ClickHouse/ClickHouse/pull/49388) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Fix printing hung queries in clickhouse-test. [#49389](https://github.com/ClickHouse/ClickHouse/pull/49389) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Correctly stop CNF convert for too many atomics in new analyzer [#49402](https://github.com/ClickHouse/ClickHouse/pull/49402) ([Antonio Andelic](https://github.com/antonio2368)). +* Remove 02707_complex_query_fails_analyzer test [#49403](https://github.com/ClickHouse/ClickHouse/pull/49403) ([Dmitry Novik](https://github.com/novikd)). +* Update FileSegment.cpp [#49411](https://github.com/ClickHouse/ClickHouse/pull/49411) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Switch Block::NameMap to google::dense_hash_map over HashMap [#49412](https://github.com/ClickHouse/ClickHouse/pull/49412) ([Azat Khuzhin](https://github.com/azat)). +* Slightly reduce inter-header dependencies [#49413](https://github.com/ClickHouse/ClickHouse/pull/49413) ([Azat Khuzhin](https://github.com/azat)). +* Update WithFileName.cpp [#49414](https://github.com/ClickHouse/ClickHouse/pull/49414) ([Nikolay Degterinsky](https://github.com/evillique)). +* Fix some assertions failing in stress test [#49415](https://github.com/ClickHouse/ClickHouse/pull/49415) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Correctly cleanup sequential node in ZooKeeperWithFaultInjection [#49418](https://github.com/ClickHouse/ClickHouse/pull/49418) ([vdimir](https://github.com/vdimir)). +* Throw an exception for non-parametric functions in new analyzer [#49419](https://github.com/ClickHouse/ClickHouse/pull/49419) ([Dmitry Novik](https://github.com/novikd)). +* Fix some bad error messages [#49420](https://github.com/ClickHouse/ClickHouse/pull/49420) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Update version_date.tsv and changelogs after v23.4.2.11-stable [#49422](https://github.com/ClickHouse/ClickHouse/pull/49422) ([robot-clickhouse](https://github.com/robot-clickhouse)). +* Remove trash [#49423](https://github.com/ClickHouse/ClickHouse/pull/49423) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Whitespaces [#49424](https://github.com/ClickHouse/ClickHouse/pull/49424) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Remove dependency from DB::Context in remote/cache readers [#49426](https://github.com/ClickHouse/ClickHouse/pull/49426) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Merging [#49066](https://github.com/ClickHouse/ClickHouse/issues/49066) (Better error handling during loading of parts) [#49430](https://github.com/ClickHouse/ClickHouse/pull/49430) ([Anton Popov](https://github.com/CurtizJ)). +* all s3-blobs removed when merge aborted, remove part from failed fetch without unlock keper [#49432](https://github.com/ClickHouse/ClickHouse/pull/49432) ([Sema Checherinda](https://github.com/CheSema)). +* Make INSERT do more things in parallel to avoid getting bottlenecked on one thread [#49434](https://github.com/ClickHouse/ClickHouse/pull/49434) ([Michael Kolupaev](https://github.com/al13n321)). +* Make 'exceptions shorter than 30' test less noisy [#49435](https://github.com/ClickHouse/ClickHouse/pull/49435) ([Michael Kolupaev](https://github.com/al13n321)). +* Build fixes for ENABLE_LIBRARIES=OFF [#49437](https://github.com/ClickHouse/ClickHouse/pull/49437) ([Azat Khuzhin](https://github.com/azat)). +* Add image for docker-server jepsen [#49452](https://github.com/ClickHouse/ClickHouse/pull/49452) ([alesapin](https://github.com/alesapin)). +* Follow-up to [#48792](https://github.com/ClickHouse/ClickHouse/issues/48792) [#49458](https://github.com/ClickHouse/ClickHouse/pull/49458) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Add method `getCurrentAvailabilityZone` to `AWSEC2MetadataClient` [#49464](https://github.com/ClickHouse/ClickHouse/pull/49464) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Add an integration test for `shutdown_wait_unfinished_queries` [#49469](https://github.com/ClickHouse/ClickHouse/pull/49469) ([Konstantin Bogdanov](https://github.com/thevar1able)). +* Replace `NO DELAY` with `SYNC` in tests [#49470](https://github.com/ClickHouse/ClickHouse/pull/49470) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Check the PRs body directly in lambda, without rerun. Fix RCE in the CI [#49475](https://github.com/ClickHouse/ClickHouse/pull/49475) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Minor changes for setThreadName [#49476](https://github.com/ClickHouse/ClickHouse/pull/49476) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Static cast std::atomic to uint64_t to serialize. [#49482](https://github.com/ClickHouse/ClickHouse/pull/49482) ([alekar](https://github.com/alekar)). +* Fix logical error in stress test, add some logging [#49491](https://github.com/ClickHouse/ClickHouse/pull/49491) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fixes in server jepsen image [#49492](https://github.com/ClickHouse/ClickHouse/pull/49492) ([alesapin](https://github.com/alesapin)). +* Fix UserTimeMicroseconds and SystemTimeMicroseconds descriptions [#49521](https://github.com/ClickHouse/ClickHouse/pull/49521) ([Sergei Trifonov](https://github.com/serxa)). +* Remove garbage from HDFS [#49531](https://github.com/ClickHouse/ClickHouse/pull/49531) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Split ReadWriteBufferFromHTTP.h into .h and .cpp file [#49533](https://github.com/ClickHouse/ClickHouse/pull/49533) ([Michael Kolupaev](https://github.com/al13n321)). +* Remove garbage from Pretty format [#49534](https://github.com/ClickHouse/ClickHouse/pull/49534) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Make input_format_parquet_preserve_order imply !parallelize_output_from_storages [#49536](https://github.com/ClickHouse/ClickHouse/pull/49536) ([Michael Kolupaev](https://github.com/al13n321)). +* Remove extra semicolons [#49545](https://github.com/ClickHouse/ClickHouse/pull/49545) ([Bulat Gaifullin](https://github.com/bgaifullin)). +* Fix 00597_push_down_predicate_long for analyzer [#49551](https://github.com/ClickHouse/ClickHouse/pull/49551) ([Igor Nikonov](https://github.com/devcrafter)). +* Fix stress test (assertion 'key_metadata.lock()') [#49554](https://github.com/ClickHouse/ClickHouse/pull/49554) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix writeAnyEscapedString if quote_character is a meta character [#49558](https://github.com/ClickHouse/ClickHouse/pull/49558) ([Robert Schulze](https://github.com/rschu1ze)). +* Add CMake option for BOOST_USE_UCONTEXT [#49564](https://github.com/ClickHouse/ClickHouse/pull/49564) ([ltrk2](https://github.com/ltrk2)). +* Fix 01655_plan_optimizations_optimize_read_in_window_order for analyzer [#49565](https://github.com/ClickHouse/ClickHouse/pull/49565) ([Igor Nikonov](https://github.com/devcrafter)). +* Fix `ThreadPool::wait` [#49572](https://github.com/ClickHouse/ClickHouse/pull/49572) ([Anton Popov](https://github.com/CurtizJ)). +* Query cache: disable for internal queries [#49573](https://github.com/ClickHouse/ClickHouse/pull/49573) ([Robert Schulze](https://github.com/rschu1ze)). +* Remove `test_merge_tree_s3_restore` [#49576](https://github.com/ClickHouse/ClickHouse/pull/49576) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fix bad test [#49578](https://github.com/ClickHouse/ClickHouse/pull/49578) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Remove obsolete test about deprecated feature [#49579](https://github.com/ClickHouse/ClickHouse/pull/49579) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Avoid error found by AST Fuzzer [#49580](https://github.com/ClickHouse/ClickHouse/pull/49580) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fix wrong assert [#49581](https://github.com/ClickHouse/ClickHouse/pull/49581) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Flaky test 02723_zookeeper_name.sql [#49592](https://github.com/ClickHouse/ClickHouse/pull/49592) ([Sema Checherinda](https://github.com/CheSema)). +* Query Cache: Safeguard against empty chunks [#49593](https://github.com/ClickHouse/ClickHouse/pull/49593) ([Robert Schulze](https://github.com/rschu1ze)). +* 02723_zookeeper_name: Force a deterministic result order [#49594](https://github.com/ClickHouse/ClickHouse/pull/49594) ([Robert Schulze](https://github.com/rschu1ze)). +* Remove dangerous code (stringstream) [#49595](https://github.com/ClickHouse/ClickHouse/pull/49595) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Remove some code [#49596](https://github.com/ClickHouse/ClickHouse/pull/49596) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Remove "locale" [#49597](https://github.com/ClickHouse/ClickHouse/pull/49597) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* CMake: Cleanup utils build [#49598](https://github.com/ClickHouse/ClickHouse/pull/49598) ([Robert Schulze](https://github.com/rschu1ze)). +* Follow-up for [#49580](https://github.com/ClickHouse/ClickHouse/issues/49580) [#49604](https://github.com/ClickHouse/ClickHouse/pull/49604) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fix typo [#49605](https://github.com/ClickHouse/ClickHouse/pull/49605) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fix bad test 01660_system_parts_smoke [#49611](https://github.com/ClickHouse/ClickHouse/pull/49611) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Minor changes [#49612](https://github.com/ClickHouse/ClickHouse/pull/49612) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Follow-up for [#49576](https://github.com/ClickHouse/ClickHouse/issues/49576) [#49615](https://github.com/ClickHouse/ClickHouse/pull/49615) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fix error in [#48300](https://github.com/ClickHouse/ClickHouse/issues/48300) [#49616](https://github.com/ClickHouse/ClickHouse/pull/49616) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fix typo: "as much slots" -> "as many slots" [#49617](https://github.com/ClickHouse/ClickHouse/pull/49617) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Better concurrent parts removal with zero copy [#49619](https://github.com/ClickHouse/ClickHouse/pull/49619) ([alesapin](https://github.com/alesapin)). +* CMake: Remove legacy switch for ccache [#49627](https://github.com/ClickHouse/ClickHouse/pull/49627) ([Robert Schulze](https://github.com/rschu1ze)). +* Try to fix integration test 'test_ssl_cert_authentication' [#49632](https://github.com/ClickHouse/ClickHouse/pull/49632) ([Nikolay Degterinsky](https://github.com/evillique)). +* Unflake 01660_system_parts_smoke [#49633](https://github.com/ClickHouse/ClickHouse/pull/49633) ([Robert Schulze](https://github.com/rschu1ze)). +* Add trash [#49634](https://github.com/ClickHouse/ClickHouse/pull/49634) ([Robert Schulze](https://github.com/rschu1ze)). +* Remove commented code [#49635](https://github.com/ClickHouse/ClickHouse/pull/49635) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Add flaky test [#49646](https://github.com/ClickHouse/ClickHouse/pull/49646) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fix race in `Context::createCopy` [#49663](https://github.com/ClickHouse/ClickHouse/pull/49663) ([Anton Popov](https://github.com/CurtizJ)). +* Disable 01710_projection_aggregation_in_order.sql [#49667](https://github.com/ClickHouse/ClickHouse/pull/49667) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Fix flaky 02684_bson.sql [#49674](https://github.com/ClickHouse/ClickHouse/pull/49674) ([Kruglov Pavel](https://github.com/Avogar)). +* Some cache cleanup after rework locking [#49675](https://github.com/ClickHouse/ClickHouse/pull/49675) ([Igor Nikonov](https://github.com/devcrafter)). +* Correctly update log pointer during database replica recovery [#49676](https://github.com/ClickHouse/ClickHouse/pull/49676) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Enable distinct in order after fix [#49636](https://github.com/ClickHouse/ClickHouse/issues/49636) [#49677](https://github.com/ClickHouse/ClickHouse/pull/49677) ([Igor Nikonov](https://github.com/devcrafter)). +* Build fixes for RISCV64 [#49688](https://github.com/ClickHouse/ClickHouse/pull/49688) ([Azat Khuzhin](https://github.com/azat)). +* Add some logging [#49690](https://github.com/ClickHouse/ClickHouse/pull/49690) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix a wrong built generator removal, use `depth=1` [#49692](https://github.com/ClickHouse/ClickHouse/pull/49692) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Fix member call on null pointer in AST fuzzer [#49696](https://github.com/ClickHouse/ClickHouse/pull/49696) ([Nikolay Degterinsky](https://github.com/evillique)). +* Improve woboq codebrowser pipeline [#49701](https://github.com/ClickHouse/ClickHouse/pull/49701) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Enable `do_not_evict_index_and_mark_files` by default [#49702](https://github.com/ClickHouse/ClickHouse/pull/49702) ([Nikita Taranov](https://github.com/nickitat)). +* Backport fix for UBSan error in musl/logf.c [#49705](https://github.com/ClickHouse/ClickHouse/pull/49705) ([Nikita Taranov](https://github.com/nickitat)). +* Fix flaky test for `kolmogorovSmirnovTest` function [#49710](https://github.com/ClickHouse/ClickHouse/pull/49710) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). +* Update clickhouse-test [#49712](https://github.com/ClickHouse/ClickHouse/pull/49712) ([Alexander Tokmakov](https://github.com/tavplubix)). +* IBM s390x: ip encoding fix [#49713](https://github.com/ClickHouse/ClickHouse/pull/49713) ([Suzy Wang](https://github.com/SuzyWangIBMer)). +* Remove not used ErrorCodes [#49715](https://github.com/ClickHouse/ClickHouse/pull/49715) ([Sergei Trifonov](https://github.com/serxa)). +* Disable mmap for StorageFile in clickhouse-server [#49717](https://github.com/ClickHouse/ClickHouse/pull/49717) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fix typo [#49718](https://github.com/ClickHouse/ClickHouse/pull/49718) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Do not launch workflows for PRs w/o "can be tested" [#49726](https://github.com/ClickHouse/ClickHouse/pull/49726) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Move assertions after logging [#49729](https://github.com/ClickHouse/ClickHouse/pull/49729) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Docs: Fix sidebar label for dictionary table function [#49730](https://github.com/ClickHouse/ClickHouse/pull/49730) ([Robert Schulze](https://github.com/rschu1ze)). +* Do not allocate own buffer in CachedOnDiskReadBufferFromFile when `use_external_buffer == true` [#49733](https://github.com/ClickHouse/ClickHouse/pull/49733) ([Nikita Taranov](https://github.com/nickitat)). +* fix convertation [#49749](https://github.com/ClickHouse/ClickHouse/pull/49749) ([Sema Checherinda](https://github.com/CheSema)). +* fix flaky test 02504_regexp_dictionary_ua_parser [#49753](https://github.com/ClickHouse/ClickHouse/pull/49753) ([Han Fei](https://github.com/hanfei1991)). +* Fix unit test `ExceptionFromWait` [#49755](https://github.com/ClickHouse/ClickHouse/pull/49755) ([Anton Popov](https://github.com/CurtizJ)). +* Add forgotten lock (addition to [#49117](https://github.com/ClickHouse/ClickHouse/issues/49117)) [#49757](https://github.com/ClickHouse/ClickHouse/pull/49757) ([Anton Popov](https://github.com/CurtizJ)). +* Fix typo [#49762](https://github.com/ClickHouse/ClickHouse/pull/49762) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fix build of `libfiu` on clang-16 [#49766](https://github.com/ClickHouse/ClickHouse/pull/49766) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Update README.md [#49782](https://github.com/ClickHouse/ClickHouse/pull/49782) ([Tyler Hannan](https://github.com/tylerhannan)). +* Analyzer: fix column not found for optimized prewhere with sample by [#49784](https://github.com/ClickHouse/ClickHouse/pull/49784) ([vdimir](https://github.com/vdimir)). +* Typo: demange.cpp --> demangle.cpp [#49799](https://github.com/ClickHouse/ClickHouse/pull/49799) ([Robert Schulze](https://github.com/rschu1ze)). +* Analyzer: apply _CAST to constants only once [#49800](https://github.com/ClickHouse/ClickHouse/pull/49800) ([Dmitry Novik](https://github.com/novikd)). +* Use CLOCK_MONOTONIC_RAW over CLOCK_MONOTONIC on Linux (fixes non monotonic clock) [#49819](https://github.com/ClickHouse/ClickHouse/pull/49819) ([Azat Khuzhin](https://github.com/azat)). +* README.md: 4 --> 5 [#49822](https://github.com/ClickHouse/ClickHouse/pull/49822) ([Robert Schulze](https://github.com/rschu1ze)). +* Allow ASOF JOIN over nullable right column [#49826](https://github.com/ClickHouse/ClickHouse/pull/49826) ([vdimir](https://github.com/vdimir)). +* Make 01533_multiple_nested test more reliable [#49828](https://github.com/ClickHouse/ClickHouse/pull/49828) ([Salvatore Mesoraca](https://github.com/aiven-sal)). +* What happens if I remove everything in msan_suppressions? [#49829](https://github.com/ClickHouse/ClickHouse/pull/49829) ([Robert Schulze](https://github.com/rschu1ze)). +* Update README.md [#49832](https://github.com/ClickHouse/ClickHouse/pull/49832) ([AnneClickHouse](https://github.com/AnneClickHouse)). +* Randomize enable_multiple_prewhere_read_steps setting [#49834](https://github.com/ClickHouse/ClickHouse/pull/49834) ([Alexander Gololobov](https://github.com/davenger)). +* Analyzer: do not optimize GROUP BY keys with ROLLUP and CUBE [#49838](https://github.com/ClickHouse/ClickHouse/pull/49838) ([Dmitry Novik](https://github.com/novikd)). +* Clearable hash table and zero values [#49846](https://github.com/ClickHouse/ClickHouse/pull/49846) ([Igor Nikonov](https://github.com/devcrafter)). +* Reset vectorscan reference to an "official" repo [#49848](https://github.com/ClickHouse/ClickHouse/pull/49848) ([Robert Schulze](https://github.com/rschu1ze)). +* Enable few slow clang-tidy checks for clangd [#49855](https://github.com/ClickHouse/ClickHouse/pull/49855) ([Azat Khuzhin](https://github.com/azat)). +* Update QPL docs [#49857](https://github.com/ClickHouse/ClickHouse/pull/49857) ([Robert Schulze](https://github.com/rschu1ze)). +* Small-ish .clang-tidy update [#49859](https://github.com/ClickHouse/ClickHouse/pull/49859) ([Robert Schulze](https://github.com/rschu1ze)). +* Follow-up for clang-tidy [#49861](https://github.com/ClickHouse/ClickHouse/pull/49861) ([Robert Schulze](https://github.com/rschu1ze)). +* Fix "reference to local binding" after fixes for clang-17 [#49874](https://github.com/ClickHouse/ClickHouse/pull/49874) ([Azat Khuzhin](https://github.com/azat)). +* fix typo [#49876](https://github.com/ClickHouse/ClickHouse/pull/49876) ([JackyWoo](https://github.com/JackyWoo)). +* Log with warning if the server was terminated forcefully [#49881](https://github.com/ClickHouse/ClickHouse/pull/49881) ([Azat Khuzhin](https://github.com/azat)). +* Fix some tests [#49889](https://github.com/ClickHouse/ClickHouse/pull/49889) ([Alexander Tokmakov](https://github.com/tavplubix)). +* use chassert in MergeTreeDeduplicationLog to have better log info [#49891](https://github.com/ClickHouse/ClickHouse/pull/49891) ([Han Fei](https://github.com/hanfei1991)). +* Multiple pools support for AsyncLoader [#49893](https://github.com/ClickHouse/ClickHouse/pull/49893) ([Sergei Trifonov](https://github.com/serxa)). +* Fix stack-use-after-scope in resource manager test [#49908](https://github.com/ClickHouse/ClickHouse/pull/49908) ([Sergei Trifonov](https://github.com/serxa)). +* Retry connection expired in test_rename_column/test.py [#49911](https://github.com/ClickHouse/ClickHouse/pull/49911) ([alesapin](https://github.com/alesapin)). +* Try to fix flaky test_distributed_load_balancing tests [#49912](https://github.com/ClickHouse/ClickHouse/pull/49912) ([Kruglov Pavel](https://github.com/Avogar)). +* Remove unused code [#49918](https://github.com/ClickHouse/ClickHouse/pull/49918) ([alesapin](https://github.com/alesapin)). +* Fix flakiness of test_distributed_load_balancing test [#49921](https://github.com/ClickHouse/ClickHouse/pull/49921) ([Azat Khuzhin](https://github.com/azat)). +* Add some logging [#49925](https://github.com/ClickHouse/ClickHouse/pull/49925) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Support hardlinking parts transactionally [#49931](https://github.com/ClickHouse/ClickHouse/pull/49931) ([Michael Kolupaev](https://github.com/al13n321)). +* Fix for analyzer: 02377_ optimize_sorting_by_input_stream_properties_e… [#49943](https://github.com/ClickHouse/ClickHouse/pull/49943) ([Igor Nikonov](https://github.com/devcrafter)). +* Follow up to [#49429](https://github.com/ClickHouse/ClickHouse/issues/49429) [#49964](https://github.com/ClickHouse/ClickHouse/pull/49964) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix flaky test_ssl_cert_authentication to use urllib3 [#49982](https://github.com/ClickHouse/ClickHouse/pull/49982) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). +* Fix woboq codebrowser build with -Wno-poison-system-directories [#49992](https://github.com/ClickHouse/ClickHouse/pull/49992) ([Azat Khuzhin](https://github.com/azat)). +* test for [#46128](https://github.com/ClickHouse/ClickHouse/issues/46128) [#49993](https://github.com/ClickHouse/ClickHouse/pull/49993) ([Denny Crane](https://github.com/den-crane)). +* Fix test_insert_same_partition_and_merge failing if one Azure request attempt fails [#49996](https://github.com/ClickHouse/ClickHouse/pull/49996) ([Michael Kolupaev](https://github.com/al13n321)). +* Check return value of `ftruncate` in Keeper [#50020](https://github.com/ClickHouse/ClickHouse/pull/50020) ([Antonio Andelic](https://github.com/antonio2368)). +* Add some assertions [#50025](https://github.com/ClickHouse/ClickHouse/pull/50025) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Update 02441_alter_delete_and_drop_column.sql [#50027](https://github.com/ClickHouse/ClickHouse/pull/50027) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Move some common code to common [#50028](https://github.com/ClickHouse/ClickHouse/pull/50028) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Add method getCredentials() to S3::Client [#50030](https://github.com/ClickHouse/ClickHouse/pull/50030) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Update query_log.md [#50032](https://github.com/ClickHouse/ClickHouse/pull/50032) ([Sergei Trifonov](https://github.com/serxa)). +* Get rid of indirect write buffer in object storages [#50033](https://github.com/ClickHouse/ClickHouse/pull/50033) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Load balancing bugfixes [#50036](https://github.com/ClickHouse/ClickHouse/pull/50036) ([Sergei Trifonov](https://github.com/serxa)). +* Update S3 sdk to v1.11.61 [#50037](https://github.com/ClickHouse/ClickHouse/pull/50037) ([Nikita Taranov](https://github.com/nickitat)). +* Fix 02735_system_zookeeper_connection for DatabaseReplicated [#50047](https://github.com/ClickHouse/ClickHouse/pull/50047) ([Azat Khuzhin](https://github.com/azat)). +* Add more profile events for distributed connections [#50051](https://github.com/ClickHouse/ClickHouse/pull/50051) ([Sergei Trifonov](https://github.com/serxa)). +* FileCache: simple tryReserve() cleanup [#50059](https://github.com/ClickHouse/ClickHouse/pull/50059) ([Igor Nikonov](https://github.com/devcrafter)). +* Fix hashed/sparse_hashed dictionaries max_load_factor upper range [#50065](https://github.com/ClickHouse/ClickHouse/pull/50065) ([Azat Khuzhin](https://github.com/azat)). +* Clearer coordinator log [#50101](https://github.com/ClickHouse/ClickHouse/pull/50101) ([Raúl Marín](https://github.com/Algunenano)). +* Analyzer: Do not execute table functions multiple times [#50105](https://github.com/ClickHouse/ClickHouse/pull/50105) ([Dmitry Novik](https://github.com/novikd)). +* Update default settings for Replicated database [#50108](https://github.com/ClickHouse/ClickHouse/pull/50108) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Make async prefetched buffer work with arbitrary impl [#50109](https://github.com/ClickHouse/ClickHouse/pull/50109) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Update github.com/distribution/distribution [#50114](https://github.com/ClickHouse/ClickHouse/pull/50114) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Docs: Update clickhouse-local arguments [#50138](https://github.com/ClickHouse/ClickHouse/pull/50138) ([Robert Schulze](https://github.com/rschu1ze)). +* Change fields destruction order in AsyncTaskExecutor [#50151](https://github.com/ClickHouse/ClickHouse/pull/50151) ([Kruglov Pavel](https://github.com/Avogar)). +* Follow-up to [#49889](https://github.com/ClickHouse/ClickHouse/issues/49889) [#50152](https://github.com/ClickHouse/ClickHouse/pull/50152) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Clarification comment on retries controller behavior [#50155](https://github.com/ClickHouse/ClickHouse/pull/50155) ([Igor Nikonov](https://github.com/devcrafter)). +* Switch to upstream repository of vectorscan [#50159](https://github.com/ClickHouse/ClickHouse/pull/50159) ([Azat Khuzhin](https://github.com/azat)). +* Refactor lambdas, prepare to prio runners [#50160](https://github.com/ClickHouse/ClickHouse/pull/50160) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Speed-up the shellcheck with parallel xargs [#50164](https://github.com/ClickHouse/ClickHouse/pull/50164) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Update an exception message [#50180](https://github.com/ClickHouse/ClickHouse/pull/50180) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Upgrade boost submodule [#50188](https://github.com/ClickHouse/ClickHouse/pull/50188) ([ltrk2](https://github.com/ltrk2)). +* Implement a uniform way to query processor core IDs [#50190](https://github.com/ClickHouse/ClickHouse/pull/50190) ([ltrk2](https://github.com/ltrk2)). +* Don't replicate delete through DDL worker if there is just 1 shard [#50193](https://github.com/ClickHouse/ClickHouse/pull/50193) ([Alexander Gololobov](https://github.com/davenger)). +* Fix codebrowser by using clang-15 image [#50197](https://github.com/ClickHouse/ClickHouse/pull/50197) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Add comments to build reports [#50200](https://github.com/ClickHouse/ClickHouse/pull/50200) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Automatic backports of important fixes to cloud-release [#50202](https://github.com/ClickHouse/ClickHouse/pull/50202) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Unify priorities: lower value means higher priority [#50205](https://github.com/ClickHouse/ClickHouse/pull/50205) ([Sergei Trifonov](https://github.com/serxa)). +* Use transactions for encrypted disks [#50206](https://github.com/ClickHouse/ClickHouse/pull/50206) ([alesapin](https://github.com/alesapin)). +* Get detailed error instead of unknown error for function test [#50207](https://github.com/ClickHouse/ClickHouse/pull/50207) ([Suzy Wang](https://github.com/SuzyWangIBMer)). +* README.md: Remove Berlin Meetup from upcoming events [#50218](https://github.com/ClickHouse/ClickHouse/pull/50218) ([Robert Schulze](https://github.com/rschu1ze)). +* Minor adjustment of clickhouse-client/local parameter docs [#50219](https://github.com/ClickHouse/ClickHouse/pull/50219) ([Robert Schulze](https://github.com/rschu1ze)). +* Unify priorities: rework IO scheduling subsystem [#50231](https://github.com/ClickHouse/ClickHouse/pull/50231) ([Sergei Trifonov](https://github.com/serxa)). +* Add new metrics BrokenDistributedBytesToInsert/DistributedBytesToInsert [#50238](https://github.com/ClickHouse/ClickHouse/pull/50238) ([Azat Khuzhin](https://github.com/azat)). +* Fix URL in backport comment [#50241](https://github.com/ClickHouse/ClickHouse/pull/50241) ([pufit](https://github.com/pufit)). +* Fix `02535_max_parallel_replicas_custom_key` [#50242](https://github.com/ClickHouse/ClickHouse/pull/50242) ([Antonio Andelic](https://github.com/antonio2368)). +* Fixes for MergeTree with readonly disks [#50244](https://github.com/ClickHouse/ClickHouse/pull/50244) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Yet another refactoring [#50257](https://github.com/ClickHouse/ClickHouse/pull/50257) ([Anton Popov](https://github.com/CurtizJ)). +* Unify priorities: rework AsyncLoader [#50272](https://github.com/ClickHouse/ClickHouse/pull/50272) ([Sergei Trifonov](https://github.com/serxa)). +* buffers d-tor finalize free [#50275](https://github.com/ClickHouse/ClickHouse/pull/50275) ([Sema Checherinda](https://github.com/CheSema)). +* Fix 02767_into_outfile_extensions_msan under analyzer [#50290](https://github.com/ClickHouse/ClickHouse/pull/50290) ([Azat Khuzhin](https://github.com/azat)). +* QPL: Add a comment about isal [#50308](https://github.com/ClickHouse/ClickHouse/pull/50308) ([Robert Schulze](https://github.com/rschu1ze)). +* Avoid clang 15 crash [#50310](https://github.com/ClickHouse/ClickHouse/pull/50310) ([Raúl Marín](https://github.com/Algunenano)). +* Cleanup Annoy index [#50312](https://github.com/ClickHouse/ClickHouse/pull/50312) ([Robert Schulze](https://github.com/rschu1ze)). +* Fix flaky `AsyncLoader.StaticPriorities` unit test [#50313](https://github.com/ClickHouse/ClickHouse/pull/50313) ([Sergei Trifonov](https://github.com/serxa)). +* Update gtest_async_loader.cpp [#50317](https://github.com/ClickHouse/ClickHouse/pull/50317) ([Nikita Taranov](https://github.com/nickitat)). +* Fix IS (NOT) NULL operator priority [#50327](https://github.com/ClickHouse/ClickHouse/pull/50327) ([Nikolay Degterinsky](https://github.com/evillique)). +* Update README.md [#50340](https://github.com/ClickHouse/ClickHouse/pull/50340) ([Tyler Hannan](https://github.com/tylerhannan)). +* do not fix the event list in test [#50342](https://github.com/ClickHouse/ClickHouse/pull/50342) ([Sema Checherinda](https://github.com/CheSema)). +* less logs in WriteBufferFromS3 [#50347](https://github.com/ClickHouse/ClickHouse/pull/50347) ([Sema Checherinda](https://github.com/CheSema)). +* Remove legacy install scripts superseded by universal.sh [#50360](https://github.com/ClickHouse/ClickHouse/pull/50360) ([Robert Schulze](https://github.com/rschu1ze)). +* Fail perf tests when too many queries slowed down [#50361](https://github.com/ClickHouse/ClickHouse/pull/50361) ([Nikita Taranov](https://github.com/nickitat)). +* Fix after [#50109](https://github.com/ClickHouse/ClickHouse/issues/50109) [#50362](https://github.com/ClickHouse/ClickHouse/pull/50362) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix log message [#50363](https://github.com/ClickHouse/ClickHouse/pull/50363) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Compare functions NaN update test [#50366](https://github.com/ClickHouse/ClickHouse/pull/50366) ([Maksim Kita](https://github.com/kitaisreal)). +* Add re-creation for cherry-pick PRs [#50373](https://github.com/ClickHouse/ClickHouse/pull/50373) ([pufit](https://github.com/pufit)). +* Without applying `prepareRightBlock` will cause mismatch block structrue [#50383](https://github.com/ClickHouse/ClickHouse/pull/50383) ([lgbo](https://github.com/lgbo-ustc)). +* fix hung in unit tests [#50391](https://github.com/ClickHouse/ClickHouse/pull/50391) ([Sema Checherinda](https://github.com/CheSema)). +* Fix poll timeout in MaterializedMySQL [#50392](https://github.com/ClickHouse/ClickHouse/pull/50392) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Compile aggregate expressions enable by default [#50401](https://github.com/ClickHouse/ClickHouse/pull/50401) ([Maksim Kita](https://github.com/kitaisreal)). +* Update app.py [#50407](https://github.com/ClickHouse/ClickHouse/pull/50407) ([Nikita Taranov](https://github.com/nickitat)). +* reuse s3_mocks, rewrite test test_paranoid_check_in_logs [#50408](https://github.com/ClickHouse/ClickHouse/pull/50408) ([Sema Checherinda](https://github.com/CheSema)). +* test for [#42610](https://github.com/ClickHouse/ClickHouse/issues/42610) [#50409](https://github.com/ClickHouse/ClickHouse/pull/50409) ([Denny Crane](https://github.com/den-crane)). +* Remove something [#50411](https://github.com/ClickHouse/ClickHouse/pull/50411) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Mark the builds without results as pending [#50415](https://github.com/ClickHouse/ClickHouse/pull/50415) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Revert "Fix msan issue in keyed siphash" [#50426](https://github.com/ClickHouse/ClickHouse/pull/50426) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Revert "Revert "less logs in WriteBufferFromS3" ([#50390](https://github.com/ClickHouse/ClickHouse/issues/50390))" [#50444](https://github.com/ClickHouse/ClickHouse/pull/50444) ([Sema Checherinda](https://github.com/CheSema)). +* Paranoid fix for removing parts from ZooKeeper [#50448](https://github.com/ClickHouse/ClickHouse/pull/50448) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Add timeout for unit tests [#50449](https://github.com/ClickHouse/ClickHouse/pull/50449) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Changes related to an internal feature [#50453](https://github.com/ClickHouse/ClickHouse/pull/50453) ([Michael Kolupaev](https://github.com/al13n321)). +* Don't crash if config doesn't have logger section [#50455](https://github.com/ClickHouse/ClickHouse/pull/50455) ([Michael Kolupaev](https://github.com/al13n321)). +* Update function docs [#50466](https://github.com/ClickHouse/ClickHouse/pull/50466) ([Robert Schulze](https://github.com/rschu1ze)). +* Revert "make filter push down through cross join" [#50467](https://github.com/ClickHouse/ClickHouse/pull/50467) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Add some assertions [#50470](https://github.com/ClickHouse/ClickHouse/pull/50470) ([Kseniia Sumarokova](https://github.com/kssenii)). +* CI: Enable aspell on nested docs [#50476](https://github.com/ClickHouse/ClickHouse/pull/50476) ([Robert Schulze](https://github.com/rschu1ze)). +* Try fix flaky test test_async_query_sending [#50480](https://github.com/ClickHouse/ClickHouse/pull/50480) ([Kruglov Pavel](https://github.com/Avogar)). +* Disable 00534_functions_bad_arguments with msan [#50481](https://github.com/ClickHouse/ClickHouse/pull/50481) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Typos: Follow-up to [#50476](https://github.com/ClickHouse/ClickHouse/issues/50476) [#50482](https://github.com/ClickHouse/ClickHouse/pull/50482) ([Robert Schulze](https://github.com/rschu1ze)). +* Remove unneeded Keeper test [#50485](https://github.com/ClickHouse/ClickHouse/pull/50485) ([Antonio Andelic](https://github.com/antonio2368)). +* Fix KeyError in cherry-pick [#50493](https://github.com/ClickHouse/ClickHouse/pull/50493) ([pufit](https://github.com/pufit)). +* Make typeid_cast for pointers noexcept [#50495](https://github.com/ClickHouse/ClickHouse/pull/50495) ([Sergey Kazmin ](https://github.com/yerseg)). +* less traces in logs [#50518](https://github.com/ClickHouse/ClickHouse/pull/50518) ([Sema Checherinda](https://github.com/CheSema)). +* Implement endianness-independent serialization for UUID [#50519](https://github.com/ClickHouse/ClickHouse/pull/50519) ([ltrk2](https://github.com/ltrk2)). +* Remove strange object storage methods [#50521](https://github.com/ClickHouse/ClickHouse/pull/50521) ([alesapin](https://github.com/alesapin)). +* Fix low quality code around metadata in RocksDB (experimental feature never used in production) [#50527](https://github.com/ClickHouse/ClickHouse/pull/50527) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Function if constant folding [#50529](https://github.com/ClickHouse/ClickHouse/pull/50529) ([Maksim Kita](https://github.com/kitaisreal)). +* Add profile events for fs cache eviction [#50533](https://github.com/ClickHouse/ClickHouse/pull/50533) ([Kseniia Sumarokova](https://github.com/kssenii)). +* QueryNode small fix [#50535](https://github.com/ClickHouse/ClickHouse/pull/50535) ([Maksim Kita](https://github.com/kitaisreal)). +* Control memory usage in generateRandom [#50538](https://github.com/ClickHouse/ClickHouse/pull/50538) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Disable skim (Rust library) under memory sanitizer [#50539](https://github.com/ClickHouse/ClickHouse/pull/50539) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* MSan support for Rust [#50541](https://github.com/ClickHouse/ClickHouse/pull/50541) ([Azat Khuzhin](https://github.com/azat)). +* Make 01565_query_loop_after_client_error slightly more robust [#50542](https://github.com/ClickHouse/ClickHouse/pull/50542) ([Azat Khuzhin](https://github.com/azat)). +* Resize BufferFromVector underlying vector only pos_offset == vector.size() [#50546](https://github.com/ClickHouse/ClickHouse/pull/50546) ([auxten](https://github.com/auxten)). +* Add async iteration to object storage [#50548](https://github.com/ClickHouse/ClickHouse/pull/50548) ([alesapin](https://github.com/alesapin)). +* skip extracting darwin toolchain in builder when unncessary [#50550](https://github.com/ClickHouse/ClickHouse/pull/50550) ([SuperDJY](https://github.com/cmsxbc)). +* Remove flaky test [#50558](https://github.com/ClickHouse/ClickHouse/pull/50558) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Revert "Disable skim (Rust library) under memory sanitizer" [#50574](https://github.com/ClickHouse/ClickHouse/pull/50574) ([Azat Khuzhin](https://github.com/azat)). +* Analyzer: fix 01487_distributed_in_not_default_db [#50587](https://github.com/ClickHouse/ClickHouse/pull/50587) ([Dmitry Novik](https://github.com/novikd)). +* Fix commit for DiskObjectStorage [#50599](https://github.com/ClickHouse/ClickHouse/pull/50599) ([alesapin](https://github.com/alesapin)). +* Fix Jepsen runs in PRs [#50615](https://github.com/ClickHouse/ClickHouse/pull/50615) ([Antonio Andelic](https://github.com/antonio2368)). +* Revert incorrect optimizations [#50629](https://github.com/ClickHouse/ClickHouse/pull/50629) ([Raúl Marín](https://github.com/Algunenano)). +* Disable 01676_clickhouse_client_autocomplete under UBSan [#50636](https://github.com/ClickHouse/ClickHouse/pull/50636) ([Nikita Taranov](https://github.com/nickitat)). +* Merging [#50329](https://github.com/ClickHouse/ClickHouse/issues/50329) [#50660](https://github.com/ClickHouse/ClickHouse/pull/50660) ([Anton Popov](https://github.com/CurtizJ)). +* Revert "date_trunc function to always return DateTime type" [#50670](https://github.com/ClickHouse/ClickHouse/pull/50670) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). +* Fix flaky test 02461_prewhere_row_level_policy_lightweight_delete [#50674](https://github.com/ClickHouse/ClickHouse/pull/50674) ([Alexander Gololobov](https://github.com/davenger)). +* Fix asan issue with analyzer and prewhere [#50685](https://github.com/ClickHouse/ClickHouse/pull/50685) ([Alexander Gololobov](https://github.com/davenger)). +* Catch issues with dockerd during the build [#50700](https://github.com/ClickHouse/ClickHouse/pull/50700) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Temporarily disable annoy index tests (flaky for analyzer) [#50714](https://github.com/ClickHouse/ClickHouse/pull/50714) ([Robert Schulze](https://github.com/rschu1ze)). +* Fix assertion from stress test [#50718](https://github.com/ClickHouse/ClickHouse/pull/50718) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix flaky unit test [#50719](https://github.com/ClickHouse/ClickHouse/pull/50719) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Show correct sharing state in system.query_cache [#50728](https://github.com/ClickHouse/ClickHouse/pull/50728) ([Robert Schulze](https://github.com/rschu1ze)). + diff --git a/utils/list-versions/version_date.tsv b/utils/list-versions/version_date.tsv index 653a0cd5388..9704c68be54 100644 --- a/utils/list-versions/version_date.tsv +++ b/utils/list-versions/version_date.tsv @@ -1,3 +1,4 @@ +v23.5.1.3174-stable 2023-06-09 v23.4.2.11-stable 2023-05-02 v23.4.1.1943-stable 2023-04-27 v23.3.2.37-lts 2023-04-22 From b740a08b6e508ccee08efc7e2ca83e8d7192e3ef Mon Sep 17 00:00:00 2001 From: Victor Krasnov Date: Fri, 9 Jun 2023 15:05:23 +0300 Subject: [PATCH 0703/1072] Fix the docs --- docs/en/sql-reference/functions/type-conversion-functions.md | 2 +- docs/ru/sql-reference/functions/type-conversion-functions.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index dad3cfb4cc5..8e186844c93 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -1343,7 +1343,7 @@ parseDateTimeBestEffort(time_string [, time_zone]) - A string with a date, but no time component: `YYYY`, `YYYYMM`, `YYYY*MM`, `DD/MM/YYYY`, `DD-MM-YY` etc. - A string with a day and time: `DD`, `DD hh`, `DD hh:mm`. In this case `YYYY-MM` are substituted as `2000-01`. - A string that includes the date and time along with time zone offset information: `YYYY-MM-DD hh:mm:ss ±h:mm`, etc. For example, `2020-12-12 17:36:00 -5:00`. -- A string that includes the date and time in a [syslog](https://datatracker.ietf.org/doc/html/rfc3164) datetime format. For example, `Jun 9 14:20:32`. +- A string that includes the date and time in the [syslog timestamp](https://datatracker.ietf.org/doc/html/rfc3164) format. For example, `Jun 9 14:20:32`. For all of the formats with separator the function parses months names expressed by their full name or by the first three letters of a month name. Examples: `24/DEC/18`, `24-Dec-18`, `01-September-2018`. diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index 03e3adfbdca..93ca6b410c8 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -1022,7 +1022,7 @@ parseDateTimeBestEffort(time_string[, time_zone]) - Строка с датой, но без времени: `YYYY`, `YYYYMM`, `YYYY*MM`, `DD/MM/YYYY`, `DD-MM-YY` и т.д. - Строка с временем, и с днём: `DD`, `DD hh`, `DD hh:mm`. В этом случае `YYYY-MM` принимается равным `2000-01`. - Строка, содержащая дату и время вместе с информацией о часовом поясе: `YYYY-MM-DD hh:mm:ss ±h:mm`, и т.д. Например, `2020-12-12 17:36:00 -5:00`. -- Строка, содержащая дату и время в формате [syslog](https://datatracker.ietf.org/doc/html/rfc3164). Например, `Jun 9 14:20:32`. +- Строка, содержащая дату и время в формате [syslog timestamp](https://datatracker.ietf.org/doc/html/rfc3164). Например, `Jun 9 14:20:32`. Для всех форматов с разделителями функция распознаёт названия месяцев, выраженных в виде полного англоязычного имени месяца или в виде первых трёх символов имени месяца. Примеры: `24/DEC/18`, `24-Dec-18`, `01-September-2018`. From d0c2c1dbad9da978246fe6c9105a62a972041cd8 Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Fri, 9 Jun 2023 12:06:43 +0000 Subject: [PATCH 0704/1072] Add test and reorder if's --- src/Functions/FunctionsConversion.h | 49 ++++++++++--------- .../01556_accurate_cast_or_null.reference | 1 + .../01556_accurate_cast_or_null.sql | 1 + 3 files changed, 27 insertions(+), 24 deletions(-) diff --git a/src/Functions/FunctionsConversion.h b/src/Functions/FunctionsConversion.h index ea8efada21d..e44a3bdaa2e 100644 --- a/src/Functions/FunctionsConversion.h +++ b/src/Functions/FunctionsConversion.h @@ -2866,36 +2866,37 @@ private: using LeftDataType = typename Types::LeftType; using RightDataType = typename Types::RightType; - if constexpr (IsDataTypeNumber && IsDataTypeNumber) + if constexpr (IsDataTypeNumber) { - if (wrapper_cast_type == CastType::accurate) + if constexpr (IsDataTypeNumber) { - result_column = ConvertImpl::execute( - arguments, result_type, input_rows_count, AccurateConvertStrategyAdditions()); - } - else - { - result_column = ConvertImpl::execute( - arguments, result_type, input_rows_count, AccurateOrNullConvertStrategyAdditions()); + if (wrapper_cast_type == CastType::accurate) + { + result_column = ConvertImpl::execute( + arguments, result_type, input_rows_count, AccurateConvertStrategyAdditions()); + } + else + { + result_column = ConvertImpl::execute( + arguments, result_type, input_rows_count, AccurateOrNullConvertStrategyAdditions()); + } + return true; } - return true; - } - - if constexpr (IsDataTypeNumber - && (std::is_same_v || std::is_same_v)) - { - if (wrapper_cast_type == CastType::accurate) + if constexpr (std::is_same_v || std::is_same_v) { - result_column = ConvertImpl::template execute( - arguments, result_type, input_rows_count); + if (wrapper_cast_type == CastType::accurate) + { + result_column = ConvertImpl::template execute( + arguments, result_type, input_rows_count); + } + else + { + result_column = ConvertImpl::template execute( + arguments, result_type, input_rows_count); + } + return true; } - else - { - result_column = ConvertImpl::template execute( - arguments, result_type, input_rows_count); - } - return true; } return false; diff --git a/tests/queries/0_stateless/01556_accurate_cast_or_null.reference b/tests/queries/0_stateless/01556_accurate_cast_or_null.reference index 3bff125068a..31a9c37421e 100644 --- a/tests/queries/0_stateless/01556_accurate_cast_or_null.reference +++ b/tests/queries/0_stateless/01556_accurate_cast_or_null.reference @@ -40,4 +40,5 @@ \N \N 2023-05-30 +2149-06-06 1970-01-20 diff --git a/tests/queries/0_stateless/01556_accurate_cast_or_null.sql b/tests/queries/0_stateless/01556_accurate_cast_or_null.sql index 3266198d930..f00f6ef837f 100644 --- a/tests/queries/0_stateless/01556_accurate_cast_or_null.sql +++ b/tests/queries/0_stateless/01556_accurate_cast_or_null.sql @@ -47,4 +47,5 @@ SELECT accurateCastOrNull(-1, 'Date'); SELECT accurateCastOrNull(5000000000, 'Date'); SELECT accurateCastOrNull('1xxx', 'Date'); SELECT accurateCastOrNull('2023-05-30', 'Date'); +SELECT accurateCastOrNull('2180-01-01', 'Date'); SELECT accurateCastOrNull(19, 'Date'); From a21bd4ec62825172d859424be0687e127d36b4c0 Mon Sep 17 00:00:00 2001 From: Victor Krasnov Date: Fri, 9 Jun 2023 15:33:51 +0300 Subject: [PATCH 0705/1072] Elucidate the syslog case in the documentation --- docs/en/sql-reference/functions/type-conversion-functions.md | 3 ++- docs/ru/sql-reference/functions/type-conversion-functions.md | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 8e186844c93..e62cf89a6b2 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -1341,11 +1341,12 @@ parseDateTimeBestEffort(time_string [, time_zone]) - A string containing 9..10 digit [unix timestamp](https://en.wikipedia.org/wiki/Unix_time). - A string with a date and a time component: `YYYYMMDDhhmmss`, `DD/MM/YYYY hh:mm:ss`, `DD-MM-YY hh:mm`, `YYYY-MM-DD hh:mm:ss`, etc. - A string with a date, but no time component: `YYYY`, `YYYYMM`, `YYYY*MM`, `DD/MM/YYYY`, `DD-MM-YY` etc. -- A string with a day and time: `DD`, `DD hh`, `DD hh:mm`. In this case `YYYY-MM` are substituted as `2000-01`. +- A string with a day and time: `DD`, `DD hh`, `DD hh:mm`. In this case `MM` is substituted by `01`. - A string that includes the date and time along with time zone offset information: `YYYY-MM-DD hh:mm:ss ±h:mm`, etc. For example, `2020-12-12 17:36:00 -5:00`. - A string that includes the date and time in the [syslog timestamp](https://datatracker.ietf.org/doc/html/rfc3164) format. For example, `Jun 9 14:20:32`. For all of the formats with separator the function parses months names expressed by their full name or by the first three letters of a month name. Examples: `24/DEC/18`, `24-Dec-18`, `01-September-2018`. +If the year is not specified, it is considered to be equal to the current year. If the resulting date and time happens to be ahead of the current moment even by a second, the current year is substituted by the previous one. **Returned value** diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index 93ca6b410c8..6de55757b64 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -1020,11 +1020,12 @@ parseDateTimeBestEffort(time_string[, time_zone]) - [Unix timestamp](https://ru.wikipedia.org/wiki/Unix-время) в строковом представлении. 9 или 10 символов. - Строка с датой и временем: `YYYYMMDDhhmmss`, `DD/MM/YYYY hh:mm:ss`, `DD-MM-YY hh:mm`, `YYYY-MM-DD hh:mm:ss`, etc. - Строка с датой, но без времени: `YYYY`, `YYYYMM`, `YYYY*MM`, `DD/MM/YYYY`, `DD-MM-YY` и т.д. -- Строка с временем, и с днём: `DD`, `DD hh`, `DD hh:mm`. В этом случае `YYYY-MM` принимается равным `2000-01`. +- Строка с временем, и с днём: `DD`, `DD hh`, `DD hh:mm`. В этом случае `MM` принимается равным `01`. - Строка, содержащая дату и время вместе с информацией о часовом поясе: `YYYY-MM-DD hh:mm:ss ±h:mm`, и т.д. Например, `2020-12-12 17:36:00 -5:00`. - Строка, содержащая дату и время в формате [syslog timestamp](https://datatracker.ietf.org/doc/html/rfc3164). Например, `Jun 9 14:20:32`. Для всех форматов с разделителями функция распознаёт названия месяцев, выраженных в виде полного англоязычного имени месяца или в виде первых трёх символов имени месяца. Примеры: `24/DEC/18`, `24-Dec-18`, `01-September-2018`. +Если год не указан, вместо него подставляется текущий год. Если в результате получается будущее время, хотя бы на одну секунду впереди, текущий год заменяется на прошлый. **Возвращаемое значение** From 4009b5fef1db916eb6baa265f305ef412cffc8e3 Mon Sep 17 00:00:00 2001 From: Ilya Yatsishin <2159081+qoega@users.noreply.github.com> Date: Fri, 9 Jun 2023 14:34:05 +0200 Subject: [PATCH 0706/1072] Update CHANGELOG.md --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a5b6dd85941..09c832f0c04 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,7 +12,7 @@ #### Upgrade Notes * Compress marks and primary key by default. It significantly reduces the cold query time. Upgrade notes: the support for compressed marks and primary key has been added in version 22.9. If you turned on compressed marks or primary key or installed version 23.5 or newer, which has compressed marks or primary key on by default, you will not be able to downgrade to version 22.8 or earlier. You can also explicitly disable compressed marks or primary keys by specifying the `compress_marks` and `compress_primary_key` settings in the `` section of the server configuration file. **Upgrade notes:** If you upgrade from versions prior to 22.9, you should either upgrade all replicas at once or disable the compression before upgrade, or upgrade through an intermediate version, where the compressed marks are supported but not enabled by default, such as 23.3. [#42587](https://github.com/ClickHouse/ClickHouse/pull/42587) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Make local object storage work consistently with s3 object storage, fix problem with append (closes [#48465](https://github.com/ClickHouse/ClickHouse/issues/48465)), make it configurable as independent storage. The change is backward incompatible because the cache on top of local object storage is not incompatible to previous versions. [#48791](https://github.com/ClickHouse/ClickHouse/pull/48791) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Make local object storage work consistently with s3 object storage, fix problem with append (closes [#48465](https://github.com/ClickHouse/ClickHouse/issues/48465)), make it configurable as independent storage. The change is backward incompatible because the cache on top of local object storage is not compatible to previous versions. [#48791](https://github.com/ClickHouse/ClickHouse/pull/48791) ([Kseniia Sumarokova](https://github.com/kssenii)). * The experimental feature "in-memory data parts" is removed. The data format is still supported, but the settings are no-op, and compact or wide parts will be used instead. This closes [#45409](https://github.com/ClickHouse/ClickHouse/issues/45409). [#49429](https://github.com/ClickHouse/ClickHouse/pull/49429) ([Alexey Milovidov](https://github.com/alexey-milovidov)). * Changed default values of settings `parallelize_output_from_storages` and `input_format_parquet_preserve_order`. This allows ClickHouse to reorder rows when reading from files (e.g. CSV or Parquet), greatly improving performance in many cases. To restore the old behavior of preserving order, use `parallelize_output_from_storages = 0`, `input_format_parquet_preserve_order = 1`. [#49479](https://github.com/ClickHouse/ClickHouse/pull/49479) ([Michael Kolupaev](https://github.com/al13n321)). * Make projections production-ready. Add the `optimize_use_projections` setting to control whether the projections will be selected for SELECT queries. The setting `allow_experimental_projection_optimization` is obsolete and does nothing. [#49719](https://github.com/ClickHouse/ClickHouse/pull/49719) ([Alexey Milovidov](https://github.com/alexey-milovidov)). From 056ca4f555fbbf4463de5be8642a2c01b6759192 Mon Sep 17 00:00:00 2001 From: jinjunzh Date: Wed, 24 May 2023 13:26:15 -0400 Subject: [PATCH 0707/1072] Add extensive testing cases for deflate qpl codec --- .../sql-reference/statements/create/table.md | 2 +- src/Client/Connection.cpp | 2 +- src/Compression/CompressionCodecDeflateQpl.h | 3 +- src/Compression/CompressionFactory.h | 4 +- .../CompressionFactoryAdditions.cpp | 14 ++-- src/Compression/ICompressionCodec.h | 3 + src/Core/Settings.h | 1 + src/Interpreters/InterpreterCreateQuery.cpp | 3 +- src/Server/TCPHandler.cpp | 2 +- src/Storages/AlterCommands.cpp | 8 +-- src/Storages/ColumnsDescription.cpp | 2 +- src/Storages/Distributed/DistributedSink.cpp | 2 +- src/Storages/TTLDescription.cpp | 2 +- .../deflateqpl_compression_by_default.xml | 11 ++++ .../configs/enable_deflateqpl_codec.xml | 7 ++ .../test_non_default_compression/test.py | 65 ++++++++++++++++++- ...04_test_alter_compression_codecs.reference | 31 ++++++--- .../00804_test_alter_compression_codecs.sql | 28 +++++--- 18 files changed, 153 insertions(+), 37 deletions(-) create mode 100644 tests/integration/test_non_default_compression/configs/deflateqpl_compression_by_default.xml create mode 100644 tests/integration/test_non_default_compression/configs/enable_deflateqpl_codec.xml diff --git a/docs/en/sql-reference/statements/create/table.md b/docs/en/sql-reference/statements/create/table.md index de44a001472..b0865ad2896 100644 --- a/docs/en/sql-reference/statements/create/table.md +++ b/docs/en/sql-reference/statements/create/table.md @@ -380,7 +380,7 @@ High compression levels are useful for asymmetric scenarios, like compress once, `DEFLATE_QPL` — [Deflate compression algorithm](https://github.com/intel/qpl) implemented by Intel® Query Processing Library. Some limitations apply: -- DEFLATE_QPL is experimental and can only be used after setting configuration parameter `allow_experimental_codecs=1`. +- DEFLATE_QPL is disabled by default and can only be used after setting configuration parameter `enable_qpl_deflate=1`. - DEFLATE_QPL requires a ClickHouse build compiled with SSE 4.2 instructions (by default, this is the case). Refer to [Build Clickhouse with DEFLATE_QPL](/docs/en/development/building_and_benchmarking_deflate_qpl.md/#Build-Clickhouse-with-DEFLATE_QPL) for more details. - DEFLATE_QPL works best if the system has a Intel® IAA (In-Memory Analytics Accelerator) offloading device. Refer to [Accelerator Configuration](https://intel.github.io/qpl/documentation/get_started_docs/installation.html#accelerator-configuration) and [Benchmark with DEFLATE_QPL](/docs/en/development/building_and_benchmarking_deflate_qpl.md/#Run-Benchmark-with-DEFLATE_QPL) for more details. - DEFLATE_QPL-compressed data can only be transferred between ClickHouse nodes compiled with SSE 4.2 enabled. diff --git a/src/Client/Connection.cpp b/src/Client/Connection.cpp index 2350a5039ab..68bc3b39a56 100644 --- a/src/Client/Connection.cpp +++ b/src/Client/Connection.cpp @@ -588,7 +588,7 @@ void Connection::sendQuery( if (method == "ZSTD") level = settings->network_zstd_compression_level; - CompressionCodecFactory::instance().validateCodec(method, level, !settings->allow_suspicious_codecs, settings->allow_experimental_codecs); + CompressionCodecFactory::instance().validateCodec(method, level, !settings->allow_suspicious_codecs, settings->allow_experimental_codecs, settings->enable_qpl_deflate); compression_codec = CompressionCodecFactory::instance().get(method, level); } else diff --git a/src/Compression/CompressionCodecDeflateQpl.h b/src/Compression/CompressionCodecDeflateQpl.h index 7a1a764295d..13aa8733b54 100644 --- a/src/Compression/CompressionCodecDeflateQpl.h +++ b/src/Compression/CompressionCodecDeflateQpl.h @@ -98,7 +98,8 @@ public: protected: bool isCompression() const override { return true; } bool isGenericCompression() const override { return true; } - bool isExperimental() const override { return true; } + bool isExperimental() const override { return false; } + bool isDeflateQplCompression() const override { return true; } UInt32 doCompressData(const char * source, UInt32 source_size, char * dest) const override; void doDecompressData(const char * source, UInt32 source_size, char * dest, UInt32 uncompressed_size) const override; diff --git a/src/Compression/CompressionFactory.h b/src/Compression/CompressionFactory.h index a4451f9ed2e..1fdaf4f1c71 100644 --- a/src/Compression/CompressionFactory.h +++ b/src/Compression/CompressionFactory.h @@ -40,10 +40,10 @@ public: CompressionCodecPtr getDefaultCodec() const; /// Validate codecs AST specified by user and parses codecs description (substitute default parameters) - ASTPtr validateCodecAndGetPreprocessedAST(const ASTPtr & ast, const DataTypePtr & column_type, bool sanity_check, bool allow_experimental_codecs) const; + ASTPtr validateCodecAndGetPreprocessedAST(const ASTPtr & ast, const DataTypePtr & column_type, bool sanity_check, bool allow_experimental_codecs, bool enable_qpl_deflate) const; /// Validate codecs AST specified by user - void validateCodec(const String & family_name, std::optional level, bool sanity_check, bool allow_experimental_codecs) const; + void validateCodec(const String & family_name, std::optional level, bool sanity_check, bool allow_experimental_codecs, bool enable_qpl_deflate) const; /// Get codec by AST and possible column_type. Some codecs can use /// information about type to improve inner settings, but every codec should diff --git a/src/Compression/CompressionFactoryAdditions.cpp b/src/Compression/CompressionFactoryAdditions.cpp index 978a0fe5069..2630326238a 100644 --- a/src/Compression/CompressionFactoryAdditions.cpp +++ b/src/Compression/CompressionFactoryAdditions.cpp @@ -34,7 +34,7 @@ namespace ErrorCodes void CompressionCodecFactory::validateCodec( - const String & family_name, std::optional level, bool sanity_check, bool allow_experimental_codecs) const + const String & family_name, std::optional level, bool sanity_check, bool allow_experimental_codecs, bool enable_qpl_deflate) const { if (family_name.empty()) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Compression codec name cannot be empty"); @@ -43,13 +43,13 @@ void CompressionCodecFactory::validateCodec( { auto literal = std::make_shared(static_cast(*level)); validateCodecAndGetPreprocessedAST(makeASTFunction("CODEC", makeASTFunction(Poco::toUpper(family_name), literal)), - {}, sanity_check, allow_experimental_codecs); + {}, sanity_check, allow_experimental_codecs, enable_qpl_deflate); } else { auto identifier = std::make_shared(Poco::toUpper(family_name)); validateCodecAndGetPreprocessedAST(makeASTFunction("CODEC", identifier), - {}, sanity_check, allow_experimental_codecs); + {}, sanity_check, allow_experimental_codecs, enable_qpl_deflate); } } @@ -77,7 +77,7 @@ bool innerDataTypeIsFloat(const DataTypePtr & type) } ASTPtr CompressionCodecFactory::validateCodecAndGetPreprocessedAST( - const ASTPtr & ast, const DataTypePtr & column_type, bool sanity_check, bool allow_experimental_codecs) const + const ASTPtr & ast, const DataTypePtr & column_type, bool sanity_check, bool allow_experimental_codecs, bool enable_qpl_deflate) const { if (const auto * func = ast->as()) { @@ -159,6 +159,12 @@ ASTPtr CompressionCodecFactory::validateCodecAndGetPreprocessedAST( " You can enable it with the 'allow_experimental_codecs' setting.", codec_family_name); + if (!enable_qpl_deflate && result_codec->isDeflateQplCompression()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Codec {} is disabled by default." + " You can enable it with the 'enable_qpl_deflate' setting.", + codec_family_name); + codecs_descriptions->children.emplace_back(result_codec->getCodecDesc()); } diff --git a/src/Compression/ICompressionCodec.h b/src/Compression/ICompressionCodec.h index 44835ac19cb..d92ad3fc718 100644 --- a/src/Compression/ICompressionCodec.h +++ b/src/Compression/ICompressionCodec.h @@ -112,6 +112,9 @@ public: /// If it does nothing. virtual bool isNone() const { return false; } + /// This is a knob for Deflate QPL codec. + virtual bool isDeflateQplCompression() const { return false; } + protected: /// This is used for fuzz testing friend int LLVMFuzzerTestOneInput(const uint8_t * data, size_t size); diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 464b9168a4c..c6a2069e6ae 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -319,6 +319,7 @@ class IColumn; M(Bool, allow_distributed_ddl, true, "If it is set to true, then a user is allowed to executed distributed DDL queries.", 0) \ M(Bool, allow_suspicious_codecs, false, "If it is set to true, allow to specify meaningless compression codecs.", 0) \ M(Bool, allow_experimental_codecs, false, "If it is set to true, allow to specify experimental compression codecs (but we don't have those yet and this option does nothing).", 0) \ + M(Bool, enable_qpl_deflate, false, "If it is set to true, allow to use deflate_qpl for compression.", 0) \ M(UInt64, query_profiler_real_time_period_ns, QUERY_PROFILER_DEFAULT_SAMPLE_RATE_NS, "Period for real clock timer of query profiler (in nanoseconds). Set 0 value to turn off the real clock query profiler. Recommended value is at least 10000000 (100 times a second) for single queries or 1000000000 (once a second) for cluster-wide profiling.", 0) \ M(UInt64, query_profiler_cpu_time_period_ns, QUERY_PROFILER_DEFAULT_SAMPLE_RATE_NS, "Period for CPU clock timer of query profiler (in nanoseconds). Set 0 value to turn off the CPU clock query profiler. Recommended value is at least 10000000 (100 times a second) for single queries or 1000000000 (once a second) for cluster-wide profiling.", 0) \ M(Bool, metrics_perf_events_enabled, false, "If enabled, some of the perf events will be measured throughout queries' execution.", 0) \ diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index ab9e1fb04d6..5c22b46b360 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -571,6 +571,7 @@ ColumnsDescription InterpreterCreateQuery::getColumnsDescription( bool sanity_check_compression_codecs = !attach && !context_->getSettingsRef().allow_suspicious_codecs; bool allow_experimental_codecs = attach || context_->getSettingsRef().allow_experimental_codecs; + bool enable_qpl_deflate = attach || context_->getSettingsRef().enable_qpl_deflate; ColumnsDescription res; auto name_type_it = column_names_and_types.begin(); @@ -631,7 +632,7 @@ ColumnsDescription InterpreterCreateQuery::getColumnsDescription( if (col_decl.default_specifier == "ALIAS") throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot specify codec for column type ALIAS"); column.codec = CompressionCodecFactory::instance().validateCodecAndGetPreprocessedAST( - col_decl.codec, column.type, sanity_check_compression_codecs, allow_experimental_codecs); + col_decl.codec, column.type, sanity_check_compression_codecs, allow_experimental_codecs, enable_qpl_deflate); } if (col_decl.ttl) diff --git a/src/Server/TCPHandler.cpp b/src/Server/TCPHandler.cpp index 1ded7d97248..96c585e7d16 100644 --- a/src/Server/TCPHandler.cpp +++ b/src/Server/TCPHandler.cpp @@ -1775,7 +1775,7 @@ void TCPHandler::initBlockOutput(const Block & block) if (state.compression == Protocol::Compression::Enable) { - CompressionCodecFactory::instance().validateCodec(method, level, !query_settings.allow_suspicious_codecs, query_settings.allow_experimental_codecs); + CompressionCodecFactory::instance().validateCodec(method, level, !query_settings.allow_suspicious_codecs, query_settings.allow_experimental_codecs, query_settings.enable_qpl_deflate); state.maybe_compressed_out = std::make_shared( *out, CompressionCodecFactory::instance().get(method, level)); diff --git a/src/Storages/AlterCommands.cpp b/src/Storages/AlterCommands.cpp index 5fd823b9e01..ecbddfc3e2a 100644 --- a/src/Storages/AlterCommands.cpp +++ b/src/Storages/AlterCommands.cpp @@ -388,7 +388,7 @@ void AlterCommand::apply(StorageInMemoryMetadata & metadata, ContextPtr context) column.comment = *comment; if (codec) - column.codec = CompressionCodecFactory::instance().validateCodecAndGetPreprocessedAST(codec, data_type, false, true); + column.codec = CompressionCodecFactory::instance().validateCodecAndGetPreprocessedAST(codec, data_type, false, true, true); column.ttl = ttl; @@ -429,7 +429,7 @@ void AlterCommand::apply(StorageInMemoryMetadata & metadata, ContextPtr context) else { if (codec) - column.codec = CompressionCodecFactory::instance().validateCodecAndGetPreprocessedAST(codec, data_type ? data_type : column.type, false, true); + column.codec = CompressionCodecFactory::instance().validateCodecAndGetPreprocessedAST(codec, data_type ? data_type : column.type, false, true, true); if (comment) column.comment = *comment; @@ -1067,7 +1067,7 @@ void AlterCommands::validate(const StoragePtr & table, ContextPtr context) const "this column name is reserved for lightweight delete feature", backQuote(column_name)); if (command.codec) - CompressionCodecFactory::instance().validateCodecAndGetPreprocessedAST(command.codec, command.data_type, !context->getSettingsRef().allow_suspicious_codecs, context->getSettingsRef().allow_experimental_codecs); + CompressionCodecFactory::instance().validateCodecAndGetPreprocessedAST(command.codec, command.data_type, !context->getSettingsRef().allow_suspicious_codecs, context->getSettingsRef().allow_experimental_codecs, context->getSettingsRef().enable_qpl_deflate); all_columns.add(ColumnDescription(column_name, command.data_type)); } @@ -1093,7 +1093,7 @@ void AlterCommands::validate(const StoragePtr & table, ContextPtr context) const { if (all_columns.hasAlias(column_name)) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot specify codec for column type ALIAS"); - CompressionCodecFactory::instance().validateCodecAndGetPreprocessedAST(command.codec, command.data_type, !context->getSettingsRef().allow_suspicious_codecs, context->getSettingsRef().allow_experimental_codecs); + CompressionCodecFactory::instance().validateCodecAndGetPreprocessedAST(command.codec, command.data_type, !context->getSettingsRef().allow_suspicious_codecs, context->getSettingsRef().allow_experimental_codecs, context->getSettingsRef().enable_qpl_deflate); } auto column_default = all_columns.getDefault(column_name); if (column_default) diff --git a/src/Storages/ColumnsDescription.cpp b/src/Storages/ColumnsDescription.cpp index 8eabae7929c..045afd7e6e6 100644 --- a/src/Storages/ColumnsDescription.cpp +++ b/src/Storages/ColumnsDescription.cpp @@ -130,7 +130,7 @@ void ColumnDescription::readText(ReadBuffer & buf) comment = col_ast->comment->as().value.get(); if (col_ast->codec) - codec = CompressionCodecFactory::instance().validateCodecAndGetPreprocessedAST(col_ast->codec, type, false, true); + codec = CompressionCodecFactory::instance().validateCodecAndGetPreprocessedAST(col_ast->codec, type, false, true, true); if (col_ast->ttl) ttl = col_ast->ttl; diff --git a/src/Storages/Distributed/DistributedSink.cpp b/src/Storages/Distributed/DistributedSink.cpp index 720a951299a..ce1dbde8eae 100644 --- a/src/Storages/Distributed/DistributedSink.cpp +++ b/src/Storages/Distributed/DistributedSink.cpp @@ -733,7 +733,7 @@ void DistributedSink::writeToShard(const Cluster::ShardInfo & shard_info, const if (compression_method == "ZSTD") compression_level = settings.network_zstd_compression_level; - CompressionCodecFactory::instance().validateCodec(compression_method, compression_level, !settings.allow_suspicious_codecs, settings.allow_experimental_codecs); + CompressionCodecFactory::instance().validateCodec(compression_method, compression_level, !settings.allow_suspicious_codecs, settings.allow_experimental_codecs, settings.enable_qpl_deflate); CompressionCodecPtr compression_codec = CompressionCodecFactory::instance().get(compression_method, compression_level); /// tmp directory is used to ensure atomicity of transactions diff --git a/src/Storages/TTLDescription.cpp b/src/Storages/TTLDescription.cpp index e1a80800630..f5209cbdff6 100644 --- a/src/Storages/TTLDescription.cpp +++ b/src/Storages/TTLDescription.cpp @@ -285,7 +285,7 @@ TTLDescription TTLDescription::getTTLFromAST( { result.recompression_codec = CompressionCodecFactory::instance().validateCodecAndGetPreprocessedAST( - ttl_element->recompression_codec, {}, !context->getSettingsRef().allow_suspicious_codecs, context->getSettingsRef().allow_experimental_codecs); + ttl_element->recompression_codec, {}, !context->getSettingsRef().allow_suspicious_codecs, context->getSettingsRef().allow_experimental_codecs, context->getSettingsRef().enable_qpl_deflate); } } diff --git a/tests/integration/test_non_default_compression/configs/deflateqpl_compression_by_default.xml b/tests/integration/test_non_default_compression/configs/deflateqpl_compression_by_default.xml new file mode 100644 index 00000000000..2ad6a0f1eff --- /dev/null +++ b/tests/integration/test_non_default_compression/configs/deflateqpl_compression_by_default.xml @@ -0,0 +1,11 @@ + + + + + 0 + 0 + + deflate_qpl + + + diff --git a/tests/integration/test_non_default_compression/configs/enable_deflateqpl_codec.xml b/tests/integration/test_non_default_compression/configs/enable_deflateqpl_codec.xml new file mode 100644 index 00000000000..46e9e43ca27 --- /dev/null +++ b/tests/integration/test_non_default_compression/configs/enable_deflateqpl_codec.xml @@ -0,0 +1,7 @@ + + + + 1 + + + diff --git a/tests/integration/test_non_default_compression/test.py b/tests/integration/test_non_default_compression/test.py index e0a67a5db95..e69b32daae0 100644 --- a/tests/integration/test_non_default_compression/test.py +++ b/tests/integration/test_non_default_compression/test.py @@ -41,7 +41,14 @@ node6 = cluster.add_instance( main_configs=["configs/allow_experimental_codecs.xml"], user_configs=["configs/allow_suspicious_codecs.xml"], ) - +node7 = cluster.add_instance( + "node7", + main_configs=["configs/deflateqpl_compression_by_default.xml"], + user_configs=[ + "configs/enable_deflateqpl_codec.xml", + "configs/allow_suspicious_codecs.xml", + ], +) @pytest.fixture(scope="module") def start_cluster(): @@ -244,3 +251,59 @@ def test_uncompressed_cache_plus_zstd_codec(start_cluster): ) == "10000\n" ) + +def test_preconfigured_deflateqpl_codec(start_cluster): + node7.query( + """ + CREATE TABLE compression_codec_multiple_with_key ( + somedate Date CODEC(ZSTD, ZSTD, ZSTD(12), LZ4HC(12), DEFLATE_QPL), + id UInt64 CODEC(LZ4, ZSTD, NONE, LZ4HC, DEFLATE_QPL), + data String CODEC(ZSTD(2), LZ4HC, NONE, LZ4, LZ4, DEFLATE_QPL), + somecolumn Float64 + ) ENGINE = MergeTree() PARTITION BY somedate ORDER BY id SETTINGS index_granularity = 2; + """ + ) + node7.query( + "INSERT INTO compression_codec_multiple_with_key VALUES(toDate('2018-10-12'), 100000, 'hello', 88.88), (toDate('2018-10-12'), 100002, 'world', 99.99), (toDate('2018-10-12'), 1111, '!', 777.777)" + ) + assert ( + node7.query( + "SELECT COUNT(*) FROM compression_codec_multiple_with_key WHERE id % 2 == 0" + ) + == "2\n" + ) + assert ( + node7.query( + "SELECT DISTINCT somecolumn FROM compression_codec_multiple_with_key ORDER BY id" + ) + == "777.777\n88.88\n99.99\n" + ) + assert ( + node7.query( + "SELECT data FROM compression_codec_multiple_with_key WHERE id >= 1112 AND somedate = toDate('2018-10-12') AND somecolumn <= 100" + ) + == "hello\nworld\n" + ) + + node7.query( + "INSERT INTO compression_codec_multiple_with_key SELECT toDate('2018-10-12'), number, toString(number), 1.0 FROM system.numbers LIMIT 10000" + ) + + assert ( + node7.query( + "SELECT COUNT(id) FROM compression_codec_multiple_with_key WHERE id % 10 == 0" + ) + == "1001\n" + ) + assert ( + node7.query( + "SELECT SUM(somecolumn) FROM compression_codec_multiple_with_key" + ) + == str(777.777 + 88.88 + 99.99 + 1.0 * 10000) + "\n" + ) + assert ( + node7.query( + "SELECT count(*) FROM compression_codec_multiple_with_key GROUP BY somedate" + ) + == "10003\n" + ) diff --git a/tests/queries/0_stateless/00804_test_alter_compression_codecs.reference b/tests/queries/0_stateless/00804_test_alter_compression_codecs.reference index cfbfadf1e67..a6afe11126c 100644 --- a/tests/queries/0_stateless/00804_test_alter_compression_codecs.reference +++ b/tests/queries/0_stateless/00804_test_alter_compression_codecs.reference @@ -12,13 +12,7 @@ CODEC(NONE) 2018-01-01 4 4 2018-01-01 5 5 2018-01-01 6 6 -2018-01-01 1 default_value -2018-01-01 2 default_value -2018-01-01 3 3 -2018-01-01 4 4 -2018-01-01 5 5 -2018-01-01 6 6 -CODEC(ZSTD(1), LZ4HC(0), LZ4, LZ4, NONE) +CODEC(DEFLATE_QPL) 2018-01-01 1 default_value 2018-01-01 2 default_value 2018-01-01 3 3 @@ -27,7 +21,26 @@ CODEC(ZSTD(1), LZ4HC(0), LZ4, LZ4, NONE) 2018-01-01 6 6 2018-01-01 7 7 2018-01-01 8 8 -CODEC(ZSTD(1), LZ4HC(0), LZ4, LZ4, NONE) -CODEC(NONE, LZ4, LZ4HC(0), ZSTD(1)) +2018-01-01 1 default_value +2018-01-01 2 default_value +2018-01-01 3 3 +2018-01-01 4 4 +2018-01-01 5 5 +2018-01-01 6 6 +2018-01-01 7 7 +2018-01-01 8 8 +CODEC(ZSTD(1), LZ4HC(0), LZ4, LZ4, DEFLATE_QPL, NONE) +2018-01-01 1 default_value +2018-01-01 2 default_value +2018-01-01 3 3 +2018-01-01 4 4 +2018-01-01 5 5 +2018-01-01 6 6 +2018-01-01 7 7 +2018-01-01 8 8 +2018-01-01 9 9 +2018-01-01 10 10 +CODEC(ZSTD(1), LZ4HC(0), LZ4, LZ4, DEFLATE_QPL, NONE) +CODEC(NONE, LZ4, LZ4HC(0), ZSTD(1), DEFLATE_QPL) 2 1 diff --git a/tests/queries/0_stateless/00804_test_alter_compression_codecs.sql b/tests/queries/0_stateless/00804_test_alter_compression_codecs.sql index 85e5f8b63ad..40a8bb4c7cb 100644 --- a/tests/queries/0_stateless/00804_test_alter_compression_codecs.sql +++ b/tests/queries/0_stateless/00804_test_alter_compression_codecs.sql @@ -25,15 +25,23 @@ INSERT INTO alter_compression_codec VALUES('2018-01-01', 5, '5'); INSERT INTO alter_compression_codec VALUES('2018-01-01', 6, '6'); SELECT * FROM alter_compression_codec ORDER BY id; -OPTIMIZE TABLE alter_compression_codec FINAL; -SELECT * FROM alter_compression_codec ORDER BY id; - -SET allow_suspicious_codecs = 1; -ALTER TABLE alter_compression_codec MODIFY COLUMN alter_column CODEC(ZSTD, LZ4HC, LZ4, LZ4, NONE); +SET enable_qpl_deflate = 1; +ALTER TABLE alter_compression_codec MODIFY COLUMN alter_column CODEC(DEFLATE_QPL); SELECT compression_codec FROM system.columns WHERE database = currentDatabase() AND table = 'alter_compression_codec' AND name = 'alter_column'; INSERT INTO alter_compression_codec VALUES('2018-01-01', 7, '7'); INSERT INTO alter_compression_codec VALUES('2018-01-01', 8, '8'); +SELECT * FROM alter_compression_codec ORDER BY id; + +OPTIMIZE TABLE alter_compression_codec FINAL; +SELECT * FROM alter_compression_codec ORDER BY id; + +SET allow_suspicious_codecs = 1; +ALTER TABLE alter_compression_codec MODIFY COLUMN alter_column CODEC(ZSTD, LZ4HC, LZ4, LZ4, DEFLATE_QPL, NONE); +SELECT compression_codec FROM system.columns WHERE database = currentDatabase() AND table = 'alter_compression_codec' AND name = 'alter_column'; + +INSERT INTO alter_compression_codec VALUES('2018-01-01', 9, '9'); +INSERT INTO alter_compression_codec VALUES('2018-01-01', 10, '10'); OPTIMIZE TABLE alter_compression_codec FINAL; SELECT * FROM alter_compression_codec ORDER BY id; @@ -54,15 +62,17 @@ ALTER TABLE alter_bad_codec ADD COLUMN alter_column DateTime DEFAULT '2019-01-01 ALTER TABLE alter_bad_codec ADD COLUMN alter_column DateTime DEFAULT '2019-01-01 00:00:00' CODEC(ZSTD(100)); -- { serverError 433 } +ALTER TABLE alter_bad_codec ADD COLUMN alter_column DateTime DEFAULT '2019-01-01 00:00:00' CODEC(DEFLATE_QPL(100)); -- { serverError 378 } + DROP TABLE IF EXISTS alter_bad_codec; DROP TABLE IF EXISTS large_alter_table_00804; DROP TABLE IF EXISTS store_of_hash_00804; CREATE TABLE large_alter_table_00804 ( - somedate Date CODEC(ZSTD, ZSTD, ZSTD(12), LZ4HC(12)), - id UInt64 CODEC(LZ4, ZSTD, NONE, LZ4HC), - data String CODEC(ZSTD(2), LZ4HC, NONE, LZ4, LZ4) + somedate Date CODEC(ZSTD, ZSTD, ZSTD(12), LZ4HC(12), DEFLATE_QPL), + id UInt64 CODEC(LZ4, ZSTD, NONE, LZ4HC, DEFLATE_QPL), + data String CODEC(ZSTD(2), LZ4HC, NONE, LZ4, LZ4, DEFLATE_QPL) ) ENGINE = MergeTree() PARTITION BY somedate ORDER BY id SETTINGS index_granularity = 2, index_granularity_bytes = '10Mi', min_bytes_for_wide_part = 0; INSERT INTO large_alter_table_00804 SELECT toDate('2019-01-01'), number, toString(number + rand()) FROM system.numbers LIMIT 300000; @@ -71,7 +81,7 @@ CREATE TABLE store_of_hash_00804 (hash UInt64) ENGINE = Memory(); INSERT INTO store_of_hash_00804 SELECT sum(cityHash64(*)) FROM large_alter_table_00804; -ALTER TABLE large_alter_table_00804 MODIFY COLUMN data CODEC(NONE, LZ4, LZ4HC, ZSTD); +ALTER TABLE large_alter_table_00804 MODIFY COLUMN data CODEC(NONE, LZ4, LZ4HC, ZSTD, DEFLATE_QPL); OPTIMIZE TABLE large_alter_table_00804; From d85bc02388317ed4b2743814bcc217baf1652971 Mon Sep 17 00:00:00 2001 From: jinjunzh Date: Wed, 24 May 2023 15:08:23 -0400 Subject: [PATCH 0708/1072] add function test for deflate_qpl --- ...4_test_custom_compression_codecs.reference | 8 ++-- .../00804_test_custom_compression_codecs.sql | 45 +++++++++++-------- ...m_compression_codes_log_storages.reference | 20 ++++----- ..._custom_compression_codes_log_storages.sql | 41 +++++++++-------- ...st_deflate_qpl_codec_compression.reference | 4 ++ ...804_test_deflate_qpl_codec_compression.sql | 32 +++++++++++++ ...804_test_delta_codec_compression.reference | 2 + .../00804_test_delta_codec_compression.sql | 38 ++++++++++++++++ 8 files changed, 140 insertions(+), 50 deletions(-) create mode 100644 tests/queries/0_stateless/00804_test_deflate_qpl_codec_compression.reference create mode 100644 tests/queries/0_stateless/00804_test_deflate_qpl_codec_compression.sql diff --git a/tests/queries/0_stateless/00804_test_custom_compression_codecs.reference b/tests/queries/0_stateless/00804_test_custom_compression_codecs.reference index 7bd91e5a69b..a9cbe3d32d3 100644 --- a/tests/queries/0_stateless/00804_test_custom_compression_codecs.reference +++ b/tests/queries/0_stateless/00804_test_custom_compression_codecs.reference @@ -1,6 +1,6 @@ -1 hello 2018-12-14 1.1 aaa 5 -2 world 2018-12-15 2.2 bbb 6 -3 ! 2018-12-16 3.3 ccc 7 +1 hello 2018-12-14 1.1 aaa 5 qpl11 11 +2 world 2018-12-15 2.2 bbb 6 qpl22 22 +3 ! 2018-12-16 3.3 ccc 7 qpl33 33 2 1 world 2018-10-05 1.1 2 hello 2018-10-01 2.2 @@ -9,7 +9,7 @@ 10003 274972506.6 9175437371954010821 -CREATE TABLE default.compression_codec_multiple_more_types\n(\n `id` Decimal(38, 13) CODEC(ZSTD(1), LZ4, ZSTD(1), ZSTD(1), Delta(2), Delta(4), Delta(1), LZ4HC(0)),\n `data` FixedString(12) CODEC(ZSTD(1), ZSTD(1), NONE, NONE, NONE, LZ4HC(0)),\n `ddd.age` Array(UInt8) CODEC(LZ4, LZ4HC(0), NONE, NONE, NONE, ZSTD(1), Delta(8)),\n `ddd.Name` Array(String) CODEC(LZ4, LZ4HC(0), NONE, NONE, NONE, ZSTD(1), Delta(8))\n)\nENGINE = MergeTree\nORDER BY tuple()\nSETTINGS index_granularity = 8192 +CREATE TABLE default.compression_codec_multiple_more_types\n(\n `id` Decimal(38, 13) CODEC(ZSTD(1), LZ4, ZSTD(1), ZSTD(1), Delta(2), Delta(4), Delta(1), LZ4HC(0), DEFLATE_QPL),\n `data` FixedString(12) CODEC(ZSTD(1), ZSTD(1), NONE, NONE, NONE, LZ4HC(0), DEFLATE_QPL),\n `ddd.age` Array(UInt8) CODEC(LZ4, LZ4HC(0), NONE, NONE, NONE, ZSTD(1), Delta(8), DEFLATE_QPL),\n `ddd.Name` Array(String) CODEC(LZ4, LZ4HC(0), NONE, NONE, NONE, ZSTD(1), Delta(8), DEFLATE_QPL)\n)\nENGINE = MergeTree\nORDER BY tuple()\nSETTINGS index_granularity = 8192 1.5555555555555 hello world! [77] ['John'] 7.1 xxxxxxxxxxxx [127] ['Henry'] ! diff --git a/tests/queries/0_stateless/00804_test_custom_compression_codecs.sql b/tests/queries/0_stateless/00804_test_custom_compression_codecs.sql index c080c2fc98e..44a0daada27 100644 --- a/tests/queries/0_stateless/00804_test_custom_compression_codecs.sql +++ b/tests/queries/0_stateless/00804_test_custom_compression_codecs.sql @@ -1,5 +1,6 @@ SET send_logs_level = 'fatal'; SET allow_suspicious_codecs = 1; +SET enable_qpl_deflate = 1; DROP TABLE IF EXISTS compression_codec; @@ -9,18 +10,20 @@ CREATE TABLE compression_codec( ddd Date CODEC(NONE), somenum Float64 CODEC(ZSTD(2)), somestr FixedString(3) CODEC(LZ4HC(7)), - othernum Int64 CODEC(Delta) + othernum Int64 CODEC(Delta), + qplstr String CODEC(DEFLATE_QPL), + qplnum UInt32 CODEC(DEFLATE_QPL), ) ENGINE = MergeTree() ORDER BY tuple(); -INSERT INTO compression_codec VALUES(1, 'hello', toDate('2018-12-14'), 1.1, 'aaa', 5); -INSERT INTO compression_codec VALUES(2, 'world', toDate('2018-12-15'), 2.2, 'bbb', 6); -INSERT INTO compression_codec VALUES(3, '!', toDate('2018-12-16'), 3.3, 'ccc', 7); +INSERT INTO compression_codec VALUES(1, 'hello', toDate('2018-12-14'), 1.1, 'aaa', 5, 'qpl11', 11); +INSERT INTO compression_codec VALUES(2, 'world', toDate('2018-12-15'), 2.2, 'bbb', 6,'qpl22', 22); +INSERT INTO compression_codec VALUES(3, '!', toDate('2018-12-16'), 3.3, 'ccc', 7, 'qpl33', 33); SELECT * FROM compression_codec ORDER BY id; OPTIMIZE TABLE compression_codec FINAL; -INSERT INTO compression_codec VALUES(2, '', toDate('2018-12-13'), 4.4, 'ddd', 8); +INSERT INTO compression_codec VALUES(2, '', toDate('2018-12-13'), 4.4, 'ddd', 8, 'qpl44', 44); DETACH TABLE compression_codec; ATTACH TABLE compression_codec; @@ -31,25 +34,31 @@ DROP TABLE IF EXISTS compression_codec; DROP TABLE IF EXISTS bad_codec; DROP TABLE IF EXISTS params_when_no_params; +DROP TABLE IF EXISTS params_when_no_params2; DROP TABLE IF EXISTS too_many_params; DROP TABLE IF EXISTS codec_multiple_direct_specification_1; DROP TABLE IF EXISTS codec_multiple_direct_specification_2; +DROP TABLE IF EXISTS codec_multiple_direct_specification_3; DROP TABLE IF EXISTS delta_bad_params1; DROP TABLE IF EXISTS delta_bad_params2; CREATE TABLE bad_codec(id UInt64 CODEC(adssadads)) ENGINE = MergeTree() order by tuple(); -- { serverError 432 } CREATE TABLE too_many_params(id UInt64 CODEC(ZSTD(2,3,4,5))) ENGINE = MergeTree() order by tuple(); -- { serverError 431 } CREATE TABLE params_when_no_params(id UInt64 CODEC(LZ4(1))) ENGINE = MergeTree() ORDER BY tuple(); -- { serverError 378 } +CREATE TABLE params_when_no_params2(id UInt64 CODEC(DEFLATE_QPL(1))) ENGINE = MergeTree() ORDER BY tuple(); -- { serverError 378 } CREATE TABLE codec_multiple_direct_specification_1(id UInt64 CODEC(MULTIPLE(LZ4, ZSTD))) ENGINE = MergeTree() ORDER BY tuple(); -- { serverError 432 } CREATE TABLE codec_multiple_direct_specification_2(id UInt64 CODEC(multiple(LZ4, ZSTD))) ENGINE = MergeTree() ORDER BY tuple(); -- { serverError 432 } +CREATE TABLE codec_multiple_direct_specification_3(id UInt64 CODEC(multiple(LZ4, DEFLATE_QPL))) ENGINE = MergeTree() ORDER BY tuple(); -- { serverError 432 } CREATE TABLE delta_bad_params1(id UInt64 CODEC(Delta(3))) ENGINE = MergeTree() ORDER BY tuple(); -- { serverError 433 } CREATE TABLE delta_bad_params2(id UInt64 CODEC(Delta(16))) ENGINE = MergeTree() ORDER BY tuple(); -- { serverError 433 } DROP TABLE IF EXISTS bad_codec; DROP TABLE IF EXISTS params_when_no_params; +DROP TABLE IF EXISTS params_when_no_params2; DROP TABLE IF EXISTS too_many_params; DROP TABLE IF EXISTS codec_multiple_direct_specification_1; DROP TABLE IF EXISTS codec_multiple_direct_specification_2; +DROP TABLE IF EXISTS codec_multiple_direct_specification_3; DROP TABLE IF EXISTS delta_bad_params1; DROP TABLE IF EXISTS delta_bad_params2; @@ -58,10 +67,10 @@ DROP TABLE IF EXISTS compression_codec_multiple; SET network_compression_method = 'lz4hc'; CREATE TABLE compression_codec_multiple ( - id UInt64 CODEC(LZ4, ZSTD, NONE, LZ4HC, Delta(4)), - data String CODEC(ZSTD(2), NONE, Delta(2), LZ4HC, LZ4, LZ4, Delta(8)), - ddd Date CODEC(NONE, NONE, NONE, Delta(1), LZ4, ZSTD, LZ4HC, LZ4HC), - somenum Float64 CODEC(Delta(4), LZ4, LZ4, ZSTD(2), LZ4HC(5), ZSTD(3), ZSTD) + id UInt64 CODEC(LZ4, ZSTD, NONE, LZ4HC, Delta(4), DEFLATE_QPL), + data String CODEC(ZSTD(2), NONE, Delta(2), LZ4HC, LZ4, LZ4, Delta(8), DEFLATE_QPL), + ddd Date CODEC(NONE, NONE, NONE, Delta(1), LZ4, ZSTD, LZ4HC, LZ4HC, DEFLATE_QPL), + somenum Float64 CODEC(Delta(4), LZ4, LZ4, ZSTD(2), LZ4HC(5), ZSTD(3), ZSTD, DEFLATE_QPL) ) ENGINE = MergeTree() ORDER BY tuple(); INSERT INTO compression_codec_multiple VALUES (1, 'world', toDate('2018-10-05'), 1.1), (2, 'hello', toDate('2018-10-01'), 2.2), (3, 'buy', toDate('2018-10-11'), 3.3); @@ -85,15 +94,15 @@ SELECT sum(cityHash64(*)) FROM compression_codec_multiple; DROP TABLE IF EXISTS compression_codec_multiple_more_types; CREATE TABLE compression_codec_multiple_more_types ( - id Decimal128(13) CODEC(ZSTD, LZ4, ZSTD, ZSTD, Delta(2), Delta(4), Delta(1), LZ4HC), - data FixedString(12) CODEC(ZSTD, ZSTD, Delta, Delta, Delta, NONE, NONE, NONE, LZ4HC), - ddd Nested (age UInt8, Name String) CODEC(LZ4, LZ4HC, NONE, NONE, NONE, ZSTD, Delta(8)) + id Decimal128(13) CODEC(ZSTD, LZ4, ZSTD, ZSTD, Delta(2), Delta(4), Delta(1), LZ4HC, DEFLATE_QPL), + data FixedString(12) CODEC(ZSTD, ZSTD, Delta, Delta, Delta, NONE, NONE, NONE, LZ4HC, DEFLATE_QPL), + ddd Nested (age UInt8, Name String) CODEC(LZ4, LZ4HC, NONE, NONE, NONE, ZSTD, Delta(8), DEFLATE_QPL) ) ENGINE = MergeTree() ORDER BY tuple(); -- { serverError 36 } CREATE TABLE compression_codec_multiple_more_types ( - id Decimal128(13) CODEC(ZSTD, LZ4, ZSTD, ZSTD, Delta(2), Delta(4), Delta(1), LZ4HC), - data FixedString(12) CODEC(ZSTD, ZSTD, NONE, NONE, NONE, LZ4HC), - ddd Nested (age UInt8, Name String) CODEC(LZ4, LZ4HC, NONE, NONE, NONE, ZSTD, Delta(8)) + id Decimal128(13) CODEC(ZSTD, LZ4, ZSTD, ZSTD, Delta(2), Delta(4), Delta(1), LZ4HC, DEFLATE_QPL), + data FixedString(12) CODEC(ZSTD, ZSTD, NONE, NONE, NONE, LZ4HC, DEFLATE_QPL), + ddd Nested (age UInt8, Name String) CODEC(LZ4, LZ4HC, NONE, NONE, NONE, ZSTD, Delta(8), DEFLATE_QPL) ) ENGINE = MergeTree() ORDER BY tuple(); SHOW CREATE TABLE compression_codec_multiple_more_types; @@ -109,9 +118,9 @@ SET network_compression_method = 'zstd'; SET network_zstd_compression_level = 5; CREATE TABLE compression_codec_multiple_with_key ( - somedate Date CODEC(ZSTD, ZSTD, ZSTD(12), LZ4HC(12), Delta, Delta), - id UInt64 CODEC(LZ4, ZSTD, Delta, NONE, LZ4HC, Delta), - data String CODEC(ZSTD(2), Delta(1), LZ4HC, NONE, LZ4, LZ4) + somedate Date CODEC(ZSTD, ZSTD, ZSTD(12), LZ4HC(12), Delta, Delta, DEFLATE_QPL), + id UInt64 CODEC(LZ4, ZSTD, Delta, NONE, LZ4HC, Delta, DEFLATE_QPL), + data String CODEC(ZSTD(2), Delta(1), LZ4HC, NONE, LZ4, LZ4, DEFLATE_QPL) ) ENGINE = MergeTree() PARTITION BY somedate ORDER BY id SETTINGS index_granularity = 2, index_granularity_bytes = '10Mi'; diff --git a/tests/queries/0_stateless/00804_test_custom_compression_codes_log_storages.reference b/tests/queries/0_stateless/00804_test_custom_compression_codes_log_storages.reference index 8145ca99829..d64b8a77eed 100644 --- a/tests/queries/0_stateless/00804_test_custom_compression_codes_log_storages.reference +++ b/tests/queries/0_stateless/00804_test_custom_compression_codes_log_storages.reference @@ -1,9 +1,9 @@ -CREATE TABLE default.compression_codec_log\n(\n `id` UInt64 CODEC(LZ4),\n `data` String CODEC(ZSTD(1)),\n `ddd` Date CODEC(NONE),\n `somenum` Float64 CODEC(ZSTD(2)),\n `somestr` FixedString(3) CODEC(LZ4HC(7)),\n `othernum` Int64 CODEC(Delta(8))\n)\nENGINE = Log -1 hello 2018-12-14 1.1 aaa 5 -2 world 2018-12-15 2.2 bbb 6 -3 ! 2018-12-16 3.3 ccc 7 +CREATE TABLE default.compression_codec_log\n(\n `id` UInt64 CODEC(LZ4),\n `data` String CODEC(ZSTD(1)),\n `ddd` Date CODEC(NONE),\n `somenum` Float64 CODEC(ZSTD(2)),\n `somestr` FixedString(3) CODEC(LZ4HC(7)),\n `othernum` Int64 CODEC(Delta(8)),\n `qplstr` String CODEC(DEFLATE_QPL),\n `qplnum` UInt32 CODEC(DEFLATE_QPL)\n)\nENGINE = Log +1 hello 2018-12-14 1.1 aaa 5 qpl11 11 +2 world 2018-12-15 2.2 bbb 6 qpl22 22 +3 ! 2018-12-16 3.3 ccc 7 qpl33 33 2 -CREATE TABLE default.compression_codec_multiple_log\n(\n `id` UInt64 CODEC(LZ4, ZSTD(1), NONE, LZ4HC(0), Delta(4)),\n `data` String CODEC(ZSTD(2), NONE, Delta(2), LZ4HC(0), LZ4, LZ4, Delta(8)),\n `ddd` Date CODEC(NONE, NONE, NONE, Delta(1), LZ4, ZSTD(1), LZ4HC(0), LZ4HC(0)),\n `somenum` Float64 CODEC(Delta(4), LZ4, LZ4, ZSTD(2), LZ4HC(5), ZSTD(3), ZSTD(1))\n)\nENGINE = Log +CREATE TABLE default.compression_codec_multiple_log\n(\n `id` UInt64 CODEC(LZ4, ZSTD(1), NONE, LZ4HC(0), Delta(4), DEFLATE_QPL),\n `data` String CODEC(ZSTD(2), NONE, Delta(2), LZ4HC(0), LZ4, LZ4, Delta(8), DEFLATE_QPL),\n `ddd` Date CODEC(NONE, NONE, NONE, Delta(1), LZ4, ZSTD(1), LZ4HC(0), LZ4HC(0), DEFLATE_QPL),\n `somenum` Float64 CODEC(Delta(4), LZ4, LZ4, ZSTD(2), LZ4HC(5), ZSTD(3), ZSTD(1), DEFLATE_QPL)\n)\nENGINE = Log 1 world 2018-10-05 1.1 2 hello 2018-10-01 2.2 3 buy 2018-10-11 3.3 @@ -11,12 +11,12 @@ CREATE TABLE default.compression_codec_multiple_log\n(\n `id` UInt64 CODEC(LZ 10003 274972506.6 9175437371954010821 -CREATE TABLE default.compression_codec_tiny_log\n(\n `id` UInt64 CODEC(LZ4),\n `data` String CODEC(ZSTD(1)),\n `ddd` Date CODEC(NONE),\n `somenum` Float64 CODEC(ZSTD(2)),\n `somestr` FixedString(3) CODEC(LZ4HC(7)),\n `othernum` Int64 CODEC(Delta(8))\n)\nENGINE = TinyLog -1 hello 2018-12-14 1.1 aaa 5 -2 world 2018-12-15 2.2 bbb 6 -3 ! 2018-12-16 3.3 ccc 7 +CREATE TABLE default.compression_codec_tiny_log\n(\n `id` UInt64 CODEC(LZ4),\n `data` String CODEC(ZSTD(1)),\n `ddd` Date CODEC(NONE),\n `somenum` Float64 CODEC(ZSTD(2)),\n `somestr` FixedString(3) CODEC(LZ4HC(7)),\n `othernum` Int64 CODEC(Delta(8)),\n `qplstr` String CODEC(DEFLATE_QPL),\n `qplnum` UInt32 CODEC(DEFLATE_QPL)\n)\nENGINE = TinyLog +1 hello 2018-12-14 1.1 aaa 5 qpl11 11 +2 world 2018-12-15 2.2 bbb 6 qpl22 22 +3 ! 2018-12-16 3.3 ccc 7 qpl33 33 2 -CREATE TABLE default.compression_codec_multiple_tiny_log\n(\n `id` UInt64 CODEC(LZ4, ZSTD(1), NONE, LZ4HC(0), Delta(4)),\n `data` String CODEC(ZSTD(2), NONE, Delta(2), LZ4HC(0), LZ4, LZ4, Delta(8)),\n `ddd` Date CODEC(NONE, NONE, NONE, Delta(1), LZ4, ZSTD(1), LZ4HC(0), LZ4HC(0)),\n `somenum` Float64 CODEC(Delta(4), LZ4, LZ4, ZSTD(2), LZ4HC(5), ZSTD(3), ZSTD(1))\n)\nENGINE = TinyLog +CREATE TABLE default.compression_codec_multiple_tiny_log\n(\n `id` UInt64 CODEC(LZ4, ZSTD(1), NONE, LZ4HC(0), Delta(4), DEFLATE_QPL),\n `data` String CODEC(ZSTD(2), NONE, Delta(2), LZ4HC(0), LZ4, LZ4, Delta(8), DEFLATE_QPL),\n `ddd` Date CODEC(NONE, NONE, NONE, Delta(1), LZ4, ZSTD(1), LZ4HC(0), LZ4HC(0), DEFLATE_QPL),\n `somenum` Float64 CODEC(Delta(4), LZ4, LZ4, ZSTD(2), LZ4HC(5), ZSTD(3), ZSTD(1), DEFLATE_QPL)\n)\nENGINE = TinyLog 1 world 2018-10-05 1.1 2 hello 2018-10-01 2.2 3 buy 2018-10-11 3.3 diff --git a/tests/queries/0_stateless/00804_test_custom_compression_codes_log_storages.sql b/tests/queries/0_stateless/00804_test_custom_compression_codes_log_storages.sql index fba6a216762..113f26732e7 100644 --- a/tests/queries/0_stateless/00804_test_custom_compression_codes_log_storages.sql +++ b/tests/queries/0_stateless/00804_test_custom_compression_codes_log_storages.sql @@ -1,5 +1,6 @@ SET send_logs_level = 'fatal'; SET allow_suspicious_codecs = 1; +SET enable_qpl_deflate = 1; -- copy-paste for storage log @@ -11,18 +12,20 @@ CREATE TABLE compression_codec_log( ddd Date CODEC(NONE), somenum Float64 CODEC(ZSTD(2)), somestr FixedString(3) CODEC(LZ4HC(7)), - othernum Int64 CODEC(Delta) + othernum Int64 CODEC(Delta), + qplstr String CODEC(DEFLATE_QPL), + qplnum UInt32 CODEC(DEFLATE_QPL), ) ENGINE = Log(); SHOW CREATE TABLE compression_codec_log; -INSERT INTO compression_codec_log VALUES(1, 'hello', toDate('2018-12-14'), 1.1, 'aaa', 5); -INSERT INTO compression_codec_log VALUES(2, 'world', toDate('2018-12-15'), 2.2, 'bbb', 6); -INSERT INTO compression_codec_log VALUES(3, '!', toDate('2018-12-16'), 3.3, 'ccc', 7); +INSERT INTO compression_codec_log VALUES(1, 'hello', toDate('2018-12-14'), 1.1, 'aaa', 5, 'qpl11', 11); +INSERT INTO compression_codec_log VALUES(2, 'world', toDate('2018-12-15'), 2.2, 'bbb', 6,'qpl22', 22); +INSERT INTO compression_codec_log VALUES(3, '!', toDate('2018-12-16'), 3.3, 'ccc', 7, 'qpl33', 33); SELECT * FROM compression_codec_log ORDER BY id; -INSERT INTO compression_codec_log VALUES(2, '', toDate('2018-12-13'), 4.4, 'ddd', 8); +INSERT INTO compression_codec_log VALUES(2, '', toDate('2018-12-13'), 4.4, 'ddd', 8, 'qpl44', 44); DETACH TABLE compression_codec_log; ATTACH TABLE compression_codec_log; @@ -34,10 +37,10 @@ DROP TABLE IF EXISTS compression_codec_log; DROP TABLE IF EXISTS compression_codec_multiple_log; CREATE TABLE compression_codec_multiple_log ( - id UInt64 CODEC(LZ4, ZSTD, NONE, LZ4HC, Delta(4)), - data String CODEC(ZSTD(2), NONE, Delta(2), LZ4HC, LZ4, LZ4, Delta(8)), - ddd Date CODEC(NONE, NONE, NONE, Delta(1), LZ4, ZSTD, LZ4HC, LZ4HC), - somenum Float64 CODEC(Delta(4), LZ4, LZ4, ZSTD(2), LZ4HC(5), ZSTD(3), ZSTD) + id UInt64 CODEC(LZ4, ZSTD, NONE, LZ4HC, Delta(4), DEFLATE_QPL), + data String CODEC(ZSTD(2), NONE, Delta(2), LZ4HC, LZ4, LZ4, Delta(8), DEFLATE_QPL), + ddd Date CODEC(NONE, NONE, NONE, Delta(1), LZ4, ZSTD, LZ4HC, LZ4HC, DEFLATE_QPL), + somenum Float64 CODEC(Delta(4), LZ4, LZ4, ZSTD(2), LZ4HC(5), ZSTD(3), ZSTD, DEFLATE_QPL) ) ENGINE = Log(); SHOW CREATE TABLE compression_codec_multiple_log; @@ -69,18 +72,20 @@ CREATE TABLE compression_codec_tiny_log( ddd Date CODEC(NONE), somenum Float64 CODEC(ZSTD(2)), somestr FixedString(3) CODEC(LZ4HC(7)), - othernum Int64 CODEC(Delta) + othernum Int64 CODEC(Delta), + qplstr String CODEC(DEFLATE_QPL), + qplnum UInt32 CODEC(DEFLATE_QPL), ) ENGINE = TinyLog(); SHOW CREATE TABLE compression_codec_tiny_log; -INSERT INTO compression_codec_tiny_log VALUES(1, 'hello', toDate('2018-12-14'), 1.1, 'aaa', 5); -INSERT INTO compression_codec_tiny_log VALUES(2, 'world', toDate('2018-12-15'), 2.2, 'bbb', 6); -INSERT INTO compression_codec_tiny_log VALUES(3, '!', toDate('2018-12-16'), 3.3, 'ccc', 7); +INSERT INTO compression_codec_tiny_log VALUES(1, 'hello', toDate('2018-12-14'), 1.1, 'aaa', 5, 'qpl11', 11); +INSERT INTO compression_codec_tiny_log VALUES(2, 'world', toDate('2018-12-15'), 2.2, 'bbb', 6, 'qpl22', 22); +INSERT INTO compression_codec_tiny_log VALUES(3, '!', toDate('2018-12-16'), 3.3, 'ccc', 7, 'qpl33', 33); SELECT * FROM compression_codec_tiny_log ORDER BY id; -INSERT INTO compression_codec_tiny_log VALUES(2, '', toDate('2018-12-13'), 4.4, 'ddd', 8); +INSERT INTO compression_codec_tiny_log VALUES(2, '', toDate('2018-12-13'), 4.4, 'ddd', 8, 'qpl44', 44); DETACH TABLE compression_codec_tiny_log; ATTACH TABLE compression_codec_tiny_log; @@ -92,10 +97,10 @@ DROP TABLE IF EXISTS compression_codec_tiny_log; DROP TABLE IF EXISTS compression_codec_multiple_tiny_log; CREATE TABLE compression_codec_multiple_tiny_log ( - id UInt64 CODEC(LZ4, ZSTD, NONE, LZ4HC, Delta(4)), - data String CODEC(ZSTD(2), NONE, Delta(2), LZ4HC, LZ4, LZ4, Delta(8)), - ddd Date CODEC(NONE, NONE, NONE, Delta(1), LZ4, ZSTD, LZ4HC, LZ4HC), - somenum Float64 CODEC(Delta(4), LZ4, LZ4, ZSTD(2), LZ4HC(5), ZSTD(3), ZSTD) + id UInt64 CODEC(LZ4, ZSTD, NONE, LZ4HC, Delta(4), DEFLATE_QPL), + data String CODEC(ZSTD(2), NONE, Delta(2), LZ4HC, LZ4, LZ4, Delta(8), DEFLATE_QPL), + ddd Date CODEC(NONE, NONE, NONE, Delta(1), LZ4, ZSTD, LZ4HC, LZ4HC, DEFLATE_QPL), + somenum Float64 CODEC(Delta(4), LZ4, LZ4, ZSTD(2), LZ4HC(5), ZSTD(3), ZSTD, DEFLATE_QPL) ) ENGINE = TinyLog(); SHOW CREATE TABLE compression_codec_multiple_tiny_log; diff --git a/tests/queries/0_stateless/00804_test_deflate_qpl_codec_compression.reference b/tests/queries/0_stateless/00804_test_deflate_qpl_codec_compression.reference new file mode 100644 index 00000000000..88d274d9cba --- /dev/null +++ b/tests/queries/0_stateless/00804_test_deflate_qpl_codec_compression.reference @@ -0,0 +1,4 @@ +1 hello 2018-12-14 1.1 aaa 5 qpl11 11 +2 world 2018-12-15 2.2 bbb 6 qpl22 22 +3 ! 2018-12-16 3.3 ccc 7 qpl33 33 +2 diff --git a/tests/queries/0_stateless/00804_test_deflate_qpl_codec_compression.sql b/tests/queries/0_stateless/00804_test_deflate_qpl_codec_compression.sql new file mode 100644 index 00000000000..fe23e49804d --- /dev/null +++ b/tests/queries/0_stateless/00804_test_deflate_qpl_codec_compression.sql @@ -0,0 +1,32 @@ +SET send_logs_level = 'fatal'; +SET enable_qpl_deflate = 1; + +DROP TABLE IF EXISTS compression_codec; + +CREATE TABLE compression_codec( + id UInt64 CODEC(DEFLATE_QPL), + data String CODEC(DEFLATE_QPL), + ddd Date CODEC(DEFLATE_QPL), + somenum Float64 CODEC(DEFLATE_QPL), + somestr FixedString(3) CODEC(DEFLATE_QPL), + othernum Int64 CODEC(DEFLATE_QPL), + qplstr String CODEC(DEFLATE_QPL), + qplnum UInt32 CODEC(DEFLATE_QPL), +) ENGINE = MergeTree() ORDER BY tuple(); + +INSERT INTO compression_codec VALUES(1, 'hello', toDate('2018-12-14'), 1.1, 'aaa', 5, 'qpl11', 11); +INSERT INTO compression_codec VALUES(2, 'world', toDate('2018-12-15'), 2.2, 'bbb', 6,'qpl22', 22); +INSERT INTO compression_codec VALUES(3, '!', toDate('2018-12-16'), 3.3, 'ccc', 7, 'qpl33', 33); + +SELECT * FROM compression_codec ORDER BY id; + +OPTIMIZE TABLE compression_codec FINAL; + +INSERT INTO compression_codec VALUES(2, '', toDate('2018-12-13'), 4.4, 'ddd', 8, 'qpl44', 44); + +DETACH TABLE compression_codec; +ATTACH TABLE compression_codec; + +SELECT count(*) FROM compression_codec WHERE id = 2 GROUP BY id; + +DROP TABLE IF EXISTS compression_codec; diff --git a/tests/queries/0_stateless/00804_test_delta_codec_compression.reference b/tests/queries/0_stateless/00804_test_delta_codec_compression.reference index 949d37ed27a..37f9d4901b3 100644 --- a/tests/queries/0_stateless/00804_test_delta_codec_compression.reference +++ b/tests/queries/0_stateless/00804_test_delta_codec_compression.reference @@ -4,3 +4,5 @@ 1 32 1 +17 +1 diff --git a/tests/queries/0_stateless/00804_test_delta_codec_compression.sql b/tests/queries/0_stateless/00804_test_delta_codec_compression.sql index 25988f6474b..f9805246662 100644 --- a/tests/queries/0_stateless/00804_test_delta_codec_compression.sql +++ b/tests/queries/0_stateless/00804_test_delta_codec_compression.sql @@ -115,3 +115,41 @@ USING(key); DROP TABLE IF EXISTS delta_codec_string; DROP TABLE IF EXISTS default_codec_string; + +SET enable_qpl_deflate = 1; +DROP TABLE IF EXISTS delta_codec_string_qpl; +DROP TABLE IF EXISTS default_codec_string_qpl; + +CREATE TABLE delta_codec_string_qpl +( + id Float64 Codec(Delta, DEFLATE_QPL) +) ENGINE MergeTree() ORDER BY tuple() SETTINGS min_bytes_for_wide_part = 0, compress_marks = false, compress_primary_key=false; + +CREATE TABLE default_codec_string_qpl +( + id Float64 Codec(DEFLATE_QPL) +) ENGINE MergeTree() ORDER BY tuple() SETTINGS min_bytes_for_wide_part = 0, compress_marks = false, compress_primary_key=false; + +INSERT INTO delta_codec_string_qpl SELECT concat(toString(number), toString(number % 100)) FROM numbers(1547510400, 500000); +INSERT INTO default_codec_string_qpl SELECT * from delta_codec_string_qpl; + +OPTIMIZE TABLE delta_codec_string_qpl FINAL; +OPTIMIZE TABLE default_codec_string_qpl FINAL; + +SELECT + floor(big_size / small_size) as ratio +FROM + (SELECT 1 AS key, sum(bytes_on_disk) AS small_size FROM system.parts WHERE database = currentDatabase() and table = 'delta_codec_string_qpl' and active) +INNER JOIN + (SELECT 1 AS key, sum(bytes_on_disk) as big_size FROM system.parts WHERE database = currentDatabase() and table = 'default_codec_string_qpl' and active) USING(key); + +SELECT + small_hash == big_hash +FROM + (SELECT 1 AS key, sum(cityHash64(*)) AS small_hash FROM delta_codec_string_qpl) +INNER JOIN + (SELECT 1 AS key, sum(cityHash64(*)) AS big_hash FROM default_codec_string_qpl) +USING(key); + +DROP TABLE IF EXISTS delta_codec_string_qpl; +DROP TABLE IF EXISTS default_codec_string_qpl; From 31173ab55b0926f634c2fbfc06f7d2f34410a4ff Mon Sep 17 00:00:00 2001 From: jinjunzh Date: Wed, 24 May 2023 15:15:40 -0400 Subject: [PATCH 0709/1072] add sections of deflate_qpl for stress test and performance test --- tests/ci/stress.py | 1 + tests/performance/codecs_float_insert.xml | 2 ++ tests/performance/codecs_float_select.xml | 2 ++ tests/performance/codecs_int_insert.xml | 2 ++ tests/performance/codecs_int_select.xml | 2 ++ 5 files changed, 9 insertions(+) diff --git a/tests/ci/stress.py b/tests/ci/stress.py index b9044874071..b95cac9044e 100755 --- a/tests/ci/stress.py +++ b/tests/ci/stress.py @@ -20,6 +20,7 @@ def get_options(i, upgrade_check): '''--db-engine="Replicated('/test/db/test_{}', 's1', 'r1')"'''.format(i) ) client_options.append("allow_experimental_database_replicated=1") + client_options.append("enable_qpl_deflate=1") # If database name is not specified, new database is created for each functional test. # Run some threads with one database for all tests. diff --git a/tests/performance/codecs_float_insert.xml b/tests/performance/codecs_float_insert.xml index 64325d30189..25291f7f499 100644 --- a/tests/performance/codecs_float_insert.xml +++ b/tests/performance/codecs_float_insert.xml @@ -1,6 +1,7 @@ 1 + 1 @@ -10,6 +11,7 @@ NONE LZ4 ZSTD + DEFLATE_QPL DoubleDelta Gorilla FPC diff --git a/tests/performance/codecs_float_select.xml b/tests/performance/codecs_float_select.xml index 325c140d9a0..bb67987c75e 100644 --- a/tests/performance/codecs_float_select.xml +++ b/tests/performance/codecs_float_select.xml @@ -1,6 +1,7 @@ 1 + 1 @@ -10,6 +11,7 @@ NONE LZ4 ZSTD + DEFLATE_QPL DoubleDelta Gorilla FPC diff --git a/tests/performance/codecs_int_insert.xml b/tests/performance/codecs_int_insert.xml index 618e20160f8..1db9ee8f746 100644 --- a/tests/performance/codecs_int_insert.xml +++ b/tests/performance/codecs_int_insert.xml @@ -1,6 +1,7 @@ 1 + 1 @@ -10,6 +11,7 @@ NONE LZ4 ZSTD + DEFLATE_QPL Delta T64 DoubleDelta diff --git a/tests/performance/codecs_int_select.xml b/tests/performance/codecs_int_select.xml index 62c1ee16e7b..5dc7ab48704 100644 --- a/tests/performance/codecs_int_select.xml +++ b/tests/performance/codecs_int_select.xml @@ -1,6 +1,7 @@ 1 + 1 @@ -10,6 +11,7 @@ NONE LZ4 ZSTD + DEFLATE_QPL Delta T64 DoubleDelta From cbdb408ec8330c8ce469c68e979ca208c76d0629 Mon Sep 17 00:00:00 2001 From: jinjunzh Date: Fri, 26 May 2023 12:15:34 -0400 Subject: [PATCH 0710/1072] add USE_QPL for buildoptions --- src/Storages/System/StorageSystemBuildOptions.cpp.in | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Storages/System/StorageSystemBuildOptions.cpp.in b/src/Storages/System/StorageSystemBuildOptions.cpp.in index 3465e47449b..c2a188e7750 100644 --- a/src/Storages/System/StorageSystemBuildOptions.cpp.in +++ b/src/Storages/System/StorageSystemBuildOptions.cpp.in @@ -68,6 +68,7 @@ const char * auto_config_build[] "GIT_BRANCH", R"IRjaNsZIL9Yh7FQ4(@GIT_BRANCH@)IRjaNsZIL9Yh7FQ4", "GIT_DATE", "@GIT_DATE@", "GIT_COMMIT_SUBJECT", R"Gi17KJMlbGCjErEN(@GIT_COMMIT_SUBJECT@)Gi17KJMlbGCjErEN", + "USE_QPL", "@ENABLE_QPL@", nullptr, nullptr }; From f1192d59afa7ee2271d7ee6b5cb9d98bb27254a0 Mon Sep 17 00:00:00 2001 From: jinjunzh Date: Thu, 1 Jun 2023 12:42:22 -0400 Subject: [PATCH 0711/1072] refine patch according to comments --- .../sql-reference/statements/create/table.md | 2 +- src/Client/Connection.cpp | 2 +- src/Compression/CompressionFactory.h | 4 +- .../CompressionFactoryAdditions.cpp | 12 +++--- src/Compression/ICompressionCodec.h | 6 +-- src/Core/Settings.h | 2 +- src/Interpreters/InterpreterCreateQuery.cpp | 4 +- src/Server/TCPHandler.cpp | 2 +- src/Storages/AlterCommands.cpp | 4 +- src/Storages/Distributed/DistributedSink.cpp | 2 +- src/Storages/TTLDescription.cpp | 2 +- tests/ci/stress.py | 2 +- .../configs/enable_deflateqpl_codec.xml | 2 +- .../test_non_default_compression/test.py | 32 ++++++++-------- tests/performance/codecs_float_insert.xml | 3 +- tests/performance/codecs_float_select.xml | 3 +- tests/performance/codecs_int_insert.xml | 3 +- tests/performance/codecs_int_select.xml | 3 +- ...04_test_alter_compression_codecs.reference | 4 +- .../00804_test_alter_compression_codecs.sql | 10 ++--- ...4_test_custom_compression_codecs.reference | 6 +-- .../00804_test_custom_compression_codecs.sql | 13 +++---- ..._custom_compression_codes_log_storages.sql | 2 +- ...st_deflate_qpl_codec_compression.reference | 6 +-- ...804_test_deflate_qpl_codec_compression.sql | 16 ++++---- ...804_test_delta_codec_compression.reference | 2 - .../00804_test_delta_codec_compression.sql | 38 ------------------- 27 files changed, 71 insertions(+), 116 deletions(-) diff --git a/docs/en/sql-reference/statements/create/table.md b/docs/en/sql-reference/statements/create/table.md index b0865ad2896..d0e17410791 100644 --- a/docs/en/sql-reference/statements/create/table.md +++ b/docs/en/sql-reference/statements/create/table.md @@ -380,7 +380,7 @@ High compression levels are useful for asymmetric scenarios, like compress once, `DEFLATE_QPL` — [Deflate compression algorithm](https://github.com/intel/qpl) implemented by Intel® Query Processing Library. Some limitations apply: -- DEFLATE_QPL is disabled by default and can only be used after setting configuration parameter `enable_qpl_deflate=1`. +- DEFLATE_QPL is disabled by default and can only be used after setting configuration parameter `enable_qpl_deflate_codec=1`. - DEFLATE_QPL requires a ClickHouse build compiled with SSE 4.2 instructions (by default, this is the case). Refer to [Build Clickhouse with DEFLATE_QPL](/docs/en/development/building_and_benchmarking_deflate_qpl.md/#Build-Clickhouse-with-DEFLATE_QPL) for more details. - DEFLATE_QPL works best if the system has a Intel® IAA (In-Memory Analytics Accelerator) offloading device. Refer to [Accelerator Configuration](https://intel.github.io/qpl/documentation/get_started_docs/installation.html#accelerator-configuration) and [Benchmark with DEFLATE_QPL](/docs/en/development/building_and_benchmarking_deflate_qpl.md/#Run-Benchmark-with-DEFLATE_QPL) for more details. - DEFLATE_QPL-compressed data can only be transferred between ClickHouse nodes compiled with SSE 4.2 enabled. diff --git a/src/Client/Connection.cpp b/src/Client/Connection.cpp index 68bc3b39a56..ac8e6654e84 100644 --- a/src/Client/Connection.cpp +++ b/src/Client/Connection.cpp @@ -588,7 +588,7 @@ void Connection::sendQuery( if (method == "ZSTD") level = settings->network_zstd_compression_level; - CompressionCodecFactory::instance().validateCodec(method, level, !settings->allow_suspicious_codecs, settings->allow_experimental_codecs, settings->enable_qpl_deflate); + CompressionCodecFactory::instance().validateCodec(method, level, !settings->allow_suspicious_codecs, settings->allow_experimental_codecs, settings->enable_qpl_deflate_codec); compression_codec = CompressionCodecFactory::instance().get(method, level); } else diff --git a/src/Compression/CompressionFactory.h b/src/Compression/CompressionFactory.h index 1fdaf4f1c71..e020e51bb09 100644 --- a/src/Compression/CompressionFactory.h +++ b/src/Compression/CompressionFactory.h @@ -40,10 +40,10 @@ public: CompressionCodecPtr getDefaultCodec() const; /// Validate codecs AST specified by user and parses codecs description (substitute default parameters) - ASTPtr validateCodecAndGetPreprocessedAST(const ASTPtr & ast, const DataTypePtr & column_type, bool sanity_check, bool allow_experimental_codecs, bool enable_qpl_deflate) const; + ASTPtr validateCodecAndGetPreprocessedAST(const ASTPtr & ast, const DataTypePtr & column_type, bool sanity_check, bool allow_experimental_codecs, bool enable_qpl_deflate_codec) const; /// Validate codecs AST specified by user - void validateCodec(const String & family_name, std::optional level, bool sanity_check, bool allow_experimental_codecs, bool enable_qpl_deflate) const; + void validateCodec(const String & family_name, std::optional level, bool sanity_check, bool allow_experimental_codecs, bool enable_qpl_deflate_codec) const; /// Get codec by AST and possible column_type. Some codecs can use /// information about type to improve inner settings, but every codec should diff --git a/src/Compression/CompressionFactoryAdditions.cpp b/src/Compression/CompressionFactoryAdditions.cpp index 2630326238a..b4a2d96cf39 100644 --- a/src/Compression/CompressionFactoryAdditions.cpp +++ b/src/Compression/CompressionFactoryAdditions.cpp @@ -34,7 +34,7 @@ namespace ErrorCodes void CompressionCodecFactory::validateCodec( - const String & family_name, std::optional level, bool sanity_check, bool allow_experimental_codecs, bool enable_qpl_deflate) const + const String & family_name, std::optional level, bool sanity_check, bool allow_experimental_codecs, bool enable_qpl_deflate_codec) const { if (family_name.empty()) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Compression codec name cannot be empty"); @@ -43,13 +43,13 @@ void CompressionCodecFactory::validateCodec( { auto literal = std::make_shared(static_cast(*level)); validateCodecAndGetPreprocessedAST(makeASTFunction("CODEC", makeASTFunction(Poco::toUpper(family_name), literal)), - {}, sanity_check, allow_experimental_codecs, enable_qpl_deflate); + {}, sanity_check, allow_experimental_codecs, enable_qpl_deflate_codec); } else { auto identifier = std::make_shared(Poco::toUpper(family_name)); validateCodecAndGetPreprocessedAST(makeASTFunction("CODEC", identifier), - {}, sanity_check, allow_experimental_codecs, enable_qpl_deflate); + {}, sanity_check, allow_experimental_codecs, enable_qpl_deflate_codec); } } @@ -77,7 +77,7 @@ bool innerDataTypeIsFloat(const DataTypePtr & type) } ASTPtr CompressionCodecFactory::validateCodecAndGetPreprocessedAST( - const ASTPtr & ast, const DataTypePtr & column_type, bool sanity_check, bool allow_experimental_codecs, bool enable_qpl_deflate) const + const ASTPtr & ast, const DataTypePtr & column_type, bool sanity_check, bool allow_experimental_codecs, bool enable_qpl_deflate_codec) const { if (const auto * func = ast->as()) { @@ -159,10 +159,10 @@ ASTPtr CompressionCodecFactory::validateCodecAndGetPreprocessedAST( " You can enable it with the 'allow_experimental_codecs' setting.", codec_family_name); - if (!enable_qpl_deflate && result_codec->isDeflateQplCompression()) + if (!enable_qpl_deflate_codec && result_codec->isDeflateQplCompression()) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Codec {} is disabled by default." - " You can enable it with the 'enable_qpl_deflate' setting.", + " You can enable it with the 'enable_qpl_deflate_codec' setting.", codec_family_name); codecs_descriptions->children.emplace_back(result_codec->getCodecDesc()); diff --git a/src/Compression/ICompressionCodec.h b/src/Compression/ICompressionCodec.h index d92ad3fc718..f7e8f4e43d2 100644 --- a/src/Compression/ICompressionCodec.h +++ b/src/Compression/ICompressionCodec.h @@ -109,12 +109,12 @@ public: /// It will not be allowed to use unless the user will turn off the safety switch. virtual bool isExperimental() const { return false; } - /// If it does nothing. - virtual bool isNone() const { return false; } - /// This is a knob for Deflate QPL codec. virtual bool isDeflateQplCompression() const { return false; } + /// If it does nothing. + virtual bool isNone() const { return false; } + protected: /// This is used for fuzz testing friend int LLVMFuzzerTestOneInput(const uint8_t * data, size_t size); diff --git a/src/Core/Settings.h b/src/Core/Settings.h index c6a2069e6ae..4aae8f5d572 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -319,7 +319,7 @@ class IColumn; M(Bool, allow_distributed_ddl, true, "If it is set to true, then a user is allowed to executed distributed DDL queries.", 0) \ M(Bool, allow_suspicious_codecs, false, "If it is set to true, allow to specify meaningless compression codecs.", 0) \ M(Bool, allow_experimental_codecs, false, "If it is set to true, allow to specify experimental compression codecs (but we don't have those yet and this option does nothing).", 0) \ - M(Bool, enable_qpl_deflate, false, "If it is set to true, allow to use deflate_qpl for compression.", 0) \ + M(Bool, enable_qpl_deflate_codec, false, "If it is set to true, allow usage of the DEFLATE_QPL codec.", 0) \ M(UInt64, query_profiler_real_time_period_ns, QUERY_PROFILER_DEFAULT_SAMPLE_RATE_NS, "Period for real clock timer of query profiler (in nanoseconds). Set 0 value to turn off the real clock query profiler. Recommended value is at least 10000000 (100 times a second) for single queries or 1000000000 (once a second) for cluster-wide profiling.", 0) \ M(UInt64, query_profiler_cpu_time_period_ns, QUERY_PROFILER_DEFAULT_SAMPLE_RATE_NS, "Period for CPU clock timer of query profiler (in nanoseconds). Set 0 value to turn off the CPU clock query profiler. Recommended value is at least 10000000 (100 times a second) for single queries or 1000000000 (once a second) for cluster-wide profiling.", 0) \ M(Bool, metrics_perf_events_enabled, false, "If enabled, some of the perf events will be measured throughout queries' execution.", 0) \ diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 5c22b46b360..ddb53bbbfaa 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -571,7 +571,7 @@ ColumnsDescription InterpreterCreateQuery::getColumnsDescription( bool sanity_check_compression_codecs = !attach && !context_->getSettingsRef().allow_suspicious_codecs; bool allow_experimental_codecs = attach || context_->getSettingsRef().allow_experimental_codecs; - bool enable_qpl_deflate = attach || context_->getSettingsRef().enable_qpl_deflate; + bool enable_qpl_deflate_codec = attach || context_->getSettingsRef().enable_qpl_deflate_codec; ColumnsDescription res; auto name_type_it = column_names_and_types.begin(); @@ -632,7 +632,7 @@ ColumnsDescription InterpreterCreateQuery::getColumnsDescription( if (col_decl.default_specifier == "ALIAS") throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot specify codec for column type ALIAS"); column.codec = CompressionCodecFactory::instance().validateCodecAndGetPreprocessedAST( - col_decl.codec, column.type, sanity_check_compression_codecs, allow_experimental_codecs, enable_qpl_deflate); + col_decl.codec, column.type, sanity_check_compression_codecs, allow_experimental_codecs, enable_qpl_deflate_codec); } if (col_decl.ttl) diff --git a/src/Server/TCPHandler.cpp b/src/Server/TCPHandler.cpp index 96c585e7d16..b43fef9dd54 100644 --- a/src/Server/TCPHandler.cpp +++ b/src/Server/TCPHandler.cpp @@ -1775,7 +1775,7 @@ void TCPHandler::initBlockOutput(const Block & block) if (state.compression == Protocol::Compression::Enable) { - CompressionCodecFactory::instance().validateCodec(method, level, !query_settings.allow_suspicious_codecs, query_settings.allow_experimental_codecs, query_settings.enable_qpl_deflate); + CompressionCodecFactory::instance().validateCodec(method, level, !query_settings.allow_suspicious_codecs, query_settings.allow_experimental_codecs, query_settings.enable_qpl_deflate_codec); state.maybe_compressed_out = std::make_shared( *out, CompressionCodecFactory::instance().get(method, level)); diff --git a/src/Storages/AlterCommands.cpp b/src/Storages/AlterCommands.cpp index ecbddfc3e2a..73d7be8dc56 100644 --- a/src/Storages/AlterCommands.cpp +++ b/src/Storages/AlterCommands.cpp @@ -1067,7 +1067,7 @@ void AlterCommands::validate(const StoragePtr & table, ContextPtr context) const "this column name is reserved for lightweight delete feature", backQuote(column_name)); if (command.codec) - CompressionCodecFactory::instance().validateCodecAndGetPreprocessedAST(command.codec, command.data_type, !context->getSettingsRef().allow_suspicious_codecs, context->getSettingsRef().allow_experimental_codecs, context->getSettingsRef().enable_qpl_deflate); + CompressionCodecFactory::instance().validateCodecAndGetPreprocessedAST(command.codec, command.data_type, !context->getSettingsRef().allow_suspicious_codecs, context->getSettingsRef().allow_experimental_codecs, context->getSettingsRef().enable_qpl_deflate_codec); all_columns.add(ColumnDescription(column_name, command.data_type)); } @@ -1093,7 +1093,7 @@ void AlterCommands::validate(const StoragePtr & table, ContextPtr context) const { if (all_columns.hasAlias(column_name)) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot specify codec for column type ALIAS"); - CompressionCodecFactory::instance().validateCodecAndGetPreprocessedAST(command.codec, command.data_type, !context->getSettingsRef().allow_suspicious_codecs, context->getSettingsRef().allow_experimental_codecs, context->getSettingsRef().enable_qpl_deflate); + CompressionCodecFactory::instance().validateCodecAndGetPreprocessedAST(command.codec, command.data_type, !context->getSettingsRef().allow_suspicious_codecs, context->getSettingsRef().allow_experimental_codecs, context->getSettingsRef().enable_qpl_deflate_codec); } auto column_default = all_columns.getDefault(column_name); if (column_default) diff --git a/src/Storages/Distributed/DistributedSink.cpp b/src/Storages/Distributed/DistributedSink.cpp index ce1dbde8eae..e383890d1f7 100644 --- a/src/Storages/Distributed/DistributedSink.cpp +++ b/src/Storages/Distributed/DistributedSink.cpp @@ -733,7 +733,7 @@ void DistributedSink::writeToShard(const Cluster::ShardInfo & shard_info, const if (compression_method == "ZSTD") compression_level = settings.network_zstd_compression_level; - CompressionCodecFactory::instance().validateCodec(compression_method, compression_level, !settings.allow_suspicious_codecs, settings.allow_experimental_codecs, settings.enable_qpl_deflate); + CompressionCodecFactory::instance().validateCodec(compression_method, compression_level, !settings.allow_suspicious_codecs, settings.allow_experimental_codecs, settings.enable_qpl_deflate_codec); CompressionCodecPtr compression_codec = CompressionCodecFactory::instance().get(compression_method, compression_level); /// tmp directory is used to ensure atomicity of transactions diff --git a/src/Storages/TTLDescription.cpp b/src/Storages/TTLDescription.cpp index f5209cbdff6..a437465b3fe 100644 --- a/src/Storages/TTLDescription.cpp +++ b/src/Storages/TTLDescription.cpp @@ -285,7 +285,7 @@ TTLDescription TTLDescription::getTTLFromAST( { result.recompression_codec = CompressionCodecFactory::instance().validateCodecAndGetPreprocessedAST( - ttl_element->recompression_codec, {}, !context->getSettingsRef().allow_suspicious_codecs, context->getSettingsRef().allow_experimental_codecs, context->getSettingsRef().enable_qpl_deflate); + ttl_element->recompression_codec, {}, !context->getSettingsRef().allow_suspicious_codecs, context->getSettingsRef().allow_experimental_codecs, context->getSettingsRef().enable_qpl_deflate_codec); } } diff --git a/tests/ci/stress.py b/tests/ci/stress.py index b95cac9044e..e5ceb251d0f 100755 --- a/tests/ci/stress.py +++ b/tests/ci/stress.py @@ -20,7 +20,7 @@ def get_options(i, upgrade_check): '''--db-engine="Replicated('/test/db/test_{}', 's1', 'r1')"'''.format(i) ) client_options.append("allow_experimental_database_replicated=1") - client_options.append("enable_qpl_deflate=1") + client_options.append("enable_qpl_deflate_codec=1") # If database name is not specified, new database is created for each functional test. # Run some threads with one database for all tests. diff --git a/tests/integration/test_non_default_compression/configs/enable_deflateqpl_codec.xml b/tests/integration/test_non_default_compression/configs/enable_deflateqpl_codec.xml index 46e9e43ca27..521b0fd663c 100644 --- a/tests/integration/test_non_default_compression/configs/enable_deflateqpl_codec.xml +++ b/tests/integration/test_non_default_compression/configs/enable_deflateqpl_codec.xml @@ -1,7 +1,7 @@ - 1 + 1 diff --git a/tests/integration/test_non_default_compression/test.py b/tests/integration/test_non_default_compression/test.py index e69b32daae0..e1a9c1ae540 100644 --- a/tests/integration/test_non_default_compression/test.py +++ b/tests/integration/test_non_default_compression/test.py @@ -38,16 +38,16 @@ node5 = cluster.add_instance( ) node6 = cluster.add_instance( "node6", - main_configs=["configs/allow_experimental_codecs.xml"], - user_configs=["configs/allow_suspicious_codecs.xml"], + main_configs=["configs/deflateqpl_compression_by_default.xml"], + user_configs=[ + "configs/allow_suspicious_codecs.xml", + "configs/enable_deflateqpl_codec.xml", + ], ) node7 = cluster.add_instance( "node7", - main_configs=["configs/deflateqpl_compression_by_default.xml"], - user_configs=[ - "configs/enable_deflateqpl_codec.xml", - "configs/allow_suspicious_codecs.xml", - ], + main_configs=["configs/allow_experimental_codecs.xml"], + user_configs=["configs/allow_suspicious_codecs.xml"], ) @pytest.fixture(scope="module") @@ -253,7 +253,7 @@ def test_uncompressed_cache_plus_zstd_codec(start_cluster): ) def test_preconfigured_deflateqpl_codec(start_cluster): - node7.query( + node6.query( """ CREATE TABLE compression_codec_multiple_with_key ( somedate Date CODEC(ZSTD, ZSTD, ZSTD(12), LZ4HC(12), DEFLATE_QPL), @@ -263,46 +263,46 @@ def test_preconfigured_deflateqpl_codec(start_cluster): ) ENGINE = MergeTree() PARTITION BY somedate ORDER BY id SETTINGS index_granularity = 2; """ ) - node7.query( + node6.query( "INSERT INTO compression_codec_multiple_with_key VALUES(toDate('2018-10-12'), 100000, 'hello', 88.88), (toDate('2018-10-12'), 100002, 'world', 99.99), (toDate('2018-10-12'), 1111, '!', 777.777)" ) assert ( - node7.query( + node6.query( "SELECT COUNT(*) FROM compression_codec_multiple_with_key WHERE id % 2 == 0" ) == "2\n" ) assert ( - node7.query( + node6.query( "SELECT DISTINCT somecolumn FROM compression_codec_multiple_with_key ORDER BY id" ) == "777.777\n88.88\n99.99\n" ) assert ( - node7.query( + node6.query( "SELECT data FROM compression_codec_multiple_with_key WHERE id >= 1112 AND somedate = toDate('2018-10-12') AND somecolumn <= 100" ) == "hello\nworld\n" ) - node7.query( + node6.query( "INSERT INTO compression_codec_multiple_with_key SELECT toDate('2018-10-12'), number, toString(number), 1.0 FROM system.numbers LIMIT 10000" ) assert ( - node7.query( + node6.query( "SELECT COUNT(id) FROM compression_codec_multiple_with_key WHERE id % 10 == 0" ) == "1001\n" ) assert ( - node7.query( + node6.query( "SELECT SUM(somecolumn) FROM compression_codec_multiple_with_key" ) == str(777.777 + 88.88 + 99.99 + 1.0 * 10000) + "\n" ) assert ( - node7.query( + node6.query( "SELECT count(*) FROM compression_codec_multiple_with_key GROUP BY somedate" ) == "10003\n" diff --git a/tests/performance/codecs_float_insert.xml b/tests/performance/codecs_float_insert.xml index 25291f7f499..be0935ad4cf 100644 --- a/tests/performance/codecs_float_insert.xml +++ b/tests/performance/codecs_float_insert.xml @@ -1,7 +1,7 @@ 1 - 1 + 1 @@ -11,7 +11,6 @@ NONE LZ4 ZSTD - DEFLATE_QPL DoubleDelta Gorilla FPC diff --git a/tests/performance/codecs_float_select.xml b/tests/performance/codecs_float_select.xml index bb67987c75e..844ab4508d8 100644 --- a/tests/performance/codecs_float_select.xml +++ b/tests/performance/codecs_float_select.xml @@ -1,7 +1,7 @@ 1 - 1 + 1 @@ -11,7 +11,6 @@ NONE LZ4 ZSTD - DEFLATE_QPL DoubleDelta Gorilla FPC diff --git a/tests/performance/codecs_int_insert.xml b/tests/performance/codecs_int_insert.xml index 1db9ee8f746..d5f12810118 100644 --- a/tests/performance/codecs_int_insert.xml +++ b/tests/performance/codecs_int_insert.xml @@ -1,7 +1,7 @@ 1 - 1 + 1 @@ -11,7 +11,6 @@ NONE LZ4 ZSTD - DEFLATE_QPL Delta T64 DoubleDelta diff --git a/tests/performance/codecs_int_select.xml b/tests/performance/codecs_int_select.xml index 5dc7ab48704..06b2c2a73f3 100644 --- a/tests/performance/codecs_int_select.xml +++ b/tests/performance/codecs_int_select.xml @@ -1,7 +1,7 @@ 1 - 1 + 1 @@ -11,7 +11,6 @@ NONE LZ4 ZSTD - DEFLATE_QPL Delta T64 DoubleDelta diff --git a/tests/queries/0_stateless/00804_test_alter_compression_codecs.reference b/tests/queries/0_stateless/00804_test_alter_compression_codecs.reference index a6afe11126c..5c77a102740 100644 --- a/tests/queries/0_stateless/00804_test_alter_compression_codecs.reference +++ b/tests/queries/0_stateless/00804_test_alter_compression_codecs.reference @@ -12,15 +12,13 @@ CODEC(NONE) 2018-01-01 4 4 2018-01-01 5 5 2018-01-01 6 6 -CODEC(DEFLATE_QPL) 2018-01-01 1 default_value 2018-01-01 2 default_value 2018-01-01 3 3 2018-01-01 4 4 2018-01-01 5 5 2018-01-01 6 6 -2018-01-01 7 7 -2018-01-01 8 8 +CODEC(DEFLATE_QPL) 2018-01-01 1 default_value 2018-01-01 2 default_value 2018-01-01 3 3 diff --git a/tests/queries/0_stateless/00804_test_alter_compression_codecs.sql b/tests/queries/0_stateless/00804_test_alter_compression_codecs.sql index 40a8bb4c7cb..5b8b73270a2 100644 --- a/tests/queries/0_stateless/00804_test_alter_compression_codecs.sql +++ b/tests/queries/0_stateless/00804_test_alter_compression_codecs.sql @@ -25,7 +25,10 @@ INSERT INTO alter_compression_codec VALUES('2018-01-01', 5, '5'); INSERT INTO alter_compression_codec VALUES('2018-01-01', 6, '6'); SELECT * FROM alter_compression_codec ORDER BY id; -SET enable_qpl_deflate = 1; +OPTIMIZE TABLE alter_compression_codec FINAL; +SELECT * FROM alter_compression_codec ORDER BY id; + +SET enable_qpl_deflate_codec = 1; ALTER TABLE alter_compression_codec MODIFY COLUMN alter_column CODEC(DEFLATE_QPL); SELECT compression_codec FROM system.columns WHERE database = currentDatabase() AND table = 'alter_compression_codec' AND name = 'alter_column'; @@ -33,9 +36,6 @@ INSERT INTO alter_compression_codec VALUES('2018-01-01', 7, '7'); INSERT INTO alter_compression_codec VALUES('2018-01-01', 8, '8'); SELECT * FROM alter_compression_codec ORDER BY id; -OPTIMIZE TABLE alter_compression_codec FINAL; -SELECT * FROM alter_compression_codec ORDER BY id; - SET allow_suspicious_codecs = 1; ALTER TABLE alter_compression_codec MODIFY COLUMN alter_column CODEC(ZSTD, LZ4HC, LZ4, LZ4, DEFLATE_QPL, NONE); SELECT compression_codec FROM system.columns WHERE database = currentDatabase() AND table = 'alter_compression_codec' AND name = 'alter_column'; @@ -62,7 +62,7 @@ ALTER TABLE alter_bad_codec ADD COLUMN alter_column DateTime DEFAULT '2019-01-01 ALTER TABLE alter_bad_codec ADD COLUMN alter_column DateTime DEFAULT '2019-01-01 00:00:00' CODEC(ZSTD(100)); -- { serverError 433 } -ALTER TABLE alter_bad_codec ADD COLUMN alter_column DateTime DEFAULT '2019-01-01 00:00:00' CODEC(DEFLATE_QPL(100)); -- { serverError 378 } +ALTER TABLE alter_bad_codec ADD COLUMN alter_column DateTime DEFAULT '2019-01-01 00:00:00' CODEC(DEFLATE_QPL(100)); -- { serverError DATA_TYPE_CANNOT_HAVE_ARGUMENTS } DROP TABLE IF EXISTS alter_bad_codec; diff --git a/tests/queries/0_stateless/00804_test_custom_compression_codecs.reference b/tests/queries/0_stateless/00804_test_custom_compression_codecs.reference index a9cbe3d32d3..8b51d65004a 100644 --- a/tests/queries/0_stateless/00804_test_custom_compression_codecs.reference +++ b/tests/queries/0_stateless/00804_test_custom_compression_codecs.reference @@ -1,6 +1,6 @@ -1 hello 2018-12-14 1.1 aaa 5 qpl11 11 -2 world 2018-12-15 2.2 bbb 6 qpl22 22 -3 ! 2018-12-16 3.3 ccc 7 qpl33 33 +1 hello 2018-12-14 2018-12-14 1.1 aaa 5 +2 world 2018-12-15 2018-12-15 2.2 bbb 6 +3 ! 2018-12-16 2018-12-16 3.3 ccc 7 2 1 world 2018-10-05 1.1 2 hello 2018-10-01 2.2 diff --git a/tests/queries/0_stateless/00804_test_custom_compression_codecs.sql b/tests/queries/0_stateless/00804_test_custom_compression_codecs.sql index 44a0daada27..47ec268bfec 100644 --- a/tests/queries/0_stateless/00804_test_custom_compression_codecs.sql +++ b/tests/queries/0_stateless/00804_test_custom_compression_codecs.sql @@ -1,6 +1,6 @@ SET send_logs_level = 'fatal'; SET allow_suspicious_codecs = 1; -SET enable_qpl_deflate = 1; +SET enable_qpl_deflate_codec = 1; DROP TABLE IF EXISTS compression_codec; @@ -8,22 +8,21 @@ CREATE TABLE compression_codec( id UInt64 CODEC(LZ4), data String CODEC(ZSTD), ddd Date CODEC(NONE), + ddd32 Date32 CODEC(DEFLATE_QPL), somenum Float64 CODEC(ZSTD(2)), somestr FixedString(3) CODEC(LZ4HC(7)), othernum Int64 CODEC(Delta), - qplstr String CODEC(DEFLATE_QPL), - qplnum UInt32 CODEC(DEFLATE_QPL), ) ENGINE = MergeTree() ORDER BY tuple(); -INSERT INTO compression_codec VALUES(1, 'hello', toDate('2018-12-14'), 1.1, 'aaa', 5, 'qpl11', 11); -INSERT INTO compression_codec VALUES(2, 'world', toDate('2018-12-15'), 2.2, 'bbb', 6,'qpl22', 22); -INSERT INTO compression_codec VALUES(3, '!', toDate('2018-12-16'), 3.3, 'ccc', 7, 'qpl33', 33); +INSERT INTO compression_codec VALUES(1, 'hello', toDate('2018-12-14'), toDate32('2018-12-14'), 1.1, 'aaa', 5); +INSERT INTO compression_codec VALUES(2, 'world', toDate('2018-12-15'), toDate32('2018-12-15'), 2.2, 'bbb', 6); +INSERT INTO compression_codec VALUES(3, '!', toDate('2018-12-16'), toDate32('2018-12-16'), 3.3, 'ccc', 7); SELECT * FROM compression_codec ORDER BY id; OPTIMIZE TABLE compression_codec FINAL; -INSERT INTO compression_codec VALUES(2, '', toDate('2018-12-13'), 4.4, 'ddd', 8, 'qpl44', 44); +INSERT INTO compression_codec VALUES(2, '', toDate('2018-12-13'), toDate32('2018-12-13'), 4.4, 'ddd', 8); DETACH TABLE compression_codec; ATTACH TABLE compression_codec; diff --git a/tests/queries/0_stateless/00804_test_custom_compression_codes_log_storages.sql b/tests/queries/0_stateless/00804_test_custom_compression_codes_log_storages.sql index 113f26732e7..bcd09277824 100644 --- a/tests/queries/0_stateless/00804_test_custom_compression_codes_log_storages.sql +++ b/tests/queries/0_stateless/00804_test_custom_compression_codes_log_storages.sql @@ -1,6 +1,6 @@ SET send_logs_level = 'fatal'; SET allow_suspicious_codecs = 1; -SET enable_qpl_deflate = 1; +SET enable_qpl_deflate_codec = 1; -- copy-paste for storage log diff --git a/tests/queries/0_stateless/00804_test_deflate_qpl_codec_compression.reference b/tests/queries/0_stateless/00804_test_deflate_qpl_codec_compression.reference index 88d274d9cba..276747f8233 100644 --- a/tests/queries/0_stateless/00804_test_deflate_qpl_codec_compression.reference +++ b/tests/queries/0_stateless/00804_test_deflate_qpl_codec_compression.reference @@ -1,4 +1,4 @@ -1 hello 2018-12-14 1.1 aaa 5 qpl11 11 -2 world 2018-12-15 2.2 bbb 6 qpl22 22 -3 ! 2018-12-16 3.3 ccc 7 qpl33 33 +1 hello 2018-12-14 2018-12-14 1.1 aaa 5 [1,2,3] {'k1':1,'k2':2} (1,2) +2 world 2018-12-15 2018-12-15 2.2 bbb 6 [4,5,6] {'k3':3,'k4':4} (3,4) +3 ! 2018-12-16 2018-12-16 3.3 ccc 7 [7,8,9] {'k5':5,'k6':6} (5,6) 2 diff --git a/tests/queries/0_stateless/00804_test_deflate_qpl_codec_compression.sql b/tests/queries/0_stateless/00804_test_deflate_qpl_codec_compression.sql index fe23e49804d..64e66d47522 100644 --- a/tests/queries/0_stateless/00804_test_deflate_qpl_codec_compression.sql +++ b/tests/queries/0_stateless/00804_test_deflate_qpl_codec_compression.sql @@ -1,5 +1,5 @@ SET send_logs_level = 'fatal'; -SET enable_qpl_deflate = 1; +SET enable_qpl_deflate_codec = 1; DROP TABLE IF EXISTS compression_codec; @@ -7,22 +7,24 @@ CREATE TABLE compression_codec( id UInt64 CODEC(DEFLATE_QPL), data String CODEC(DEFLATE_QPL), ddd Date CODEC(DEFLATE_QPL), + ddd32 Date32 CODEC(DEFLATE_QPL), somenum Float64 CODEC(DEFLATE_QPL), somestr FixedString(3) CODEC(DEFLATE_QPL), othernum Int64 CODEC(DEFLATE_QPL), - qplstr String CODEC(DEFLATE_QPL), - qplnum UInt32 CODEC(DEFLATE_QPL), + somearray Array(UInt8) CODEC(DEFLATE_QPL), + somemap Map(String, UInt32) CODEC(DEFLATE_QPL), + sometuple Tuple(UInt16, UInt64) CODEC(DEFLATE_QPL), ) ENGINE = MergeTree() ORDER BY tuple(); -INSERT INTO compression_codec VALUES(1, 'hello', toDate('2018-12-14'), 1.1, 'aaa', 5, 'qpl11', 11); -INSERT INTO compression_codec VALUES(2, 'world', toDate('2018-12-15'), 2.2, 'bbb', 6,'qpl22', 22); -INSERT INTO compression_codec VALUES(3, '!', toDate('2018-12-16'), 3.3, 'ccc', 7, 'qpl33', 33); +INSERT INTO compression_codec VALUES(1, 'hello', toDate('2018-12-14'), toDate32('2018-12-14'), 1.1, 'aaa', 5, [1,2,3], map('k1',1,'k2',2), tuple(1,2)); +INSERT INTO compression_codec VALUES(2, 'world', toDate('2018-12-15'), toDate32('2018-12-15'), 2.2, 'bbb', 6, [4,5,6], map('k3',3,'k4',4), tuple(3,4)); +INSERT INTO compression_codec VALUES(3, '!', toDate('2018-12-16'), toDate32('2018-12-16'), 3.3, 'ccc', 7, [7,8,9], map('k5',5,'k6',6), tuple(5,6)); SELECT * FROM compression_codec ORDER BY id; OPTIMIZE TABLE compression_codec FINAL; -INSERT INTO compression_codec VALUES(2, '', toDate('2018-12-13'), 4.4, 'ddd', 8, 'qpl44', 44); +INSERT INTO compression_codec VALUES(2, '', toDate('2018-12-13'), toDate32('2018-12-13'), 4.4, 'ddd', 8, [10,11,12], map('k7',7,'k8',8), tuple(7,8)); DETACH TABLE compression_codec; ATTACH TABLE compression_codec; diff --git a/tests/queries/0_stateless/00804_test_delta_codec_compression.reference b/tests/queries/0_stateless/00804_test_delta_codec_compression.reference index 37f9d4901b3..949d37ed27a 100644 --- a/tests/queries/0_stateless/00804_test_delta_codec_compression.reference +++ b/tests/queries/0_stateless/00804_test_delta_codec_compression.reference @@ -4,5 +4,3 @@ 1 32 1 -17 -1 diff --git a/tests/queries/0_stateless/00804_test_delta_codec_compression.sql b/tests/queries/0_stateless/00804_test_delta_codec_compression.sql index f9805246662..25988f6474b 100644 --- a/tests/queries/0_stateless/00804_test_delta_codec_compression.sql +++ b/tests/queries/0_stateless/00804_test_delta_codec_compression.sql @@ -115,41 +115,3 @@ USING(key); DROP TABLE IF EXISTS delta_codec_string; DROP TABLE IF EXISTS default_codec_string; - -SET enable_qpl_deflate = 1; -DROP TABLE IF EXISTS delta_codec_string_qpl; -DROP TABLE IF EXISTS default_codec_string_qpl; - -CREATE TABLE delta_codec_string_qpl -( - id Float64 Codec(Delta, DEFLATE_QPL) -) ENGINE MergeTree() ORDER BY tuple() SETTINGS min_bytes_for_wide_part = 0, compress_marks = false, compress_primary_key=false; - -CREATE TABLE default_codec_string_qpl -( - id Float64 Codec(DEFLATE_QPL) -) ENGINE MergeTree() ORDER BY tuple() SETTINGS min_bytes_for_wide_part = 0, compress_marks = false, compress_primary_key=false; - -INSERT INTO delta_codec_string_qpl SELECT concat(toString(number), toString(number % 100)) FROM numbers(1547510400, 500000); -INSERT INTO default_codec_string_qpl SELECT * from delta_codec_string_qpl; - -OPTIMIZE TABLE delta_codec_string_qpl FINAL; -OPTIMIZE TABLE default_codec_string_qpl FINAL; - -SELECT - floor(big_size / small_size) as ratio -FROM - (SELECT 1 AS key, sum(bytes_on_disk) AS small_size FROM system.parts WHERE database = currentDatabase() and table = 'delta_codec_string_qpl' and active) -INNER JOIN - (SELECT 1 AS key, sum(bytes_on_disk) as big_size FROM system.parts WHERE database = currentDatabase() and table = 'default_codec_string_qpl' and active) USING(key); - -SELECT - small_hash == big_hash -FROM - (SELECT 1 AS key, sum(cityHash64(*)) AS small_hash FROM delta_codec_string_qpl) -INNER JOIN - (SELECT 1 AS key, sum(cityHash64(*)) AS big_hash FROM default_codec_string_qpl) -USING(key); - -DROP TABLE IF EXISTS delta_codec_string_qpl; -DROP TABLE IF EXISTS default_codec_string_qpl; From aae281eb7df6ce8e00d872d3ef0d0558781a5f1a Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 1 Jun 2023 15:49:52 +0200 Subject: [PATCH 0712/1072] Update codecs_float_insert.xml --- tests/performance/codecs_float_insert.xml | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/performance/codecs_float_insert.xml b/tests/performance/codecs_float_insert.xml index be0935ad4cf..64325d30189 100644 --- a/tests/performance/codecs_float_insert.xml +++ b/tests/performance/codecs_float_insert.xml @@ -1,7 +1,6 @@ 1 - 1 From dc93b6324ee505228b96791db629b7437f6db931 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 1 Jun 2023 15:50:28 +0200 Subject: [PATCH 0713/1072] Update codecs_float_select.xml --- tests/performance/codecs_float_select.xml | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/performance/codecs_float_select.xml b/tests/performance/codecs_float_select.xml index 844ab4508d8..325c140d9a0 100644 --- a/tests/performance/codecs_float_select.xml +++ b/tests/performance/codecs_float_select.xml @@ -1,7 +1,6 @@ 1 - 1 From 7043db669e4e445764d99cd749cfef99d3f437cf Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 1 Jun 2023 15:50:40 +0200 Subject: [PATCH 0714/1072] Update codecs_int_insert.xml --- tests/performance/codecs_int_insert.xml | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/performance/codecs_int_insert.xml b/tests/performance/codecs_int_insert.xml index d5f12810118..618e20160f8 100644 --- a/tests/performance/codecs_int_insert.xml +++ b/tests/performance/codecs_int_insert.xml @@ -1,7 +1,6 @@ 1 - 1 From 4d7364af97893c4457a86a064628ff478d900c05 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 1 Jun 2023 15:50:49 +0200 Subject: [PATCH 0715/1072] Update codecs_int_select.xml --- tests/performance/codecs_int_select.xml | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/performance/codecs_int_select.xml b/tests/performance/codecs_int_select.xml index 06b2c2a73f3..62c1ee16e7b 100644 --- a/tests/performance/codecs_int_select.xml +++ b/tests/performance/codecs_int_select.xml @@ -1,7 +1,6 @@ 1 - 1 From 1f928f2d3d0eea55ff1743cea386162fd87fed92 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 1 Jun 2023 15:53:48 +0200 Subject: [PATCH 0716/1072] Update StorageSystemBuildOptions.cpp.in --- src/Storages/System/StorageSystemBuildOptions.cpp.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/System/StorageSystemBuildOptions.cpp.in b/src/Storages/System/StorageSystemBuildOptions.cpp.in index c2a188e7750..c2d35c96ce5 100644 --- a/src/Storages/System/StorageSystemBuildOptions.cpp.in +++ b/src/Storages/System/StorageSystemBuildOptions.cpp.in @@ -64,11 +64,11 @@ const char * auto_config_build[] "USE_ARROW", "@USE_ARROW@", "USE_ORC", "@USE_ORC@", "USE_MSGPACK", "@USE_MSGPACK@", + "USE_QPL", "@ENABLE_QPL@", "GIT_HASH", "@GIT_HASH@", "GIT_BRANCH", R"IRjaNsZIL9Yh7FQ4(@GIT_BRANCH@)IRjaNsZIL9Yh7FQ4", "GIT_DATE", "@GIT_DATE@", "GIT_COMMIT_SUBJECT", R"Gi17KJMlbGCjErEN(@GIT_COMMIT_SUBJECT@)Gi17KJMlbGCjErEN", - "USE_QPL", "@ENABLE_QPL@", nullptr, nullptr }; From 1aa158909e434438733504d2dbcd9ea9d113e41b Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Fri, 9 Jun 2023 12:38:38 +0000 Subject: [PATCH 0717/1072] enable_qpl_deflate_codec --> enable_deflate_qpl_codec --- docs/en/sql-reference/statements/create/table.md | 2 +- src/Client/Connection.cpp | 2 +- src/Compression/CompressionFactory.h | 4 ++-- src/Compression/CompressionFactoryAdditions.cpp | 12 ++++++------ src/Core/Settings.h | 2 +- src/Interpreters/InterpreterCreateQuery.cpp | 4 ++-- src/Server/TCPHandler.cpp | 2 +- src/Storages/AlterCommands.cpp | 4 ++-- src/Storages/Distributed/DistributedSink.cpp | 2 +- src/Storages/TTLDescription.cpp | 2 +- tests/ci/stress.py | 2 +- .../configs/enable_deflateqpl_codec.xml | 2 +- .../00804_test_alter_compression_codecs.sql | 2 +- .../00804_test_custom_compression_codecs.sql | 2 +- ...04_test_custom_compression_codes_log_storages.sql | 2 +- .../00804_test_deflate_qpl_codec_compression.sql | 2 +- 16 files changed, 24 insertions(+), 24 deletions(-) diff --git a/docs/en/sql-reference/statements/create/table.md b/docs/en/sql-reference/statements/create/table.md index d0e17410791..496ecdbda7b 100644 --- a/docs/en/sql-reference/statements/create/table.md +++ b/docs/en/sql-reference/statements/create/table.md @@ -380,7 +380,7 @@ High compression levels are useful for asymmetric scenarios, like compress once, `DEFLATE_QPL` — [Deflate compression algorithm](https://github.com/intel/qpl) implemented by Intel® Query Processing Library. Some limitations apply: -- DEFLATE_QPL is disabled by default and can only be used after setting configuration parameter `enable_qpl_deflate_codec=1`. +- DEFLATE_QPL is disabled by default and can only be used after setting configuration parameter `enable_deflate_qpl_codec = 1`. - DEFLATE_QPL requires a ClickHouse build compiled with SSE 4.2 instructions (by default, this is the case). Refer to [Build Clickhouse with DEFLATE_QPL](/docs/en/development/building_and_benchmarking_deflate_qpl.md/#Build-Clickhouse-with-DEFLATE_QPL) for more details. - DEFLATE_QPL works best if the system has a Intel® IAA (In-Memory Analytics Accelerator) offloading device. Refer to [Accelerator Configuration](https://intel.github.io/qpl/documentation/get_started_docs/installation.html#accelerator-configuration) and [Benchmark with DEFLATE_QPL](/docs/en/development/building_and_benchmarking_deflate_qpl.md/#Run-Benchmark-with-DEFLATE_QPL) for more details. - DEFLATE_QPL-compressed data can only be transferred between ClickHouse nodes compiled with SSE 4.2 enabled. diff --git a/src/Client/Connection.cpp b/src/Client/Connection.cpp index ac8e6654e84..636532ade4b 100644 --- a/src/Client/Connection.cpp +++ b/src/Client/Connection.cpp @@ -588,7 +588,7 @@ void Connection::sendQuery( if (method == "ZSTD") level = settings->network_zstd_compression_level; - CompressionCodecFactory::instance().validateCodec(method, level, !settings->allow_suspicious_codecs, settings->allow_experimental_codecs, settings->enable_qpl_deflate_codec); + CompressionCodecFactory::instance().validateCodec(method, level, !settings->allow_suspicious_codecs, settings->allow_experimental_codecs, settings->enable_deflate_qpl_codec); compression_codec = CompressionCodecFactory::instance().get(method, level); } else diff --git a/src/Compression/CompressionFactory.h b/src/Compression/CompressionFactory.h index e020e51bb09..4f2627587a3 100644 --- a/src/Compression/CompressionFactory.h +++ b/src/Compression/CompressionFactory.h @@ -40,10 +40,10 @@ public: CompressionCodecPtr getDefaultCodec() const; /// Validate codecs AST specified by user and parses codecs description (substitute default parameters) - ASTPtr validateCodecAndGetPreprocessedAST(const ASTPtr & ast, const DataTypePtr & column_type, bool sanity_check, bool allow_experimental_codecs, bool enable_qpl_deflate_codec) const; + ASTPtr validateCodecAndGetPreprocessedAST(const ASTPtr & ast, const DataTypePtr & column_type, bool sanity_check, bool allow_experimental_codecs, bool enable_deflate_qpl_codec) const; /// Validate codecs AST specified by user - void validateCodec(const String & family_name, std::optional level, bool sanity_check, bool allow_experimental_codecs, bool enable_qpl_deflate_codec) const; + void validateCodec(const String & family_name, std::optional level, bool sanity_check, bool allow_experimental_codecs, bool enable_deflate_qpl_codec) const; /// Get codec by AST and possible column_type. Some codecs can use /// information about type to improve inner settings, but every codec should diff --git a/src/Compression/CompressionFactoryAdditions.cpp b/src/Compression/CompressionFactoryAdditions.cpp index b4a2d96cf39..46f7e2653c2 100644 --- a/src/Compression/CompressionFactoryAdditions.cpp +++ b/src/Compression/CompressionFactoryAdditions.cpp @@ -34,7 +34,7 @@ namespace ErrorCodes void CompressionCodecFactory::validateCodec( - const String & family_name, std::optional level, bool sanity_check, bool allow_experimental_codecs, bool enable_qpl_deflate_codec) const + const String & family_name, std::optional level, bool sanity_check, bool allow_experimental_codecs, bool enable_deflate_qpl_codec) const { if (family_name.empty()) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Compression codec name cannot be empty"); @@ -43,13 +43,13 @@ void CompressionCodecFactory::validateCodec( { auto literal = std::make_shared(static_cast(*level)); validateCodecAndGetPreprocessedAST(makeASTFunction("CODEC", makeASTFunction(Poco::toUpper(family_name), literal)), - {}, sanity_check, allow_experimental_codecs, enable_qpl_deflate_codec); + {}, sanity_check, allow_experimental_codecs, enable_deflate_qpl_codec); } else { auto identifier = std::make_shared(Poco::toUpper(family_name)); validateCodecAndGetPreprocessedAST(makeASTFunction("CODEC", identifier), - {}, sanity_check, allow_experimental_codecs, enable_qpl_deflate_codec); + {}, sanity_check, allow_experimental_codecs, enable_deflate_qpl_codec); } } @@ -77,7 +77,7 @@ bool innerDataTypeIsFloat(const DataTypePtr & type) } ASTPtr CompressionCodecFactory::validateCodecAndGetPreprocessedAST( - const ASTPtr & ast, const DataTypePtr & column_type, bool sanity_check, bool allow_experimental_codecs, bool enable_qpl_deflate_codec) const + const ASTPtr & ast, const DataTypePtr & column_type, bool sanity_check, bool allow_experimental_codecs, bool enable_deflate_qpl_codec) const { if (const auto * func = ast->as()) { @@ -159,10 +159,10 @@ ASTPtr CompressionCodecFactory::validateCodecAndGetPreprocessedAST( " You can enable it with the 'allow_experimental_codecs' setting.", codec_family_name); - if (!enable_qpl_deflate_codec && result_codec->isDeflateQplCompression()) + if (!enable_deflate_qpl_codec && result_codec->isDeflateQplCompression()) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Codec {} is disabled by default." - " You can enable it with the 'enable_qpl_deflate_codec' setting.", + " You can enable it with the 'enable_deflate_qpl_codec' setting.", codec_family_name); codecs_descriptions->children.emplace_back(result_codec->getCodecDesc()); diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 4aae8f5d572..e0034174597 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -319,7 +319,7 @@ class IColumn; M(Bool, allow_distributed_ddl, true, "If it is set to true, then a user is allowed to executed distributed DDL queries.", 0) \ M(Bool, allow_suspicious_codecs, false, "If it is set to true, allow to specify meaningless compression codecs.", 0) \ M(Bool, allow_experimental_codecs, false, "If it is set to true, allow to specify experimental compression codecs (but we don't have those yet and this option does nothing).", 0) \ - M(Bool, enable_qpl_deflate_codec, false, "If it is set to true, allow usage of the DEFLATE_QPL codec.", 0) \ + M(Bool, enable_deflate_qpl_codec, false, "Enable/disable the DEFLATE_QPL codec.", 0) \ M(UInt64, query_profiler_real_time_period_ns, QUERY_PROFILER_DEFAULT_SAMPLE_RATE_NS, "Period for real clock timer of query profiler (in nanoseconds). Set 0 value to turn off the real clock query profiler. Recommended value is at least 10000000 (100 times a second) for single queries or 1000000000 (once a second) for cluster-wide profiling.", 0) \ M(UInt64, query_profiler_cpu_time_period_ns, QUERY_PROFILER_DEFAULT_SAMPLE_RATE_NS, "Period for CPU clock timer of query profiler (in nanoseconds). Set 0 value to turn off the CPU clock query profiler. Recommended value is at least 10000000 (100 times a second) for single queries or 1000000000 (once a second) for cluster-wide profiling.", 0) \ M(Bool, metrics_perf_events_enabled, false, "If enabled, some of the perf events will be measured throughout queries' execution.", 0) \ diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index ddb53bbbfaa..d0bb3dd389f 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -571,7 +571,7 @@ ColumnsDescription InterpreterCreateQuery::getColumnsDescription( bool sanity_check_compression_codecs = !attach && !context_->getSettingsRef().allow_suspicious_codecs; bool allow_experimental_codecs = attach || context_->getSettingsRef().allow_experimental_codecs; - bool enable_qpl_deflate_codec = attach || context_->getSettingsRef().enable_qpl_deflate_codec; + bool enable_deflate_qpl_codec = attach || context_->getSettingsRef().enable_deflate_qpl_codec; ColumnsDescription res; auto name_type_it = column_names_and_types.begin(); @@ -632,7 +632,7 @@ ColumnsDescription InterpreterCreateQuery::getColumnsDescription( if (col_decl.default_specifier == "ALIAS") throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot specify codec for column type ALIAS"); column.codec = CompressionCodecFactory::instance().validateCodecAndGetPreprocessedAST( - col_decl.codec, column.type, sanity_check_compression_codecs, allow_experimental_codecs, enable_qpl_deflate_codec); + col_decl.codec, column.type, sanity_check_compression_codecs, allow_experimental_codecs, enable_deflate_qpl_codec); } if (col_decl.ttl) diff --git a/src/Server/TCPHandler.cpp b/src/Server/TCPHandler.cpp index b43fef9dd54..50e9d50e2f6 100644 --- a/src/Server/TCPHandler.cpp +++ b/src/Server/TCPHandler.cpp @@ -1775,7 +1775,7 @@ void TCPHandler::initBlockOutput(const Block & block) if (state.compression == Protocol::Compression::Enable) { - CompressionCodecFactory::instance().validateCodec(method, level, !query_settings.allow_suspicious_codecs, query_settings.allow_experimental_codecs, query_settings.enable_qpl_deflate_codec); + CompressionCodecFactory::instance().validateCodec(method, level, !query_settings.allow_suspicious_codecs, query_settings.allow_experimental_codecs, query_settings.enable_deflate_qpl_codec); state.maybe_compressed_out = std::make_shared( *out, CompressionCodecFactory::instance().get(method, level)); diff --git a/src/Storages/AlterCommands.cpp b/src/Storages/AlterCommands.cpp index 73d7be8dc56..a9247f9b898 100644 --- a/src/Storages/AlterCommands.cpp +++ b/src/Storages/AlterCommands.cpp @@ -1067,7 +1067,7 @@ void AlterCommands::validate(const StoragePtr & table, ContextPtr context) const "this column name is reserved for lightweight delete feature", backQuote(column_name)); if (command.codec) - CompressionCodecFactory::instance().validateCodecAndGetPreprocessedAST(command.codec, command.data_type, !context->getSettingsRef().allow_suspicious_codecs, context->getSettingsRef().allow_experimental_codecs, context->getSettingsRef().enable_qpl_deflate_codec); + CompressionCodecFactory::instance().validateCodecAndGetPreprocessedAST(command.codec, command.data_type, !context->getSettingsRef().allow_suspicious_codecs, context->getSettingsRef().allow_experimental_codecs, context->getSettingsRef().enable_deflate_qpl_codec); all_columns.add(ColumnDescription(column_name, command.data_type)); } @@ -1093,7 +1093,7 @@ void AlterCommands::validate(const StoragePtr & table, ContextPtr context) const { if (all_columns.hasAlias(column_name)) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot specify codec for column type ALIAS"); - CompressionCodecFactory::instance().validateCodecAndGetPreprocessedAST(command.codec, command.data_type, !context->getSettingsRef().allow_suspicious_codecs, context->getSettingsRef().allow_experimental_codecs, context->getSettingsRef().enable_qpl_deflate_codec); + CompressionCodecFactory::instance().validateCodecAndGetPreprocessedAST(command.codec, command.data_type, !context->getSettingsRef().allow_suspicious_codecs, context->getSettingsRef().allow_experimental_codecs, context->getSettingsRef().enable_deflate_qpl_codec); } auto column_default = all_columns.getDefault(column_name); if (column_default) diff --git a/src/Storages/Distributed/DistributedSink.cpp b/src/Storages/Distributed/DistributedSink.cpp index e383890d1f7..1e1c911920e 100644 --- a/src/Storages/Distributed/DistributedSink.cpp +++ b/src/Storages/Distributed/DistributedSink.cpp @@ -733,7 +733,7 @@ void DistributedSink::writeToShard(const Cluster::ShardInfo & shard_info, const if (compression_method == "ZSTD") compression_level = settings.network_zstd_compression_level; - CompressionCodecFactory::instance().validateCodec(compression_method, compression_level, !settings.allow_suspicious_codecs, settings.allow_experimental_codecs, settings.enable_qpl_deflate_codec); + CompressionCodecFactory::instance().validateCodec(compression_method, compression_level, !settings.allow_suspicious_codecs, settings.allow_experimental_codecs, settings.enale_deflate_qpl_codec); CompressionCodecPtr compression_codec = CompressionCodecFactory::instance().get(compression_method, compression_level); /// tmp directory is used to ensure atomicity of transactions diff --git a/src/Storages/TTLDescription.cpp b/src/Storages/TTLDescription.cpp index a437465b3fe..f601fed06ac 100644 --- a/src/Storages/TTLDescription.cpp +++ b/src/Storages/TTLDescription.cpp @@ -285,7 +285,7 @@ TTLDescription TTLDescription::getTTLFromAST( { result.recompression_codec = CompressionCodecFactory::instance().validateCodecAndGetPreprocessedAST( - ttl_element->recompression_codec, {}, !context->getSettingsRef().allow_suspicious_codecs, context->getSettingsRef().allow_experimental_codecs, context->getSettingsRef().enable_qpl_deflate_codec); + ttl_element->recompression_codec, {}, !context->getSettingsRef().allow_suspicious_codecs, context->getSettingsRef().allow_experimental_codecs, context->getSettingsRef().enable_deflate_qpl_codec); } } diff --git a/tests/ci/stress.py b/tests/ci/stress.py index e5ceb251d0f..6d17384c63f 100755 --- a/tests/ci/stress.py +++ b/tests/ci/stress.py @@ -20,7 +20,7 @@ def get_options(i, upgrade_check): '''--db-engine="Replicated('/test/db/test_{}', 's1', 'r1')"'''.format(i) ) client_options.append("allow_experimental_database_replicated=1") - client_options.append("enable_qpl_deflate_codec=1") + client_options.append("enable_deflate_qpl_codec=1") # If database name is not specified, new database is created for each functional test. # Run some threads with one database for all tests. diff --git a/tests/integration/test_non_default_compression/configs/enable_deflateqpl_codec.xml b/tests/integration/test_non_default_compression/configs/enable_deflateqpl_codec.xml index 521b0fd663c..24e101e0e3f 100644 --- a/tests/integration/test_non_default_compression/configs/enable_deflateqpl_codec.xml +++ b/tests/integration/test_non_default_compression/configs/enable_deflateqpl_codec.xml @@ -1,7 +1,7 @@ - 1 + 1 diff --git a/tests/queries/0_stateless/00804_test_alter_compression_codecs.sql b/tests/queries/0_stateless/00804_test_alter_compression_codecs.sql index 5b8b73270a2..fd9855e82d3 100644 --- a/tests/queries/0_stateless/00804_test_alter_compression_codecs.sql +++ b/tests/queries/0_stateless/00804_test_alter_compression_codecs.sql @@ -28,7 +28,7 @@ SELECT * FROM alter_compression_codec ORDER BY id; OPTIMIZE TABLE alter_compression_codec FINAL; SELECT * FROM alter_compression_codec ORDER BY id; -SET enable_qpl_deflate_codec = 1; +SET enable_deflate_qpl_codec = 1; ALTER TABLE alter_compression_codec MODIFY COLUMN alter_column CODEC(DEFLATE_QPL); SELECT compression_codec FROM system.columns WHERE database = currentDatabase() AND table = 'alter_compression_codec' AND name = 'alter_column'; diff --git a/tests/queries/0_stateless/00804_test_custom_compression_codecs.sql b/tests/queries/0_stateless/00804_test_custom_compression_codecs.sql index 47ec268bfec..89e77f758a7 100644 --- a/tests/queries/0_stateless/00804_test_custom_compression_codecs.sql +++ b/tests/queries/0_stateless/00804_test_custom_compression_codecs.sql @@ -1,6 +1,6 @@ SET send_logs_level = 'fatal'; SET allow_suspicious_codecs = 1; -SET enable_qpl_deflate_codec = 1; +SET enable_deflate_qpl_codec = 1; DROP TABLE IF EXISTS compression_codec; diff --git a/tests/queries/0_stateless/00804_test_custom_compression_codes_log_storages.sql b/tests/queries/0_stateless/00804_test_custom_compression_codes_log_storages.sql index bcd09277824..a629df2666d 100644 --- a/tests/queries/0_stateless/00804_test_custom_compression_codes_log_storages.sql +++ b/tests/queries/0_stateless/00804_test_custom_compression_codes_log_storages.sql @@ -1,6 +1,6 @@ SET send_logs_level = 'fatal'; SET allow_suspicious_codecs = 1; -SET enable_qpl_deflate_codec = 1; +SET enable_deflate_qpl_codec = 1; -- copy-paste for storage log diff --git a/tests/queries/0_stateless/00804_test_deflate_qpl_codec_compression.sql b/tests/queries/0_stateless/00804_test_deflate_qpl_codec_compression.sql index 64e66d47522..5a56fc0d576 100644 --- a/tests/queries/0_stateless/00804_test_deflate_qpl_codec_compression.sql +++ b/tests/queries/0_stateless/00804_test_deflate_qpl_codec_compression.sql @@ -1,5 +1,5 @@ SET send_logs_level = 'fatal'; -SET enable_qpl_deflate_codec = 1; +SET enable_deflate_qpl_codec = 1; DROP TABLE IF EXISTS compression_codec; From 3c1b02a37bfa349fbf3af86277c9ad3ae0eadc1c Mon Sep 17 00:00:00 2001 From: Victor Krasnov Date: Fri, 9 Jun 2023 15:43:36 +0300 Subject: [PATCH 0718/1072] Rectify the existing example of the year omission --- .../sql-reference/functions/type-conversion-functions.md | 8 ++++---- .../sql-reference/functions/type-conversion-functions.md | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index e62cf89a6b2..f1e2785285c 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -1417,15 +1417,15 @@ Result: Query: ``` sql -SELECT parseDateTimeBestEffort('10 20:19'); +SELECT toYear(now()) as year, parseDateTimeBestEffort('10 20:19'); ``` Result: ```response -┌─parseDateTimeBestEffort('10 20:19')─┐ -│ 2000-01-10 20:19:00 │ -└─────────────────────────────────────┘ +┌─year─┬─parseDateTimeBestEffort('10 20:19')─┐ +│ 2023 │ 2023-01-10 20:19:00 │ +└──────┴─────────────────────────────────────┘ ``` **See Also** diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index 6de55757b64..b763ee2b3ac 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -1096,15 +1096,15 @@ AS parseDateTimeBestEffort; Запрос: ``` sql -SELECT parseDateTimeBestEffort('10 20:19'); +SELECT toYear(now()) as year, parseDateTimeBestEffort('10 20:19'); ``` Результат: ``` text -┌─parseDateTimeBestEffort('10 20:19')─┐ -│ 2000-01-10 20:19:00 │ -└─────────────────────────────────────┘ +┌─year─┬─parseDateTimeBestEffort('10 20:19')─┐ +│ 2023 │ 2023-01-10 20:19:00 │ +└──────┴─────────────────────────────────────┘ ``` **Смотрите также** From 47b0c2a862c282b642a81ce0f0287c5059d717dc Mon Sep 17 00:00:00 2001 From: avogar Date: Fri, 9 Jun 2023 13:01:36 +0000 Subject: [PATCH 0719/1072] Make better --- src/Formats/CapnProtoSerializer.cpp | 117 +++++++++++++----- .../0_stateless/02030_capnp_format.reference | 11 ++ .../queries/0_stateless/02030_capnp_format.sh | 13 +- ...case_insensitive_names_matching.reference} | 0 ..._capnp_case_insensitive_names_matching.sh} | 0 5 files changed, 110 insertions(+), 31 deletions(-) rename tests/queries/0_stateless/{02735_capnp_case_insensitive_names_matcing.reference => 02735_capnp_case_insensitive_names_matching.reference} (100%) rename tests/queries/0_stateless/{02735_capnp_case_insensitive_names_matcing.sh => 02735_capnp_case_insensitive_names_matching.sh} (100%) diff --git a/src/Formats/CapnProtoSerializer.cpp b/src/Formats/CapnProtoSerializer.cpp index e99db23bb5e..f51f8c4b933 100644 --- a/src/Formats/CapnProtoSerializer.cpp +++ b/src/Formats/CapnProtoSerializer.cpp @@ -110,7 +110,7 @@ namespace /// Write row as struct field. virtual void writeRow( const ColumnPtr & column, - std::unique_ptr & builder, + std::unique_ptr & builder, /// Maybe unused for simple types, needed to initialize structs and lists. capnp::DynamicStruct::Builder & parent_struct_builder, UInt32 slot_offset, size_t row_num) = 0; @@ -118,7 +118,7 @@ namespace /// Write row as list element. virtual void writeRow( const ColumnPtr & column, - std::unique_ptr & builder, + std::unique_ptr & builder, /// Maybe unused for simple types, needed to initialize structs and lists. capnp::DynamicList::Builder & parent_list_builder, UInt32 array_index, size_t row_num) = 0; @@ -262,54 +262,93 @@ namespace if (!capnp_type.isEnum()) throwCannotConvert(data_type, column_name, capnp_type); - bool to_lower = enum_comparing_mode == FormatSettings::CapnProtoEnumComparingMode::BY_NAMES_CASE_INSENSITIVE; const auto * enum_type = assert_cast *>(data_type.get()); const auto & enum_values = dynamic_cast &>(*enum_type); enum_schema = capnp_type.asEnum(); auto enumerants = enum_schema.getEnumerants(); - constexpr auto max_value = std::is_same_v ? INT8_MAX : INT16_MAX; if (enum_comparing_mode == FormatSettings::CapnProtoEnumComparingMode::BY_VALUES) { - /// In CapnProto Enum fields are numbered sequentially starting from zero. - if (enumerants.size() > max_value) - throw Exception( - ErrorCodes::CAPN_PROTO_BAD_CAST, - "Enum from CapnProto schema contains values that are out of range for Clickhouse enum type {}", - data_type->getName()); - - auto values = enum_values.getSetOfAllValues(); - std::unordered_set capn_enum_values; + auto ch_enum_values = enum_values.getSetOfAllValues(); + std::unordered_set capn_enum_values; for (auto enumerant : enumerants) - capn_enum_values.insert(EnumType(enumerant.getOrdinal())); - if (values != capn_enum_values) - throw Exception(ErrorCodes::CAPN_PROTO_BAD_CAST, "The set of values in Enum from CapnProto schema is different from the set of values in ClickHouse Enum"); + capn_enum_values.insert(enumerant.getOrdinal()); + + /// Check if ClickHouse values is a superset of CapnProto values. + ch_enum_is_superset = true; + /// In CapnProto Enum fields are numbered sequentially starting from zero. + /// Check if max CapnProto value exceeds max ClickHouse value. + constexpr auto max_value = std::is_same_v ? INT8_MAX : INT16_MAX; + if (enumerants.size() > max_value) + { + ch_enum_is_superset = false; + } + else + { + for (auto capnp_value : capn_enum_values) + { + if (!ch_enum_values.contains(static_cast(capnp_value))) + { + ch_enum_is_superset = false; + break; + } + } + } + + /// Check if CapnProto values is a superset of ClickHouse values. + capnp_enum_is_superset = true; + for (auto ch_value : ch_enum_values) + { + /// Capnp doesn't support negative enum values. + if (ch_value < 0 || !capn_enum_values.contains(static_cast(ch_value))) + { + capnp_enum_is_superset = false; + break; + } + } } else { - auto all_values = enum_values.getValues(); - if (all_values.size() != enumerants.size()) - throw Exception( - ErrorCodes::CAPN_PROTO_BAD_CAST, - "The set of names in Enum from CapnProto schema is different from the set of names in ClickHouse Enum"); + bool to_lower = enum_comparing_mode == FormatSettings::CapnProtoEnumComparingMode::BY_NAMES_CASE_INSENSITIVE; + auto all_values = enum_values.getValues(); std::unordered_map ch_name_to_value; for (auto & [name, value] : all_values) ch_name_to_value[to_lower ? boost::algorithm::to_lower_copy(name) : name] = value; + std::unordered_map capnp_name_to_value; for (auto enumerant : enumerants) { String capnp_name = enumerant.getProto().getName(); - UInt16 capnp_value = enumerant.getOrdinal(); - auto it = ch_name_to_value.find(to_lower ? boost::algorithm::to_lower_copy(capnp_name) : capnp_name); - if (it == ch_name_to_value.end()) - throw Exception( - ErrorCodes::CAPN_PROTO_BAD_CAST, - "The set of names in Enum from CapnProto schema is different from the set of names in ClickHouse Enum"); + capnp_name_to_value[to_lower ? boost::algorithm::to_lower_copy(capnp_name) : capnp_name] = enumerant.getOrdinal(); + } - ch_to_capnp_values[it->second] = capnp_value; + /// Check if ClickHouse names is a superset of CapnProto names. + ch_enum_is_superset = true; + for (auto & [capnp_name, capnp_value] : capnp_name_to_value) + { + auto it = ch_name_to_value.find(capnp_name); + if (it == ch_name_to_value.end()) + { + ch_enum_is_superset = false; + break; + } capnp_to_ch_values[capnp_value] = it->second; } + + /// Check if CapnProto names is a superset of ClickHouse names. + capnp_enum_is_superset = true; + + for (auto & [ch_name, ch_value] : ch_name_to_value) + { + auto it = capnp_name_to_value.find(ch_name); + if (it == capnp_name_to_value.end()) + { + capnp_enum_is_superset = false; + break; + } + ch_to_capnp_values[ch_value] = it->second; + } } } @@ -336,23 +375,43 @@ namespace private: UInt16 getValue(const ColumnPtr & column, size_t row_num) { + if (!capnp_enum_is_superset) + throw Exception(ErrorCodes::CAPN_PROTO_BAD_CAST, "Cannot convert ClickHouse enum to CapnProto enum: CapnProto enum values/names is not a superset of ClickHouse enum values/names"); + EnumType enum_value = assert_cast &>(*column).getElement(row_num); if (enum_comparing_mode == FormatSettings::CapnProtoEnumComparingMode::BY_VALUES) return static_cast(enum_value); - return ch_to_capnp_values[enum_value]; + auto it = ch_to_capnp_values.find(enum_value); + if (it == ch_to_capnp_values.end()) + throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected value {} in ClickHouse enum", enum_value); + + return it->second; } void insertValue(IColumn & column, UInt16 capnp_enum_value) { + if (!ch_enum_is_superset) + throw Exception(ErrorCodes::CAPN_PROTO_BAD_CAST, "Cannot convert CapnProto enum to ClickHouse enum: ClickHouse enum values/names is not a superset of CapnProto enum values/names"); + if (enum_comparing_mode == FormatSettings::CapnProtoEnumComparingMode::BY_VALUES) + { assert_cast &>(column).insertValue(static_cast(capnp_enum_value)); + } else + { + auto it = capnp_to_ch_values.find(capnp_enum_value); + if (it == capnp_to_ch_values.end()) + throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected value {} in CapnProto enum", capnp_enum_value); + assert_cast &>(column).insertValue(capnp_to_ch_values[capnp_enum_value]); + } } DataTypePtr data_type; capnp::EnumSchema enum_schema; const FormatSettings::CapnProtoEnumComparingMode enum_comparing_mode; + bool ch_enum_is_superset; + bool capnp_enum_is_superset; std::unordered_map ch_to_capnp_values; std::unordered_map capnp_to_ch_values; }; diff --git a/tests/queries/0_stateless/02030_capnp_format.reference b/tests/queries/0_stateless/02030_capnp_format.reference index 2b2307bfc6a..e08b1eb1271 100644 --- a/tests/queries/0_stateless/02030_capnp_format.reference +++ b/tests/queries/0_stateless/02030_capnp_format.reference @@ -12,6 +12,9 @@ \N [NULL,NULL,42] (NULL) 1 [1,NULL,2] (1) \N [NULL,NULL,42] (NULL) +OK +OK +OK one two tHrEe @@ -21,6 +24,14 @@ threE first second third +first +second +third +OK +one +two +tHrEe +OK OK OK OK diff --git a/tests/queries/0_stateless/02030_capnp_format.sh b/tests/queries/0_stateless/02030_capnp_format.sh index 625104fb590..b4484ca3766 100755 --- a/tests/queries/0_stateless/02030_capnp_format.sh +++ b/tests/queries/0_stateless/02030_capnp_format.sh @@ -71,16 +71,25 @@ $CLICKHOUSE_CLIENT --query="DROP TABLE capnp_nullable" $CLICKHOUSE_CLIENT --query="SELECT CAST(number, 'Enum(\'one\' = 0, \'two\' = 1, \'tHrEe\' = 2)') AS value FROM numbers(3) FORMAT CapnProto SETTINGS format_schema='$CLIENT_SCHEMADIR/02030_capnp_enum:Message'" > $CAPN_PROTO_FILE +$CLICKHOUSE_CLIENT --query="SELECT CAST(number % 2, 'Enum(\'one\' = 0, \'two\' = 1, \'tHrEe\' = 2, \'four\' = 4)') AS value FROM numbers(3) FORMAT CapnProto SETTINGS format_schema='$CLIENT_SCHEMADIR/02030_capnp_enum:Message', format_capn_proto_enum_comparising_mode='by_names'" 2>&1 | grep -F -q "CAPN_PROTO_BAD_CAST" && echo 'OK' || echo 'FAIL'; +$CLICKHOUSE_CLIENT --query="SELECT CAST(number % 2, 'Enum(\'one\' = 0, \'two\' = 1, \'tHrEe\' = 2, \'four\' = 4)') AS value FROM numbers(3) FORMAT CapnProto SETTINGS format_schema='$CLIENT_SCHEMADIR/02030_capnp_enum:Message', format_capn_proto_enum_comparising_mode='by_names_case_insensitive'" 2>&1 | grep -F -q "CAPN_PROTO_BAD_CAST" && echo 'OK' || echo 'FAIL'; +$CLICKHOUSE_CLIENT --query="SELECT CAST(number % 2, 'Enum(\'one\' = 0, \'two\' = 1, \'tHrEe\' = 2, \'four\' = 4)') AS value FROM numbers(3) FORMAT CapnProto SETTINGS format_schema='$CLIENT_SCHEMADIR/02030_capnp_enum:Message', format_capn_proto_enum_comparising_mode='by_values'" 2>&1 | grep -F -q "CAPN_PROTO_BAD_CAST" && echo 'OK' || echo 'FAIL'; $CLICKHOUSE_CLIENT --query="SELECT * FROM file('data.capnp', 'CapnProto', 'value Enum(\'one\' = 1, \'two\' = 2, \'tHrEe\' = 3)') SETTINGS format_schema='$SERVER_SCHEMADIR/02030_capnp_enum:Message', format_capn_proto_enum_comparising_mode='by_names'" $CLICKHOUSE_CLIENT --query="SELECT * FROM file('data.capnp', 'CapnProto', 'value Enum(\'oNe\' = 1, \'tWo\' = 2, \'threE\' = 3)') SETTINGS format_schema='$SERVER_SCHEMADIR/02030_capnp_enum:Message', format_capn_proto_enum_comparising_mode='by_names_case_insensitive'" $CLICKHOUSE_CLIENT --query="SELECT * FROM file('data.capnp', 'CapnProto', 'value Enum(\'first\' = 0, \'second\' = 1, \'third\' = 2)') SETTINGS format_schema='$SERVER_SCHEMADIR/02030_capnp_enum:Message', format_capn_proto_enum_comparising_mode='by_values'" - +$CLICKHOUSE_CLIENT --query="SELECT * FROM file('data.capnp', 'CapnProto', 'value Enum(\'first\' = 0, \'second\' = 1, \'third\' = 2)') SETTINGS format_schema='$SERVER_SCHEMADIR/02030_capnp_enum:Message', format_capn_proto_enum_comparising_mode='by_values'" $CLICKHOUSE_CLIENT --query="SELECT * FROM file('data.capnp', 'CapnProto', 'value Enum(\'one\' = 0, \'two\' = 1, \'three\' = 2)') SETTINGS format_schema='$SERVER_SCHEMADIR/02030_capnp_enum:Message', format_capn_proto_enum_comparising_mode='by_names'" 2>&1 | grep -F -q "CAPN_PROTO_BAD_CAST" && echo 'OK' || echo 'FAIL'; -$CLICKHOUSE_CLIENT --query="SELECT * FROM file('data.capnp', 'CapnProto', 'value Enum(\'one\' = 0, \'two\' = 1, \'tHrEe\' = 2, \'four\' = 3)') SETTINGS format_schema='$SERVER_SCHEMADIR/02030_capnp_enum:Message', format_capn_proto_enum_comparising_mode='by_names'" 2>&1 | grep -F -q "CAPN_PROTO_BAD_CAST" && echo 'OK' || echo 'FAIL'; +$CLICKHOUSE_CLIENT --query="SELECT * FROM file('data.capnp', 'CapnProto', 'value Enum(\'one\' = 0, \'two\' = 1, \'tHrEe\' = 2, \'four\' = 3)') SETTINGS format_schema='$SERVER_SCHEMADIR/02030_capnp_enum:Message', format_capn_proto_enum_comparising_mode='by_names'" $CLICKHOUSE_CLIENT --query="SELECT * FROM file('data.capnp', 'CapnProto', 'value Enum(\'one\' = 1, \'two\' = 2, \'tHrEe\' = 3)') SETTINGS format_schema='$SERVER_SCHEMADIR/02030_capnp_enum:Message', format_capn_proto_enum_comparising_mode='by_values'" 2>&1 | grep -F -q "CAPN_PROTO_BAD_CAST" && echo 'OK' || echo 'FAIL'; $CLICKHOUSE_CLIENT --query="SELECT * FROM file('data.capnp', 'CapnProto', 'value Enum(\'first\' = 1, \'two\' = 2, \'three\' = 3)') SETTINGS format_schema='$SERVER_SCHEMADIR/02030_capnp_enum:Message', format_capn_proto_enum_comparising_mode='by_names_case_insensitive'" 2>&1 | grep -F -q "CAPN_PROTO_BAD_CAST" && echo 'OK' || echo 'FAIL'; +$CLICKHOUSE_CLIENT --query="SELECT CAST(number % 2, 'Enum(\'one\' = 0, \'two\' = 1)') AS value FROM numbers(3) FORMAT CapnProto SETTINGS format_schema='$CLIENT_SCHEMADIR/02030_capnp_enum:Message'" > $CAPN_PROTO_FILE +$CLICKHOUSE_CLIENT --query="SELECT * FROM file('data.capnp', 'CapnProto', 'value Enum(\'first\' = 0, \'two\' = 1)') SETTINGS format_schema='$SERVER_SCHEMADIR/02030_capnp_enum:Message', format_capn_proto_enum_comparising_mode='by_values'" 2>&1 | grep -F -q "CAPN_PROTO_BAD_CAST" && echo 'OK' || echo 'FAIL'; +$CLICKHOUSE_CLIENT --query="SELECT * FROM file('data.capnp', 'CapnProto', 'value Enum(\'first\' = 0, \'two\' = 1)') SETTINGS format_schema='$SERVER_SCHEMADIR/02030_capnp_enum:Message', format_capn_proto_enum_comparising_mode='by_names'" 2>&1 | grep -F -q "CAPN_PROTO_BAD_CAST" && echo 'OK' || echo 'FAIL'; +$CLICKHOUSE_CLIENT --query="SELECT * FROM file('data.capnp', 'CapnProto', 'value Enum(\'first\' = 0, \'two\' = 1)') SETTINGS format_schema='$SERVER_SCHEMADIR/02030_capnp_enum:Message', format_capn_proto_enum_comparising_mode='by_names_case_insensitive'" 2>&1 | grep -F -q "CAPN_PROTO_BAD_CAST" && echo 'OK' || echo 'FAIL'; + + $CLICKHOUSE_CLIENT --query="DROP TABLE IF EXISTS capnp_low_cardinality" $CLICKHOUSE_CLIENT --query="CREATE TABLE capnp_low_cardinality (lc1 LowCardinality(String), lc2 LowCardinality(Nullable(String)), lc3 Array(LowCardinality(Nullable(String)))) ENGINE=Memory" $CLICKHOUSE_CLIENT --query="INSERT INTO capnp_low_cardinality VALUES ('one', 'two', ['one', Null, 'two', Null]), ('two', Null, [Null])" diff --git a/tests/queries/0_stateless/02735_capnp_case_insensitive_names_matcing.reference b/tests/queries/0_stateless/02735_capnp_case_insensitive_names_matching.reference similarity index 100% rename from tests/queries/0_stateless/02735_capnp_case_insensitive_names_matcing.reference rename to tests/queries/0_stateless/02735_capnp_case_insensitive_names_matching.reference diff --git a/tests/queries/0_stateless/02735_capnp_case_insensitive_names_matcing.sh b/tests/queries/0_stateless/02735_capnp_case_insensitive_names_matching.sh similarity index 100% rename from tests/queries/0_stateless/02735_capnp_case_insensitive_names_matcing.sh rename to tests/queries/0_stateless/02735_capnp_case_insensitive_names_matching.sh From 9b70836b6c2b7f90ed04c41f90ad9ba3473dbe59 Mon Sep 17 00:00:00 2001 From: Victor Krasnov Date: Fri, 9 Jun 2023 16:07:07 +0300 Subject: [PATCH 0720/1072] Add a syslog format example to the documentation --- .../functions/type-conversion-functions.md | 22 +++++++++++++++++++ .../functions/type-conversion-functions.md | 22 +++++++++++++++++++ 2 files changed, 44 insertions(+) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index f1e2785285c..9c079bd9515 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -1428,6 +1428,28 @@ Result: └──────┴─────────────────────────────────────┘ ``` +Query: + +``` sql +WITH + now() AS ts_now, + formatDateTime(ts_around, '%b %e %T') AS syslog_arg +SELECT + ts_now, + syslog_arg, + parseDateTimeBestEffort(syslog_arg) +FROM (SELECT arrayJoin([ts_now - 30, ts_now + 30]) AS ts_around); +``` + +Result: + +```response +┌──────────────ts_now─┬─syslog_arg──────┬─parseDateTimeBestEffort(syslog_arg)─┐ +│ 2023-06-09 16:04:30 │ Jun 9 16:04:00 │ 2023-06-09 16:04:00 │ +│ 2023-06-09 16:04:30 │ Jun 9 16:05:00 │ 2022-06-09 16:05:00 │ +└─────────────────────┴─────────────────┴─────────────────────────────────────┘ +``` + **See Also** - [RFC 1123](https://datatracker.ietf.org/doc/html/rfc1123) diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index b763ee2b3ac..6e93a5e0acf 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -1107,6 +1107,28 @@ SELECT toYear(now()) as year, parseDateTimeBestEffort('10 20:19'); └──────┴─────────────────────────────────────┘ ``` +Запрос: + +``` sql +WITH + now() AS ts_now, + formatDateTime(ts_around, '%b %e %T') AS syslog_arg +SELECT + ts_now, + syslog_arg, + parseDateTimeBestEffort(syslog_arg) +FROM (SELECT arrayJoin([ts_now - 30, ts_now + 30]) AS ts_around); +``` + +Результат: + +``` text +┌──────────────ts_now─┬─syslog_arg──────┬─parseDateTimeBestEffort(syslog_arg)─┐ +│ 2023-06-09 16:04:30 │ Jun 9 16:04:00 │ 2023-06-09 16:04:00 │ +│ 2023-06-09 16:04:30 │ Jun 9 16:05:00 │ 2022-06-09 16:05:00 │ +└─────────────────────┴─────────────────┴─────────────────────────────────────┘ +``` + **Смотрите также** - [Информация о формате ISO 8601 от @xkcd](https://xkcd.com/1179/) From 8c4c82abd65753dcbe887c78baec2ab9c1f960d9 Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Fri, 9 Jun 2023 15:11:07 +0200 Subject: [PATCH 0721/1072] Update lz4 to enable cutting-edge optimisations (#50621) --- contrib/lz4 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/lz4 b/contrib/lz4 index 4c9431e9af5..e82198428c8 160000 --- a/contrib/lz4 +++ b/contrib/lz4 @@ -1 +1 @@ -Subproject commit 4c9431e9af596af0556e5da0ae99305bafb2b10b +Subproject commit e82198428c8061372d5adef1f9bfff4203f6081e From 5af06f8c08a70298e217d3a8909a5ed8d412f474 Mon Sep 17 00:00:00 2001 From: Victor Krasnov Date: Fri, 9 Jun 2023 16:11:52 +0300 Subject: [PATCH 0722/1072] Amend the test infinitesimally --- .../0_stateless/02783_parseDateTimeBestEffort_syslog.reference | 2 +- .../0_stateless/02783_parseDateTimeBestEffort_syslog.sql | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/02783_parseDateTimeBestEffort_syslog.reference b/tests/queries/0_stateless/02783_parseDateTimeBestEffort_syslog.reference index 63e7e367941..3ec93143e0e 100644 --- a/tests/queries/0_stateless/02783_parseDateTimeBestEffort_syslog.reference +++ b/tests/queries/0_stateless/02783_parseDateTimeBestEffort_syslog.reference @@ -1,5 +1,5 @@ parseDateTimeBestEffort - dt_ref res res_sam res_auc res_null res_null_sam res_null_auc res_zero res_zero_sam res_zero_auc res_us res_us_sam res_us_auc res_us_null res_us_null_sam res_us_null_auc res_us_zero res_us_zero_sam res_us_zero_auc res64 res64_sam res64_auc res64_null res64_null_sam res64_null_auc res64_zero res64_zero_sam res64_zero_auc res64_us res64_us_sam res64_us_auc res64_us_null res64_us_null_sam res64_us_null_auc res64_us_zero res64_us_zero_sam res64_us_zero_auc + around_June_7 res res_sam res_auc res_null res_null_sam res_null_auc res_zero res_zero_sam res_zero_auc res_us res_us_sam res_us_auc res_us_null res_us_null_sam res_us_null_auc res_us_zero res_us_zero_sam res_us_zero_auc res64 res64_sam res64_auc res64_null res64_null_sam res64_null_auc res64_zero res64_zero_sam res64_zero_auc res64_us res64_us_sam res64_us_auc res64_us_null res64_us_null_sam res64_us_null_auc res64_us_zero res64_us_zero_sam res64_us_zero_auc Jun 6 00:00:00 2023-06-06 00:00:00 2023-06-06 00:00:00 2023-06-06 00:00:00 2023-06-06 00:00:00 2023-06-06 00:00:00 2023-06-06 00:00:00 2023-06-06 00:00:00 2023-06-06 00:00:00 2023-06-06 00:00:00 2023-06-06 00:00:00 2023-06-06 00:00:00 2023-06-06 00:00:00 2023-06-06 00:00:00 2023-06-06 00:00:00 2023-06-06 00:00:00 2023-06-06 00:00:00 2023-06-06 00:00:00 2023-06-06 00:00:00 2023-06-06 00:00:00.000 2023-06-06 00:00:00.000 2023-06-06 00:00:00.000 2023-06-06 00:00:00.000 2023-06-06 00:00:00.000 2023-06-06 00:00:00.000 2023-06-06 00:00:00.000 2023-06-06 00:00:00.000 2023-06-06 00:00:00.000 2023-06-06 00:00:00.000 2023-06-06 00:00:00.000 2023-06-06 00:00:00.000 2023-06-06 00:00:00.000 2023-06-06 00:00:00.000 2023-06-06 00:00:00.000 2023-06-06 00:00:00.000 2023-06-06 00:00:00.000 2023-06-06 00:00:00.000 Jun 8 00:00:00 2022-06-08 00:00:00 2022-06-08 00:00:00 2022-06-08 00:00:00 2022-06-08 00:00:00 2022-06-08 00:00:00 2022-06-08 00:00:00 2022-06-08 00:00:00 2022-06-08 00:00:00 2022-06-08 00:00:00 2022-06-08 00:00:00 2022-06-08 00:00:00 2022-06-08 00:00:00 2022-06-08 00:00:00 2022-06-08 00:00:00 2022-06-08 00:00:00 2022-06-08 00:00:00 2022-06-08 00:00:00 2022-06-08 00:00:00 2022-06-08 00:00:00.000 2022-06-08 00:00:00.000 2022-06-08 00:00:00.000 2022-06-08 00:00:00.000 2022-06-08 00:00:00.000 2022-06-08 00:00:00.000 2022-06-08 00:00:00.000 2022-06-08 00:00:00.000 2022-06-08 00:00:00.000 2022-06-08 00:00:00.000 2022-06-08 00:00:00.000 2022-06-08 00:00:00.000 2022-06-08 00:00:00.000 2022-06-08 00:00:00.000 2022-06-08 00:00:00.000 2022-06-08 00:00:00.000 2022-06-08 00:00:00.000 2022-06-08 00:00:00.000 diff --git a/tests/queries/0_stateless/02783_parseDateTimeBestEffort_syslog.sql b/tests/queries/0_stateless/02783_parseDateTimeBestEffort_syslog.sql index 59211d3e6a0..52975cb5bbf 100644 --- a/tests/queries/0_stateless/02783_parseDateTimeBestEffort_syslog.sql +++ b/tests/queries/0_stateless/02783_parseDateTimeBestEffort_syslog.sql @@ -7,7 +7,7 @@ WITH dateDiff('second', toDateTime(ref_point), ts_now) AS impedimenta, formatDateTime(ts_around, '%b %e %T') AS dt_curr SELECT - formatDateTime(ts_around - impedimenta, '%b %e %H:%i:%s') AS dt_ref, + formatDateTime(ts_around - impedimenta, '%b %e %H:%i:%s') AS around_June_7, parseDateTimeBestEffort(dt_curr) - impedimenta AS res, parseDateTimeBestEffort(dt_curr, 'US/Samoa') - impedimenta AS res_sam, parseDateTimeBestEffort(dt_curr, 'Pacific/Auckland') - impedimenta AS res_auc, From ab0a01e4649d54bed7e3a5e1bc7ca22f97c9cec6 Mon Sep 17 00:00:00 2001 From: DanRoscigno Date: Fri, 9 Jun 2023 09:14:50 -0400 Subject: [PATCH 0723/1072] close result block --- docs/en/operations/system-tables/asynchronous_insert_log.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/en/operations/system-tables/asynchronous_insert_log.md b/docs/en/operations/system-tables/asynchronous_insert_log.md index 8b0509d7000..c3aaa8e6c41 100644 --- a/docs/en/operations/system-tables/asynchronous_insert_log.md +++ b/docs/en/operations/system-tables/asynchronous_insert_log.md @@ -56,6 +56,7 @@ status: Ok flush_time: 2023-06-08 10:08:55 flush_time_microseconds: 2023-06-08 10:08:55.139676 flush_query_id: cd2c1e43-83f5-49dc-92e4-2fbc7f8d3716 +``` **See Also** From 05cab78dd288c839d0d5bcafda070f8397c1fc53 Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Fri, 9 Jun 2023 15:15:41 +0200 Subject: [PATCH 0724/1072] Fix bug in `uniqExact` parallel merging (#50590) * impl * disable test under sanitizers --- src/AggregateFunctions/UniqExactSet.h | 51 +++++++++++-------- ..._uniq_exact_parallel_merging_bug.reference | 0 .../02782_uniq_exact_parallel_merging_bug.sh | 21 ++++++++ 3 files changed, 51 insertions(+), 21 deletions(-) create mode 100644 tests/queries/0_stateless/02782_uniq_exact_parallel_merging_bug.reference create mode 100755 tests/queries/0_stateless/02782_uniq_exact_parallel_merging_bug.sh diff --git a/src/AggregateFunctions/UniqExactSet.h b/src/AggregateFunctions/UniqExactSet.h index 916dfe4a424..90cfe700179 100644 --- a/src/AggregateFunctions/UniqExactSet.h +++ b/src/AggregateFunctions/UniqExactSet.h @@ -1,10 +1,11 @@ #pragma once +#include #include #include #include -#include #include +#include namespace DB @@ -48,30 +49,38 @@ public: } else { - auto next_bucket_to_merge = std::make_shared(0); - - auto thread_func = [&lhs, &rhs, next_bucket_to_merge, thread_group = CurrentThread::getGroup()]() + try { - SCOPE_EXIT_SAFE( - if (thread_group) - CurrentThread::detachFromGroupIfNotDetached(); - ); - if (thread_group) - CurrentThread::attachToGroupIfDetached(thread_group); - setThreadName("UniqExactMerger"); + auto next_bucket_to_merge = std::make_shared(0); - while (true) + auto thread_func = [&lhs, &rhs, next_bucket_to_merge, thread_group = CurrentThread::getGroup()]() { - const auto bucket = next_bucket_to_merge->fetch_add(1); - if (bucket >= rhs.NUM_BUCKETS) - return; - lhs.impls[bucket].merge(rhs.impls[bucket]); - } - }; + SCOPE_EXIT_SAFE( + if (thread_group) + CurrentThread::detachFromGroupIfNotDetached(); + ); + if (thread_group) + CurrentThread::attachToGroupIfDetached(thread_group); + setThreadName("UniqExactMerger"); - for (size_t i = 0; i < std::min(thread_pool->getMaxThreads(), rhs.NUM_BUCKETS); ++i) - thread_pool->scheduleOrThrowOnError(thread_func); - thread_pool->wait(); + while (true) + { + const auto bucket = next_bucket_to_merge->fetch_add(1); + if (bucket >= rhs.NUM_BUCKETS) + return; + lhs.impls[bucket].merge(rhs.impls[bucket]); + } + }; + + for (size_t i = 0; i < std::min(thread_pool->getMaxThreads(), rhs.NUM_BUCKETS); ++i) + thread_pool->scheduleOrThrowOnError(thread_func); + thread_pool->wait(); + } + catch (...) + { + thread_pool->wait(); + throw; + } } } } diff --git a/tests/queries/0_stateless/02782_uniq_exact_parallel_merging_bug.reference b/tests/queries/0_stateless/02782_uniq_exact_parallel_merging_bug.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02782_uniq_exact_parallel_merging_bug.sh b/tests/queries/0_stateless/02782_uniq_exact_parallel_merging_bug.sh new file mode 100755 index 00000000000..d84ffd21b87 --- /dev/null +++ b/tests/queries/0_stateless/02782_uniq_exact_parallel_merging_bug.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash +# Tags: long, no-random-settings, no-tsan, no-asan, no-ubsan, no-msan + +# shellcheck disable=SC2154 + +unset CLICKHOUSE_LOG_COMMENT + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + + +clickhouse-client -q " + CREATE TABLE ${CLICKHOUSE_DATABASE}.t(s String) + ENGINE = MergeTree + ORDER BY tuple(); +" + +clickhouse-client -q "insert into ${CLICKHOUSE_DATABASE}.t select number%10==0 ? toString(number) : '' from numbers_mt(1e7)" + +clickhouse-benchmark -q "select count(distinct s) from ${CLICKHOUSE_DATABASE}.t settings max_memory_usage = '50Mi'" --ignore-error -c 16 -i 1000 2>/dev/null From f8791a0ea393120dbfba8eec8627edbc8d00deb8 Mon Sep 17 00:00:00 2001 From: Jordi Villar Date: Fri, 9 Jun 2023 15:36:48 +0200 Subject: [PATCH 0725/1072] SummingMergeTree support for DateTime64 --- src/DataTypes/DataTypeDateTime64.h | 2 ++ .../02785_summing_merge_tree_datetime64.reference | 1 + .../02785_summing_merge_tree_datetime64.sql | 12 ++++++++++++ 3 files changed, 15 insertions(+) create mode 100644 tests/queries/0_stateless/02785_summing_merge_tree_datetime64.reference create mode 100644 tests/queries/0_stateless/02785_summing_merge_tree_datetime64.sql diff --git a/src/DataTypes/DataTypeDateTime64.h b/src/DataTypes/DataTypeDateTime64.h index aaa99485040..64cedd798d1 100644 --- a/src/DataTypes/DataTypeDateTime64.h +++ b/src/DataTypes/DataTypeDateTime64.h @@ -37,6 +37,8 @@ public: bool canBeUsedAsVersion() const override { return true; } + bool isSummable() const override { return false; } + protected: SerializationPtr doGetDefaultSerialization() const override; }; diff --git a/tests/queries/0_stateless/02785_summing_merge_tree_datetime64.reference b/tests/queries/0_stateless/02785_summing_merge_tree_datetime64.reference new file mode 100644 index 00000000000..d395c4d6a0f --- /dev/null +++ b/tests/queries/0_stateless/02785_summing_merge_tree_datetime64.reference @@ -0,0 +1 @@ +1 2023-05-01 23:55:55.100 15 diff --git a/tests/queries/0_stateless/02785_summing_merge_tree_datetime64.sql b/tests/queries/0_stateless/02785_summing_merge_tree_datetime64.sql new file mode 100644 index 00000000000..1ed930ebbc7 --- /dev/null +++ b/tests/queries/0_stateless/02785_summing_merge_tree_datetime64.sql @@ -0,0 +1,12 @@ +DROP TABLE IF EXISTS summing_merge_tree_datetime64; + +CREATE TABLE summing_merge_tree_datetime64 ( `pk` UInt64, `timestamp` DateTime64(3), `value` UInt64 ) +ENGINE = SummingMergeTree() ORDER BY pk; + +INSERT INTO summing_merge_tree_datetime64 SELECT 1 pk, '2023-05-01 23:55:55.100' timestamp, 1 value; +INSERT INTO summing_merge_tree_datetime64 SELECT 1 pk, '2023-05-01 23:55:55.100' timestamp, 2 value; +INSERT INTO summing_merge_tree_datetime64 SELECT 1 pk, '2023-05-01 23:55:55.100' timestamp, 3 value; +INSERT INTO summing_merge_tree_datetime64 SELECT 1 pk, '2023-05-01 23:55:55.100' timestamp, 4 value; +INSERT INTO summing_merge_tree_datetime64 SELECT 1 pk, '2023-05-01 23:55:55.100' timestamp, 5 value; + +SELECT * FROM summing_merge_tree_datetime64 FINAL; From af153399bf19d9dbb22253d6224fca37616401e3 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Fri, 9 Jun 2023 15:26:13 +0000 Subject: [PATCH 0726/1072] Update version_date.tsv and changelogs after v23.2.7.32-stable --- docs/changelogs/v23.2.7.32-stable.md | 35 ++++++++++++++++++++++++++++ utils/list-versions/version_date.tsv | 1 + 2 files changed, 36 insertions(+) create mode 100644 docs/changelogs/v23.2.7.32-stable.md diff --git a/docs/changelogs/v23.2.7.32-stable.md b/docs/changelogs/v23.2.7.32-stable.md new file mode 100644 index 00000000000..db5e9e76311 --- /dev/null +++ b/docs/changelogs/v23.2.7.32-stable.md @@ -0,0 +1,35 @@ +--- +sidebar_position: 1 +sidebar_label: 2023 +--- + +# 2023 Changelog + +### ClickHouse release v23.2.7.32-stable (934f6a2aa0e) FIXME as compared to v23.2.6.34-stable (570190045b0) + +#### Performance Improvement +* Backported in [#49218](https://github.com/ClickHouse/ClickHouse/issues/49218): Fixed excessive reading in queries with `FINAL`. [#47801](https://github.com/ClickHouse/ClickHouse/pull/47801) ([Nikita Taranov](https://github.com/nickitat)). + +#### Build/Testing/Packaging Improvement +* Backported in [#49208](https://github.com/ClickHouse/ClickHouse/issues/49208): Fix glibc compatibility check: replace `preadv` from musl. [#49144](https://github.com/ClickHouse/ClickHouse/pull/49144) ([alesapin](https://github.com/alesapin)). + +#### Bug Fix (user-visible misbehavior in an official stable release) + +* Fix key not found error for queries with multiple StorageJoin [#49137](https://github.com/ClickHouse/ClickHouse/pull/49137) ([vdimir](https://github.com/vdimir)). +* Fix race on Outdated parts loading [#49223](https://github.com/ClickHouse/ClickHouse/pull/49223) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Fix bug in DISTINCT [#49628](https://github.com/ClickHouse/ClickHouse/pull/49628) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fix msan issue in randomStringUTF8() [#49750](https://github.com/ClickHouse/ClickHouse/pull/49750) ([Robert Schulze](https://github.com/rschu1ze)). +* Fix IPv6 encoding in protobuf [#49933](https://github.com/ClickHouse/ClickHouse/pull/49933) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). +* Avoid deadlock when starting table in attach thread of `ReplicatedMergeTree` [#50026](https://github.com/ClickHouse/ClickHouse/pull/50026) ([Antonio Andelic](https://github.com/antonio2368)). +* JIT compilation not equals NaN fix [#50056](https://github.com/ClickHouse/ClickHouse/pull/50056) ([Maksim Kita](https://github.com/kitaisreal)). +* Fix crash with `multiIf` and constant condition and nullable arguments [#50123](https://github.com/ClickHouse/ClickHouse/pull/50123) ([Anton Popov](https://github.com/CurtizJ)). +* Fix Keeper deadlock on exception when preprocessing requests. [#50387](https://github.com/ClickHouse/ClickHouse/pull/50387) ([frinkr](https://github.com/frinkr)). + +#### NOT FOR CHANGELOG / INSIGNIFICANT + +* Improve test reports [#49151](https://github.com/ClickHouse/ClickHouse/pull/49151) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Fallback auth gh api [#49314](https://github.com/ClickHouse/ClickHouse/pull/49314) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Improve CI: status commit, auth for get_gh_api [#49388](https://github.com/ClickHouse/ClickHouse/pull/49388) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Update github.com/distribution/distribution [#50114](https://github.com/ClickHouse/ClickHouse/pull/50114) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Catch issues with dockerd during the build [#50700](https://github.com/ClickHouse/ClickHouse/pull/50700) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). + diff --git a/utils/list-versions/version_date.tsv b/utils/list-versions/version_date.tsv index 9704c68be54..2aeeb5db35c 100644 --- a/utils/list-versions/version_date.tsv +++ b/utils/list-versions/version_date.tsv @@ -3,6 +3,7 @@ v23.4.2.11-stable 2023-05-02 v23.4.1.1943-stable 2023-04-27 v23.3.2.37-lts 2023-04-22 v23.3.1.2823-lts 2023-03-31 +v23.2.7.32-stable 2023-06-09 v23.2.6.34-stable 2023-04-23 v23.2.5.46-stable 2023-04-03 v23.2.4.12-stable 2023-03-10 From f8dc408ccbb600bd0c387feeedf899448501b8b1 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Fri, 9 Jun 2023 16:23:22 +0000 Subject: [PATCH 0727/1072] Desctructing --> Destructing --- src/Interpreters/Context.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index 8fb06e21d22..995e78d8f0b 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -377,7 +377,7 @@ struct ContextSharedPart : boost::noncopyable { try { - LOG_DEBUG(log, "Desctructing remote fs threadpool reader"); + LOG_DEBUG(log, "Destructing remote fs threadpool reader"); asynchronous_remote_fs_reader->wait(); asynchronous_remote_fs_reader.reset(); } @@ -391,7 +391,7 @@ struct ContextSharedPart : boost::noncopyable { try { - LOG_DEBUG(log, "Desctructing local fs threadpool reader"); + LOG_DEBUG(log, "Destructing local fs threadpool reader"); asynchronous_local_fs_reader->wait(); asynchronous_local_fs_reader.reset(); } @@ -405,7 +405,7 @@ struct ContextSharedPart : boost::noncopyable { try { - LOG_DEBUG(log, "Desctructing local fs threadpool reader"); + LOG_DEBUG(log, "Destructing local fs threadpool reader"); synchronous_local_fs_reader->wait(); synchronous_local_fs_reader.reset(); } @@ -419,7 +419,7 @@ struct ContextSharedPart : boost::noncopyable { try { - LOG_DEBUG(log, "Desctructing threadpool writer"); + LOG_DEBUG(log, "Destructing threadpool writer"); threadpool_writer->wait(); threadpool_writer.reset(); } @@ -433,7 +433,7 @@ struct ContextSharedPart : boost::noncopyable { try { - LOG_DEBUG(log, "Desctructing marks loader"); + LOG_DEBUG(log, "Destructing marks loader"); load_marks_threadpool->wait(); load_marks_threadpool.reset(); } @@ -447,7 +447,7 @@ struct ContextSharedPart : boost::noncopyable { try { - LOG_DEBUG(log, "Desctructing prefetch threadpool"); + LOG_DEBUG(log, "Destructing prefetch threadpool"); prefetch_threadpool->wait(); prefetch_threadpool.reset(); } From 13798f8b07f2550ec73323ce8596a276155aa367 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Fri, 9 Jun 2023 19:52:49 +0300 Subject: [PATCH 0728/1072] Update MergeTreeData.cpp --- src/Storages/MergeTree/MergeTreeData.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 9cca471fddb..23351423d49 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -1252,6 +1252,10 @@ MergeTreeData::LoadPartResult MergeTreeData::loadDataPart( mark_broken(); return res; } + catch (const Poco::TimeoutException &) + { + throw; + } catch (...) { mark_broken(); From a8b579a85618f57f0fd6316d16d28677dfeb0d8b Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Fri, 9 Jun 2023 19:28:06 +0200 Subject: [PATCH 0729/1072] Rename azure_blob_storage to azureBlobStorage --- .../table-functions/azure_blob_storage.md | 2 +- .../TableFunctionAzureBlobStorage.cpp | 2 +- .../TableFunctionAzureBlobStorage.h | 2 +- .../test_storage_azure_blob_storage/test.py | 22 +++++++++---------- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/docs/en/sql-reference/table-functions/azure_blob_storage.md b/docs/en/sql-reference/table-functions/azure_blob_storage.md index f86307b3b85..6091aab5f9d 100644 --- a/docs/en/sql-reference/table-functions/azure_blob_storage.md +++ b/docs/en/sql-reference/table-functions/azure_blob_storage.md @@ -5,7 +5,7 @@ sidebar_label: azure_blob_storage keywords: [azure blob storage] --- -# azure\_blob\_storage Table Function +# azureBlobStorage Table Function Provides a table-like interface to select/insert files in [Azure Blob Storage](https://azure.microsoft.com/en-us/products/storage/blobs). This table function is similar to the [s3 function](../../sql-reference/table-functions/s3.md). diff --git a/src/TableFunctions/TableFunctionAzureBlobStorage.cpp b/src/TableFunctions/TableFunctionAzureBlobStorage.cpp index 38d9362894a..d2a96173491 100644 --- a/src/TableFunctions/TableFunctionAzureBlobStorage.cpp +++ b/src/TableFunctions/TableFunctionAzureBlobStorage.cpp @@ -246,7 +246,7 @@ void registerTableFunctionAzureBlobStorage(TableFunctionFactory & factory) factory.registerFunction( {.documentation = {.description=R"(The table function can be used to read the data stored on Azure Blob Storage.)", - .examples{{"azure_blob_storage", "SELECT * FROM azure_blob_storage(connection, container, blob_path, format, structure)", ""}}}, + .examples{{"azureBlobStorage", "SELECT * FROM azureBlobStorage(connection_string|storage_account_url, container_name, blobpath, [account_name, account_key, format, compression, structure])", ""}}}, .allow_readonly = false}); } diff --git a/src/TableFunctions/TableFunctionAzureBlobStorage.h b/src/TableFunctions/TableFunctionAzureBlobStorage.h index 0bb872de3f3..0ac3f9771c7 100644 --- a/src/TableFunctions/TableFunctionAzureBlobStorage.h +++ b/src/TableFunctions/TableFunctionAzureBlobStorage.h @@ -18,7 +18,7 @@ class Context; class TableFunctionAzureBlobStorage : public ITableFunction { public: - static constexpr auto name = "azure_blob_storage"; + static constexpr auto name = "azureBlobStorage"; static constexpr auto signature = "- connection_string|storage_account_url, container_name, blobpath, [account_name, account_key, format, compression, structure]\n"; static size_t getMaxNumberOfArguments() { return 8; } diff --git a/tests/integration/test_storage_azure_blob_storage/test.py b/tests/integration/test_storage_azure_blob_storage/test.py index f0934d3aa80..f9d337b6d86 100644 --- a/tests/integration/test_storage_azure_blob_storage/test.py +++ b/tests/integration/test_storage_azure_blob_storage/test.py @@ -466,7 +466,7 @@ def test_simple_write_account_string_table_function(cluster): node = cluster.instances["node"] azure_query( node, - "INSERT INTO TABLE FUNCTION azure_blob_storage('http://azurite1:10000/devstoreaccount1', 'cont', 'test_simple_write_tf.csv', 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'CSV', 'auto', 'key UInt64, data String') VALUES (1, 'a')", + "INSERT INTO TABLE FUNCTION azureBlobStorage('http://azurite1:10000/devstoreaccount1', 'cont', 'test_simple_write_tf.csv', 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'CSV', 'auto', 'key UInt64, data String') VALUES (1, 'a')", ) print(get_azure_file_content("test_simple_write_tf.csv")) assert get_azure_file_content("test_simple_write_tf.csv") == '1,"a"\n' @@ -476,7 +476,7 @@ def test_simple_write_connection_string_table_function(cluster): node = cluster.instances["node"] azure_query( node, - "INSERT INTO TABLE FUNCTION azure_blob_storage('DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://azurite1:10000/devstoreaccount1;', 'cont', 'test_simple_write_connection_tf.csv', 'CSV', 'auto', 'key UInt64, data String') VALUES (1, 'a')", + "INSERT INTO TABLE FUNCTION azureBlobStorage('DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://azurite1:10000/devstoreaccount1;', 'cont', 'test_simple_write_connection_tf.csv', 'CSV', 'auto', 'key UInt64, data String') VALUES (1, 'a')", ) print(get_azure_file_content("test_simple_write_connection_tf.csv")) assert get_azure_file_content("test_simple_write_connection_tf.csv") == '1,"a"\n' @@ -486,7 +486,7 @@ def test_simple_write_named_collection_1_table_function(cluster): node = cluster.instances["node"] azure_query( node, - "INSERT INTO TABLE FUNCTION azure_blob_storage(azure_conf1) VALUES (1, 'a')", + "INSERT INTO TABLE FUNCTION azureBlobStorage(azure_conf1) VALUES (1, 'a')", ) print(get_azure_file_content("test_simple_write_named.csv")) assert get_azure_file_content("test_simple_write_named.csv") == '1,"a"\n' @@ -507,7 +507,7 @@ def test_simple_write_named_collection_2_table_function(cluster): azure_query( node, - "INSERT INTO TABLE FUNCTION azure_blob_storage(azure_conf2, container='cont', blob_path='test_simple_write_named_2_tf.csv', format='CSV', structure='key UInt64, data String') VALUES (1, 'a')", + "INSERT INTO TABLE FUNCTION azureBlobStorage(azure_conf2, container='cont', blob_path='test_simple_write_named_2_tf.csv', format='CSV', structure='key UInt64, data String') VALUES (1, 'a')", ) print(get_azure_file_content("test_simple_write_named_2_tf.csv")) assert get_azure_file_content("test_simple_write_named_2_tf.csv") == '1,"a"\n' @@ -529,9 +529,9 @@ def test_put_get_with_globs_tf(cluster): azure_query( node, - f"INSERT INTO TABLE FUNCTION azure_blob_storage(azure_conf2, container='cont', blob_path='{path}', format='CSV', compression='auto', structure='{table_format}') VALUES {values}", + f"INSERT INTO TABLE FUNCTION azureBlobStorage(azure_conf2, container='cont', blob_path='{path}', format='CSV', compression='auto', structure='{table_format}') VALUES {values}", ) - query = f"select sum(column1), sum(column2), sum(column3), min(_file), max(_path) from azure_blob_storage(azure_conf2, container='cont', blob_path='{unique_prefix}/*_{{a,b,c,d}}/?.csv', format='CSV', structure='{table_format}')" + query = f"select sum(column1), sum(column2), sum(column3), min(_file), max(_path) from azureBlobStorage(azure_conf2, container='cont', blob_path='{unique_prefix}/*_{{a,b,c,d}}/?.csv', format='CSV', structure='{table_format}')" assert azure_query(node, query).splitlines() == [ "450\t450\t900\t0.csv\t{bucket}/{max_path}".format( bucket="cont", max_path=max_path @@ -543,10 +543,10 @@ def test_schema_inference_no_globs_tf(cluster): node = cluster.instances["node"] # type: ClickHouseInstance table_format = "column1 UInt32, column2 String, column3 UInt32" - query = f"insert into table function azure_blob_storage(azure_conf2, container='cont', blob_path='test_schema_inference_no_globs_tf.csv', format='CSVWithNames', structure='{table_format}') SELECT number, toString(number), number * number FROM numbers(1000)" + query = f"insert into table function azureBlobStorage(azure_conf2, container='cont', blob_path='test_schema_inference_no_globs_tf.csv', format='CSVWithNames', structure='{table_format}') SELECT number, toString(number), number * number FROM numbers(1000)" azure_query(node, query) - query = "select sum(column1), sum(length(column2)), sum(column3), min(_file), max(_path) from azure_blob_storage(azure_conf2, container='cont', blob_path='test_schema_inference_no_globs_tf.csv')" + query = "select sum(column1), sum(length(column2)), sum(column3), min(_file), max(_path) from azureBlobStorage(azure_conf2, container='cont', blob_path='test_schema_inference_no_globs_tf.csv')" assert azure_query(node, query).splitlines() == [ "499500\t2890\t332833500\ttest_schema_inference_no_globs_tf.csv\tcont/test_schema_inference_no_globs_tf.csv" ] @@ -566,10 +566,10 @@ def test_schema_inference_from_globs_tf(cluster): max_path = max(path, max_path) values = f"({i},{j},{i + j})" - query = f"insert into table function azure_blob_storage(azure_conf2, container='cont', blob_path='{path}', format='CSVWithNames', structure='{table_format}') VALUES {values}" + query = f"insert into table function azureBlobStorage(azure_conf2, container='cont', blob_path='{path}', format='CSVWithNames', structure='{table_format}') VALUES {values}" azure_query(node, query) - query = f"select sum(column1), sum(column2), sum(column3), min(_file), max(_path) from azure_blob_storage(azure_conf2, container='cont', blob_path='{unique_prefix}/*_{{a,b,c,d}}/?.csv')" + query = f"select sum(column1), sum(column2), sum(column3), min(_file), max(_path) from azureBlobStorage(azure_conf2, container='cont', blob_path='{unique_prefix}/*_{{a,b,c,d}}/?.csv')" assert azure_query(node, query).splitlines() == [ "450\t450\t900\t0.csv\t{bucket}/{max_path}".format( bucket="cont", max_path=max_path @@ -586,7 +586,7 @@ def test_partition_by_tf(cluster): azure_query( node, - f"INSERT INTO TABLE FUNCTION azure_blob_storage('http://azurite1:10000/devstoreaccount1', 'cont', '{filename}', 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'CSV', 'auto', '{table_format}') PARTITION BY {partition_by} VALUES {values}", + f"INSERT INTO TABLE FUNCTION azureBlobStorage('http://azurite1:10000/devstoreaccount1', 'cont', '{filename}', 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'CSV', 'auto', '{table_format}') PARTITION BY {partition_by} VALUES {values}", ) assert "1,2,3\n" == get_azure_file_content("test_tf_3.csv") From e662fa01d0a1455aa2a625fd342ef8e7e998f34f Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Fri, 9 Jun 2023 20:21:57 +0200 Subject: [PATCH 0730/1072] Added azureBlobStorage to aspell-dict.txt --- utils/check-style/aspell-ignore/en/aspell-dict.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt index d6cef1883f4..a01b67b26b1 100644 --- a/utils/check-style/aspell-ignore/en/aspell-dict.txt +++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt @@ -1081,6 +1081,7 @@ avgweighted avro avx aws +azureBlobStorage backend backoff backticks From f0d4ce4770a00a8c0cd9857a485fc8bbc124a95b Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 9 Jun 2023 22:05:21 +0300 Subject: [PATCH 0731/1072] Update 02785_summing_merge_tree_datetime64.sql --- .../queries/0_stateless/02785_summing_merge_tree_datetime64.sql | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/queries/0_stateless/02785_summing_merge_tree_datetime64.sql b/tests/queries/0_stateless/02785_summing_merge_tree_datetime64.sql index 1ed930ebbc7..db00f189330 100644 --- a/tests/queries/0_stateless/02785_summing_merge_tree_datetime64.sql +++ b/tests/queries/0_stateless/02785_summing_merge_tree_datetime64.sql @@ -10,3 +10,4 @@ INSERT INTO summing_merge_tree_datetime64 SELECT 1 pk, '2023-05-01 23:55:55.100' INSERT INTO summing_merge_tree_datetime64 SELECT 1 pk, '2023-05-01 23:55:55.100' timestamp, 5 value; SELECT * FROM summing_merge_tree_datetime64 FINAL; +DROP TABLE summing_merge_tree_datetime64; From c538506f2e3ba0716dcc2f13f63bb4edc1f6f33e Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Fri, 9 Jun 2023 20:50:17 +0000 Subject: [PATCH 0732/1072] More fixes --- docs/en/sql-reference/statements/create/table.md | 4 ++++ src/Compression/CompressionCodecDeflateQpl.h | 3 +-- src/Compression/CompressionFactoryAdditions.cpp | 2 +- src/Compression/ICompressionCodec.h | 4 ++-- src/Storages/Distributed/DistributedSink.cpp | 2 +- 5 files changed, 9 insertions(+), 6 deletions(-) diff --git a/docs/en/sql-reference/statements/create/table.md b/docs/en/sql-reference/statements/create/table.md index 496ecdbda7b..1a72f89fb1f 100644 --- a/docs/en/sql-reference/statements/create/table.md +++ b/docs/en/sql-reference/statements/create/table.md @@ -385,6 +385,10 @@ High compression levels are useful for asymmetric scenarios, like compress once, - DEFLATE_QPL works best if the system has a Intel® IAA (In-Memory Analytics Accelerator) offloading device. Refer to [Accelerator Configuration](https://intel.github.io/qpl/documentation/get_started_docs/installation.html#accelerator-configuration) and [Benchmark with DEFLATE_QPL](/docs/en/development/building_and_benchmarking_deflate_qpl.md/#Run-Benchmark-with-DEFLATE_QPL) for more details. - DEFLATE_QPL-compressed data can only be transferred between ClickHouse nodes compiled with SSE 4.2 enabled. +:::note +DEFLATE_QPL is not available in ClickHouse Cloud. +::: + ### Specialized Codecs These codecs are designed to make compression more effective by using specific features of data. Some of these codecs do not compress data themself. Instead, they prepare the data for a common purpose codec, which compresses it better than without this preparation. diff --git a/src/Compression/CompressionCodecDeflateQpl.h b/src/Compression/CompressionCodecDeflateQpl.h index 13aa8733b54..8d73568707e 100644 --- a/src/Compression/CompressionCodecDeflateQpl.h +++ b/src/Compression/CompressionCodecDeflateQpl.h @@ -98,8 +98,7 @@ public: protected: bool isCompression() const override { return true; } bool isGenericCompression() const override { return true; } - bool isExperimental() const override { return false; } - bool isDeflateQplCompression() const override { return true; } + bool isDeflateQpl() const override { return true; } UInt32 doCompressData(const char * source, UInt32 source_size, char * dest) const override; void doDecompressData(const char * source, UInt32 source_size, char * dest, UInt32 uncompressed_size) const override; diff --git a/src/Compression/CompressionFactoryAdditions.cpp b/src/Compression/CompressionFactoryAdditions.cpp index 46f7e2653c2..98e9e7480da 100644 --- a/src/Compression/CompressionFactoryAdditions.cpp +++ b/src/Compression/CompressionFactoryAdditions.cpp @@ -159,7 +159,7 @@ ASTPtr CompressionCodecFactory::validateCodecAndGetPreprocessedAST( " You can enable it with the 'allow_experimental_codecs' setting.", codec_family_name); - if (!enable_deflate_qpl_codec && result_codec->isDeflateQplCompression()) + if (!enable_deflate_qpl_codec && result_codec->isDeflateQpl()) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Codec {} is disabled by default." " You can enable it with the 'enable_deflate_qpl_codec' setting.", diff --git a/src/Compression/ICompressionCodec.h b/src/Compression/ICompressionCodec.h index f7e8f4e43d2..6630838fa64 100644 --- a/src/Compression/ICompressionCodec.h +++ b/src/Compression/ICompressionCodec.h @@ -109,8 +109,8 @@ public: /// It will not be allowed to use unless the user will turn off the safety switch. virtual bool isExperimental() const { return false; } - /// This is a knob for Deflate QPL codec. - virtual bool isDeflateQplCompression() const { return false; } + /// Is this the DEFLATE_QPL codec? + virtual bool isDeflateQpl() const { return false; } /// If it does nothing. virtual bool isNone() const { return false; } diff --git a/src/Storages/Distributed/DistributedSink.cpp b/src/Storages/Distributed/DistributedSink.cpp index 1e1c911920e..875764f7633 100644 --- a/src/Storages/Distributed/DistributedSink.cpp +++ b/src/Storages/Distributed/DistributedSink.cpp @@ -733,7 +733,7 @@ void DistributedSink::writeToShard(const Cluster::ShardInfo & shard_info, const if (compression_method == "ZSTD") compression_level = settings.network_zstd_compression_level; - CompressionCodecFactory::instance().validateCodec(compression_method, compression_level, !settings.allow_suspicious_codecs, settings.allow_experimental_codecs, settings.enale_deflate_qpl_codec); + CompressionCodecFactory::instance().validateCodec(compression_method, compression_level, !settings.allow_suspicious_codecs, settings.allow_experimental_codecs, settings.enable_deflate_qpl_codec); CompressionCodecPtr compression_codec = CompressionCodecFactory::instance().get(compression_method, compression_level); /// tmp directory is used to ensure atomicity of transactions From c71edb6c798163ad50a077a19a3bf74eb57ba212 Mon Sep 17 00:00:00 2001 From: Nikolay Degterinsky Date: Fri, 9 Jun 2023 17:29:42 +0000 Subject: [PATCH 0733/1072] Fix style --- src/Processors/Sources/MongoDBSource.cpp | 10 +++++----- src/Processors/Sources/MongoDBSource.h | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/Processors/Sources/MongoDBSource.cpp b/src/Processors/Sources/MongoDBSource.cpp index 74dfa13158c..cd4db416a29 100644 --- a/src/Processors/Sources/MongoDBSource.cpp +++ b/src/Processors/Sources/MongoDBSource.cpp @@ -371,8 +371,8 @@ bool isMongoDBWireProtocolOld(Poco::MongoDB::Connection & connection_) { Poco::MongoDB::Database db("config"); Poco::MongoDB::Document::Ptr doc = db.queryServerHello(connection_); - auto _wireVersion = doc->getInteger("maxWireVersion"); - return _wireVersion < Poco::MongoDB::Database::WireVersion::VER_36; + auto wire_version = doc->getInteger("maxWireVersion"); + return wire_version < Poco::MongoDB::Database::WireVersion::VER_36; } @@ -413,20 +413,20 @@ Poco::MongoDB::Document::Vector MongoDBCursor::nextDocuments(Poco::MongoDB::Conn if (is_wire_protocol_old) { auto response = old_cursor->next(connection); - cursorID_ = response.cursorID(); + cursor_id = response.cursorID(); return std::move(response.documents()); } else { auto response = new_cursor->next(connection); - cursorID_ = new_cursor->cursorID(); + cursor_id = new_cursor->cursorID(); return std::move(response.documents()); } } Int64 MongoDBCursor::cursorID() const { - return cursorID_; + return cursor_id; } diff --git a/src/Processors/Sources/MongoDBSource.h b/src/Processors/Sources/MongoDBSource.h index 2bc5481e20b..0e95d42c028 100644 --- a/src/Processors/Sources/MongoDBSource.h +++ b/src/Processors/Sources/MongoDBSource.h @@ -53,7 +53,7 @@ private: const bool is_wire_protocol_old; std::unique_ptr old_cursor; std::unique_ptr new_cursor; - Int64 cursorID_ = 0; + Int64 cursor_id = 0; }; /// Converts MongoDB Cursor to a stream of Blocks From 96d7b2efc9c0d4f40b919c5036fcfbe7445d10a1 Mon Sep 17 00:00:00 2001 From: tpanetti Date: Fri, 9 Jun 2023 13:50:30 -0700 Subject: [PATCH 0734/1072] Disable fasttest for MySQL Compatibility Type Conversion and refactor style for DataTypeNumberBase --- src/DataTypes/DataTypeNumberBase.cpp | 22 ------------------- .../02775_show_columns_mysql_compatibility.sh | 1 + 2 files changed, 1 insertion(+), 22 deletions(-) diff --git a/src/DataTypes/DataTypeNumberBase.cpp b/src/DataTypes/DataTypeNumberBase.cpp index e4c0fb96483..4cefc4945c6 100644 --- a/src/DataTypes/DataTypeNumberBase.cpp +++ b/src/DataTypes/DataTypeNumberBase.cpp @@ -15,50 +15,28 @@ template String DataTypeNumberBase::getSQLCompatibleName() const { if constexpr (std::is_same_v) - { return "TINYINT"; - } else if constexpr (std::is_same_v) - { return "SMALLINT"; - } else if constexpr (std::is_same_v) - { return "INTEGER"; - } else if constexpr (std::is_same_v) - { return "BIGINT"; - } else if constexpr (std::is_same_v) - { return "TINYINT UNSIGNED"; - } else if constexpr (std::is_same_v) - { return "SMALLINT UNSIGNED"; - } else if constexpr (std::is_same_v) - { return "INTEGER UNSIGNED"; - } else if constexpr (std::is_same_v) - { return "BIGINT UNSIGNED"; - } else if constexpr (std::is_same_v) - { return "FLOAT"; - } else if constexpr (std::is_same_v) - { return "DOUBLE"; - } /// Unsupported types are converted to TEXT else - { return "TEXT"; - } } template diff --git a/tests/queries/0_stateless/02775_show_columns_mysql_compatibility.sh b/tests/queries/0_stateless/02775_show_columns_mysql_compatibility.sh index 938102cb5fc..6a546c47a38 100755 --- a/tests/queries/0_stateless/02775_show_columns_mysql_compatibility.sh +++ b/tests/queries/0_stateless/02775_show_columns_mysql_compatibility.sh @@ -1,4 +1,5 @@ #!/usr/bin/env bash +# Tags: no-fasttest CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh From 41c34aaf5ef747c98fb98fdb0c08c17bcb35bc78 Mon Sep 17 00:00:00 2001 From: Hongbin Ma Date: Mon, 22 May 2023 19:54:08 +0800 Subject: [PATCH 0735/1072] optimize parquet write performance for parallel threads fix CI fix review comments and CI --- .../Formats/Impl/ArrowBlockOutputFormat.cpp | 4 +- .../Formats/Impl/CHColumnToArrowColumn.cpp | 99 ++++++---- .../Formats/Impl/CHColumnToArrowColumn.h | 2 +- .../Formats/Impl/ParquetBlockOutputFormat.cpp | 179 +++++++++--------- .../Formats/Impl/ParquetBlockOutputFormat.h | 9 +- 5 files changed, 154 insertions(+), 139 deletions(-) diff --git a/src/Processors/Formats/Impl/ArrowBlockOutputFormat.cpp b/src/Processors/Formats/Impl/ArrowBlockOutputFormat.cpp index 1e72c949b09..8bd1cf3897d 100644 --- a/src/Processors/Formats/Impl/ArrowBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/ArrowBlockOutputFormat.cpp @@ -58,7 +58,9 @@ void ArrowBlockOutputFormat::consume(Chunk chunk) format_settings.arrow.output_fixed_string_as_fixed_byte_array); } - ch_column_to_arrow_column->chChunkToArrowTable(arrow_table, chunk, columns_num); + auto chunks = std::vector(); + chunks.push_back(std::move(chunk)); + ch_column_to_arrow_column->chChunkToArrowTable(arrow_table, chunks, columns_num); if (!writer) prepareWriter(arrow_table->schema()); diff --git a/src/Processors/Formats/Impl/CHColumnToArrowColumn.cpp b/src/Processors/Formats/Impl/CHColumnToArrowColumn.cpp index 19b2dcccf64..c3685e813d3 100644 --- a/src/Processors/Formats/Impl/CHColumnToArrowColumn.cpp +++ b/src/Processors/Formats/Impl/CHColumnToArrowColumn.cpp @@ -976,56 +976,75 @@ namespace DB void CHColumnToArrowColumn::chChunkToArrowTable( std::shared_ptr & res, - const Chunk & chunk, + const std::vector & chunks, size_t columns_num) { - /// For arrow::Schema and arrow::Table creation - std::vector> arrow_arrays; - arrow_arrays.reserve(columns_num); - for (size_t column_i = 0; column_i < columns_num; ++column_i) + std::shared_ptr arrow_schema; + std::vector table_data(columns_num); + + for (const auto & chunk : chunks) { - const ColumnWithTypeAndName & header_column = header_columns[column_i]; - auto column = chunk.getColumns()[column_i]; - - if (!low_cardinality_as_dictionary) - column = recursiveRemoveLowCardinality(column); - - if (!is_arrow_fields_initialized) + /// For arrow::Schema and arrow::Table creation + for (size_t column_i = 0; column_i < columns_num; ++column_i) { - bool is_column_nullable = false; - auto arrow_type = getArrowType(header_column.type, column, header_column.name, format_name, output_string_as_string, output_fixed_string_as_fixed_byte_array, &is_column_nullable); - arrow_fields.emplace_back(std::make_shared(header_column.name, arrow_type, is_column_nullable)); + const ColumnWithTypeAndName & header_column = header_columns[column_i]; + auto column = chunk.getColumns()[column_i]; + + if (!low_cardinality_as_dictionary) + column = recursiveRemoveLowCardinality(column); + + if (!is_arrow_fields_initialized) + { + bool is_column_nullable = false; + auto arrow_type = getArrowType( + header_column.type, + column, + header_column.name, + format_name, + output_string_as_string, + output_fixed_string_as_fixed_byte_array, + &is_column_nullable); + arrow_fields.emplace_back(std::make_shared(header_column.name, arrow_type, is_column_nullable)); + } + + arrow::MemoryPool * pool = arrow::default_memory_pool(); + std::unique_ptr array_builder; + arrow::Status status = MakeBuilder(pool, arrow_fields[column_i]->type(), &array_builder); + checkStatus(status, column->getName(), format_name); + + fillArrowArray( + header_column.name, + column, + header_column.type, + nullptr, + array_builder.get(), + format_name, + 0, + column->size(), + output_string_as_string, + output_fixed_string_as_fixed_byte_array, + dictionary_values); + + std::shared_ptr arrow_array; + status = array_builder->Finish(&arrow_array); + checkStatus(status, column->getName(), format_name); + + table_data.at(column_i).emplace_back(std::move(arrow_array)); } - arrow::MemoryPool* pool = arrow::default_memory_pool(); - std::unique_ptr array_builder; - arrow::Status status = MakeBuilder(pool, arrow_fields[column_i]->type(), &array_builder); - checkStatus(status, column->getName(), format_name); - - fillArrowArray( - header_column.name, - column, - header_column.type, - nullptr, - array_builder.get(), - format_name, - 0, - column->size(), - output_string_as_string, - output_fixed_string_as_fixed_byte_array, - dictionary_values); - - std::shared_ptr arrow_array; - status = array_builder->Finish(&arrow_array); - checkStatus(status, column->getName(), format_name); - arrow_arrays.emplace_back(std::move(arrow_array)); + is_arrow_fields_initialized = true; + if (!arrow_schema) + arrow_schema = std::make_shared(arrow_fields); } - std::shared_ptr arrow_schema = std::make_shared(arrow_fields); + std::vector> columns; + columns.reserve(columns_num); + for (size_t column_i = 0; column_i < columns_num; ++column_i) + columns.emplace_back(std::make_shared(table_data.at(column_i))); - res = arrow::Table::Make(arrow_schema, arrow_arrays); - is_arrow_fields_initialized = true; + res = arrow::Table::Make(arrow_schema, columns); } + } #endif diff --git a/src/Processors/Formats/Impl/CHColumnToArrowColumn.h b/src/Processors/Formats/Impl/CHColumnToArrowColumn.h index 3649d0eed9b..02766e299a0 100644 --- a/src/Processors/Formats/Impl/CHColumnToArrowColumn.h +++ b/src/Processors/Formats/Impl/CHColumnToArrowColumn.h @@ -16,7 +16,7 @@ class CHColumnToArrowColumn public: CHColumnToArrowColumn(const Block & header, const std::string & format_name_, bool low_cardinality_as_dictionary_, bool output_string_as_string_, bool output_fixed_string_as_fixed_byte_array_); - void chChunkToArrowTable(std::shared_ptr & res, const Chunk & chunk, size_t columns_num); + void chChunkToArrowTable(std::shared_ptr & res, const std::vector & chunk, size_t columns_num); private: ColumnsWithTypeAndName header_columns; diff --git a/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp b/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp index 031e499b545..91840cd2c50 100644 --- a/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp @@ -20,47 +20,47 @@ namespace ErrorCodes namespace { -parquet::ParquetVersion::type getParquetVersion(const FormatSettings & settings) -{ - switch (settings.parquet.output_version) + parquet::ParquetVersion::type getParquetVersion(const FormatSettings & settings) { - case FormatSettings::ParquetVersion::V1_0: - return parquet::ParquetVersion::PARQUET_1_0; - case FormatSettings::ParquetVersion::V2_4: - return parquet::ParquetVersion::PARQUET_2_4; - case FormatSettings::ParquetVersion::V2_6: - return parquet::ParquetVersion::PARQUET_2_6; - case FormatSettings::ParquetVersion::V2_LATEST: - return parquet::ParquetVersion::PARQUET_2_LATEST; + switch (settings.parquet.output_version) + { + case FormatSettings::ParquetVersion::V1_0: + return parquet::ParquetVersion::PARQUET_1_0; + case FormatSettings::ParquetVersion::V2_4: + return parquet::ParquetVersion::PARQUET_2_4; + case FormatSettings::ParquetVersion::V2_6: + return parquet::ParquetVersion::PARQUET_2_6; + case FormatSettings::ParquetVersion::V2_LATEST: + return parquet::ParquetVersion::PARQUET_2_LATEST; + } } -} -parquet::Compression::type getParquetCompression(FormatSettings::ParquetCompression method) -{ - if (method == FormatSettings::ParquetCompression::NONE) - return parquet::Compression::type::UNCOMPRESSED; + parquet::Compression::type getParquetCompression(FormatSettings::ParquetCompression method) + { + if (method == FormatSettings::ParquetCompression::NONE) + return parquet::Compression::type::UNCOMPRESSED; #if USE_SNAPPY - if (method == FormatSettings::ParquetCompression::SNAPPY) - return parquet::Compression::type::SNAPPY; + if (method == FormatSettings::ParquetCompression::SNAPPY) + return parquet::Compression::type::SNAPPY; #endif #if USE_BROTLI - if (method == FormatSettings::ParquetCompression::BROTLI) - return parquet::Compression::type::BROTLI; + if (method == FormatSettings::ParquetCompression::BROTLI) + return parquet::Compression::type::BROTLI; #endif - if (method == FormatSettings::ParquetCompression::ZSTD) - return parquet::Compression::type::ZSTD; + if (method == FormatSettings::ParquetCompression::ZSTD) + return parquet::Compression::type::ZSTD; - if (method == FormatSettings::ParquetCompression::LZ4) - return parquet::Compression::type::LZ4; + if (method == FormatSettings::ParquetCompression::LZ4) + return parquet::Compression::type::LZ4; - if (method == FormatSettings::ParquetCompression::GZIP) - return parquet::Compression::type::GZIP; + if (method == FormatSettings::ParquetCompression::GZIP) + return parquet::Compression::type::GZIP; - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Unsupported compression method"); -} + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Unsupported compression method"); + } } @@ -69,70 +69,9 @@ ParquetBlockOutputFormat::ParquetBlockOutputFormat(WriteBuffer & out_, const Blo { } -void ParquetBlockOutputFormat::consume(Chunk chunk) +void ParquetBlockOutputFormat::consumeStaged() { - /// Do something like SquashingTransform to produce big enough row groups. - /// Because the real SquashingTransform is only used for INSERT, not for SELECT ... INTO OUTFILE. - /// The latter doesn't even have a pipeline where a transform could be inserted, so it's more - /// convenient to do the squashing here. - - appendToAccumulatedChunk(std::move(chunk)); - - if (!accumulated_chunk) - return; - - const size_t target_rows = std::max(static_cast(1), format_settings.parquet.row_group_rows); - - if (accumulated_chunk.getNumRows() < target_rows && - accumulated_chunk.bytes() < format_settings.parquet.row_group_bytes) - return; - - /// Increase row group size slightly (by < 2x) to avoid adding a small row groups for the - /// remainder of the new chunk. - /// E.g. suppose input chunks are 70K rows each, and max_rows = 1M. Then we'll have - /// getNumRows() = 1.05M. We want to write all 1.05M as one row group instead of 1M and 0.05M. - size_t num_row_groups = std::max(static_cast(1), accumulated_chunk.getNumRows() / target_rows); - size_t row_group_size = (accumulated_chunk.getNumRows() - 1) / num_row_groups + 1; // round up - - write(std::move(accumulated_chunk), row_group_size); - accumulated_chunk.clear(); -} - -void ParquetBlockOutputFormat::finalizeImpl() -{ - if (accumulated_chunk) - write(std::move(accumulated_chunk), format_settings.parquet.row_group_rows); - - if (!file_writer) - { - Block header = materializeBlock(getPort(PortKind::Main).getHeader()); - write(Chunk(header.getColumns(), 0), 1); - } - - auto status = file_writer->Close(); - if (!status.ok()) - throw Exception(ErrorCodes::UNKNOWN_EXCEPTION, "Error while closing a table: {}", status.ToString()); -} - -void ParquetBlockOutputFormat::resetFormatterImpl() -{ - file_writer.reset(); -} - -void ParquetBlockOutputFormat::appendToAccumulatedChunk(Chunk chunk) -{ - if (!accumulated_chunk) - { - accumulated_chunk = std::move(chunk); - return; - } - chassert(accumulated_chunk.getNumColumns() == chunk.getNumColumns()); - accumulated_chunk.append(chunk); -} - -void ParquetBlockOutputFormat::write(Chunk chunk, size_t row_group_size) -{ - const size_t columns_num = chunk.getNumColumns(); + const size_t columns_num = staging_chunks.at(0).getNumColumns(); std::shared_ptr arrow_table; if (!ch_column_to_arrow_column) @@ -146,7 +85,7 @@ void ParquetBlockOutputFormat::write(Chunk chunk, size_t row_group_size) format_settings.parquet.output_fixed_string_as_fixed_byte_array); } - ch_column_to_arrow_column->chChunkToArrowTable(arrow_table, chunk, columns_num); + ch_column_to_arrow_column->chChunkToArrowTable(arrow_table, staging_chunks, columns_num); if (!file_writer) { @@ -173,12 +112,66 @@ void ParquetBlockOutputFormat::write(Chunk chunk, size_t row_group_size) file_writer = std::move(result.ValueOrDie()); } - auto status = file_writer->WriteTable(*arrow_table, row_group_size); + // TODO: calculate row_group_size depending on a number of rows and table size + + // allow slightly bigger than row_group_size to avoid a very small tail row group + auto status = file_writer->WriteTable(*arrow_table, std::max(format_settings.parquet.row_group_rows, staging_rows)); if (!status.ok()) throw Exception(ErrorCodes::UNKNOWN_EXCEPTION, "Error while writing a table: {}", status.ToString()); } +void ParquetBlockOutputFormat::consume(Chunk chunk) +{ + /// Do something like SquashingTransform to produce big enough row groups. + /// Because the real SquashingTransform is only used for INSERT, not for SELECT ... INTO OUTFILE. + /// The latter doesn't even have a pipeline where a transform could be inserted, so it's more + /// convenient to do the squashing here. + staging_rows += chunk.getNumRows(); + staging_bytes += chunk.bytes(); + staging_chunks.push_back(std::move(chunk)); + chassert(staging_chunks.back().getNumColumns() == staging_chunks.front().getNumColumns()); + if (staging_rows < format_settings.parquet.row_group_rows && + staging_bytes < format_settings.parquet.row_group_bytes) + { + return; + } + else + { + consumeStaged(); + staging_chunks.clear(); + staging_rows = 0; + staging_bytes = 0; + } +} + +void ParquetBlockOutputFormat::finalizeImpl() +{ + if (!file_writer && staging_chunks.empty()) + { + Block header = materializeBlock(getPort(PortKind::Main).getHeader()); + + consume(Chunk(header.getColumns(), 0)); // this will make staging_chunks non-empty + } + + if (!staging_chunks.empty()) + { + consumeStaged(); + staging_chunks.clear(); + staging_rows = 0; + staging_bytes = 0; + } + + auto status = file_writer->Close(); + if (!status.ok()) + throw Exception(ErrorCodes::UNKNOWN_EXCEPTION, "Error while closing a table: {}", status.ToString()); +} + +void ParquetBlockOutputFormat::resetFormatterImpl() +{ + file_writer.reset(); +} + void registerOutputFormatParquet(FormatFactory & factory) { factory.registerOutputFormat( diff --git a/src/Processors/Formats/Impl/ParquetBlockOutputFormat.h b/src/Processors/Formats/Impl/ParquetBlockOutputFormat.h index 0fb7013e6d6..482c778bc52 100644 --- a/src/Processors/Formats/Impl/ParquetBlockOutputFormat.h +++ b/src/Processors/Formats/Impl/ParquetBlockOutputFormat.h @@ -34,18 +34,19 @@ public: String getContentType() const override { return "application/octet-stream"; } private: + void consumeStaged(); void consume(Chunk) override; - void appendToAccumulatedChunk(Chunk chunk); - void write(Chunk chunk, size_t row_group_size); void finalizeImpl() override; void resetFormatterImpl() override; + std::vector staging_chunks; + size_t staging_rows = 0; + size_t staging_bytes = 0; + const FormatSettings format_settings; std::unique_ptr file_writer; std::unique_ptr ch_column_to_arrow_column; - - Chunk accumulated_chunk; }; } From 20b66689f3c912757540c4e91589b7ffd6fe3593 Mon Sep 17 00:00:00 2001 From: alesapin Date: Sat, 10 Jun 2023 13:34:51 +0200 Subject: [PATCH 0736/1072] Fix test --- docker/test/upgrade/run.sh | 6 ------ 1 file changed, 6 deletions(-) diff --git a/docker/test/upgrade/run.sh b/docker/test/upgrade/run.sh index bd0c59a12cd..8353d03fc69 100644 --- a/docker/test/upgrade/run.sh +++ b/docker/test/upgrade/run.sh @@ -59,12 +59,6 @@ install_packages previous_release_package_folder # available for dump via clickhouse-local configure -# local_blob_storage disk type does not exist in older versions -sudo cat /etc/clickhouse-server/config.d/storage_conf.xml \ - | sed "s|local_blob_storage|local|" \ - > /etc/clickhouse-server/config.d/storage_conf.xml.tmp -sudo mv /etc/clickhouse-server/config.d/storage_conf.xml.tmp /etc/clickhouse-server/config.d/storage_conf.xml - # it contains some new settings, but we can safely remove it rm /etc/clickhouse-server/config.d/merge_tree.xml From ff96c4c0d8898c15e1aea876267c65ec8b0c69f0 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Sat, 10 Jun 2023 12:09:47 +0000 Subject: [PATCH 0737/1072] Fix black --- tests/integration/test_non_default_compression/test.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/integration/test_non_default_compression/test.py b/tests/integration/test_non_default_compression/test.py index e1a9c1ae540..18e2eb43813 100644 --- a/tests/integration/test_non_default_compression/test.py +++ b/tests/integration/test_non_default_compression/test.py @@ -50,6 +50,7 @@ node7 = cluster.add_instance( user_configs=["configs/allow_suspicious_codecs.xml"], ) + @pytest.fixture(scope="module") def start_cluster(): try: @@ -252,6 +253,7 @@ def test_uncompressed_cache_plus_zstd_codec(start_cluster): == "10000\n" ) + def test_preconfigured_deflateqpl_codec(start_cluster): node6.query( """ @@ -268,7 +270,7 @@ def test_preconfigured_deflateqpl_codec(start_cluster): ) assert ( node6.query( - "SELECT COUNT(*) FROM compression_codec_multiple_with_key WHERE id % 2 == 0" + "SELECT COUNT(*) FROM compression_codec_multiple_with_key WHERE id % 2 == 0" ) == "2\n" ) @@ -296,9 +298,7 @@ def test_preconfigured_deflateqpl_codec(start_cluster): == "1001\n" ) assert ( - node6.query( - "SELECT SUM(somecolumn) FROM compression_codec_multiple_with_key" - ) + node6.query("SELECT SUM(somecolumn) FROM compression_codec_multiple_with_key") == str(777.777 + 88.88 + 99.99 + 1.0 * 10000) + "\n" ) assert ( From 1f3e923528c25ddab273243fa80a6d0838c568c7 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 10 Jun 2023 15:23:41 +0300 Subject: [PATCH 0738/1072] Update CHANGELOG.md --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index eb08a8ec100..7142ad26e15 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -31,7 +31,7 @@ * Introduces new keyword `INTO OUTFILE 'file.txt' APPEND`. [#48880](https://github.com/ClickHouse/ClickHouse/pull/48880) ([alekar](https://github.com/alekar)). * Added `system.zookeeper_connection` table that shows information about Keeper connections. [#45245](https://github.com/ClickHouse/ClickHouse/pull/45245) ([mateng915](https://github.com/mateng0915)). * Add new function `generateRandomStructure` that generates random table structure. It can be used in combination with table function `generateRandom`. [#47409](https://github.com/ClickHouse/ClickHouse/pull/47409) ([Kruglov Pavel](https://github.com/Avogar)). -* Allow the use of `CASE` without an `ELSE` branch and extended `transform` to deal with more types. Also fix some issues that made transform() return incorrect results when decimal types were mixed with other numeric types. [#48300](https://github.com/ClickHouse/ClickHouse/pull/48300) ([Salvatore Mesoraca](https://github.com/aiven-sal)). +* Allow the use of `CASE` without an `ELSE` branch and extended `transform` to deal with more types. Also fix some issues that made transform() return incorrect results when decimal types were mixed with other numeric types. [#48300](https://github.com/ClickHouse/ClickHouse/pull/48300) ([Salvatore Mesoraca](https://github.com/aiven-sal)). This closes #2655. This closes #9596. This closes #38666. * Added [server-side encryption using KMS keys](https://docs.aws.amazon.com/AmazonS3/latest/userguide/UsingKMSEncryption.html) with S3 tables, and the `header` setting with S3 disks. Closes [#48723](https://github.com/ClickHouse/ClickHouse/issues/48723). [#48724](https://github.com/ClickHouse/ClickHouse/pull/48724) ([Johann Gan](https://github.com/johanngan)). * Add MemoryTracker for the background tasks (merges and mutation). Introduces `merges_mutations_memory_usage_soft_limit` and `merges_mutations_memory_usage_to_ram_ratio` settings that represent the soft memory limit for merges and mutations. If this limit is reached ClickHouse won't schedule new merge or mutation tasks. Also `MergesMutationsMemoryTracking` metric is introduced to allow observing current memory usage of background tasks. Resubmit [#46089](https://github.com/ClickHouse/ClickHouse/issues/46089). Closes [#48774](https://github.com/ClickHouse/ClickHouse/issues/48774). [#48787](https://github.com/ClickHouse/ClickHouse/pull/48787) ([Dmitry Novik](https://github.com/novikd)). * Function `dotProduct` work for array. [#49050](https://github.com/ClickHouse/ClickHouse/pull/49050) ([FFFFFFFHHHHHHH](https://github.com/FFFFFFFHHHHHHH)). From 818e081162e77045468d8349ad5c438b261b02d5 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 10 Jun 2023 14:30:36 +0200 Subject: [PATCH 0739/1072] Fill gaps on the dashboard --- programs/server/dashboard.html | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/programs/server/dashboard.html b/programs/server/dashboard.html index 97b35ec97c4..951b7db3aa3 100644 --- a/programs/server/dashboard.html +++ b/programs/server/dashboard.html @@ -449,7 +449,7 @@ let queries = [ FROM system.metric_log WHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} GROUP BY t -ORDER BY t` +ORDER BY t WITH FILL STEP {rounding:UInt32}` }, { "title": "CPU Usage (cores)", @@ -457,7 +457,7 @@ ORDER BY t` FROM system.metric_log WHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} GROUP BY t -ORDER BY t` +ORDER BY t WITH FILL STEP {rounding:UInt32}` }, { "title": "Queries Running", @@ -465,7 +465,7 @@ ORDER BY t` FROM system.metric_log WHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} GROUP BY t -ORDER BY t` +ORDER BY t WITH FILL STEP {rounding:UInt32}` }, { "title": "Merges Running", @@ -473,7 +473,7 @@ ORDER BY t` FROM system.metric_log WHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} GROUP BY t -ORDER BY t` +ORDER BY t WITH FILL STEP {rounding:UInt32}` }, { "title": "Selected Bytes/second", @@ -481,7 +481,7 @@ ORDER BY t` FROM system.metric_log WHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} GROUP BY t -ORDER BY t` +ORDER BY t WITH FILL STEP {rounding:UInt32}` }, { "title": "IO Wait", @@ -489,7 +489,7 @@ ORDER BY t` FROM system.metric_log WHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} GROUP BY t -ORDER BY t` +ORDER BY t WITH FILL STEP {rounding:UInt32}` }, { "title": "CPU Wait", @@ -497,7 +497,7 @@ ORDER BY t` FROM system.metric_log WHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} GROUP BY t -ORDER BY t` +ORDER BY t WITH FILL STEP {rounding:UInt32}` }, { "title": "OS CPU Usage (Userspace)", @@ -506,7 +506,7 @@ FROM system.asynchronous_metric_log WHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} AND metric = 'OSUserTimeNormalized' GROUP BY t -ORDER BY t` +ORDER BY t WITH FILL STEP {rounding:UInt32}` }, { "title": "OS CPU Usage (Kernel)", @@ -515,7 +515,7 @@ FROM system.asynchronous_metric_log WHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} AND metric = 'OSSystemTimeNormalized' GROUP BY t -ORDER BY t` +ORDER BY t WITH FILL STEP {rounding:UInt32}` }, { "title": "Read From Disk", @@ -523,7 +523,7 @@ ORDER BY t` FROM system.metric_log WHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} GROUP BY t -ORDER BY t` +ORDER BY t WITH FILL STEP {rounding:UInt32}` }, { "title": "Read From Filesystem", @@ -531,7 +531,7 @@ ORDER BY t` FROM system.metric_log WHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} GROUP BY t -ORDER BY t` +ORDER BY t WITH FILL STEP {rounding:UInt32}` }, { "title": "Memory (tracked)", @@ -539,7 +539,7 @@ ORDER BY t` FROM system.metric_log WHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} GROUP BY t -ORDER BY t` +ORDER BY t WITH FILL STEP {rounding:UInt32}` }, { "title": "Load Average (15 minutes)", @@ -548,7 +548,7 @@ FROM system.asynchronous_metric_log WHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} AND metric = 'LoadAverage15' GROUP BY t -ORDER BY t` +ORDER BY t WITH FILL STEP {rounding:UInt32}` }, { "title": "Selected Rows/second", @@ -556,7 +556,7 @@ ORDER BY t` FROM system.metric_log WHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} GROUP BY t -ORDER BY t` +ORDER BY t WITH FILL STEP {rounding:UInt32}` }, { "title": "Inserted Rows/second", @@ -564,7 +564,7 @@ ORDER BY t` FROM system.metric_log WHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} GROUP BY t -ORDER BY t` +ORDER BY t WITH FILL STEP {rounding:UInt32}` }, { "title": "Total MergeTree Parts", @@ -573,7 +573,7 @@ FROM system.asynchronous_metric_log WHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} AND metric = 'TotalPartsOfMergeTreeTables' GROUP BY t -ORDER BY t` +ORDER BY t WITH FILL STEP {rounding:UInt32}` }, { "title": "Max Parts For Partition", @@ -582,7 +582,7 @@ FROM system.asynchronous_metric_log WHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} AND metric = 'MaxPartCountForPartition' GROUP BY t -ORDER BY t` +ORDER BY t WITH FILL STEP {rounding:UInt32}` } ]; From dc4a2fb07d29a4fe5818a42aa6c17be9bd59ccb7 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 10 Jun 2023 14:47:24 +0200 Subject: [PATCH 0740/1072] Add tests for function "transform" --- .../02786_transform_float.reference | 10 +++++ .../0_stateless/02786_transform_float.sql | 3 ++ .../02787_transform_null.reference | 9 +++++ .../0_stateless/02787_transform_null.sql | 40 +++++++++++++++++++ 4 files changed, 62 insertions(+) create mode 100644 tests/queries/0_stateless/02786_transform_float.reference create mode 100644 tests/queries/0_stateless/02786_transform_float.sql create mode 100644 tests/queries/0_stateless/02787_transform_null.reference create mode 100644 tests/queries/0_stateless/02787_transform_null.sql diff --git a/tests/queries/0_stateless/02786_transform_float.reference b/tests/queries/0_stateless/02786_transform_float.reference new file mode 100644 index 00000000000..3fbb2492f2e --- /dev/null +++ b/tests/queries/0_stateless/02786_transform_float.reference @@ -0,0 +1,10 @@ +1 +1 +1 +--- +1 +1 +1 +1 +1 +1 diff --git a/tests/queries/0_stateless/02786_transform_float.sql b/tests/queries/0_stateless/02786_transform_float.sql new file mode 100644 index 00000000000..4229425b084 --- /dev/null +++ b/tests/queries/0_stateless/02786_transform_float.sql @@ -0,0 +1,3 @@ +select transform(number, [1], [toFloat32(1)], toFloat32(1)) from numbers(3); +SELECT '---'; +select transform(number, [3], [toFloat32(1)], toFloat32(1)) from numbers(6); diff --git a/tests/queries/0_stateless/02787_transform_null.reference b/tests/queries/0_stateless/02787_transform_null.reference new file mode 100644 index 00000000000..a650dbbd173 --- /dev/null +++ b/tests/queries/0_stateless/02787_transform_null.reference @@ -0,0 +1,9 @@ +ZERO +ZERO +ONE +ONE +a +a + \N 0 \N 0 \N +1 1 1 \N 1 1 +a \N 3 3 3 3 diff --git a/tests/queries/0_stateless/02787_transform_null.sql b/tests/queries/0_stateless/02787_transform_null.sql new file mode 100644 index 00000000000..64a771f0f4b --- /dev/null +++ b/tests/queries/0_stateless/02787_transform_null.sql @@ -0,0 +1,40 @@ +SELECT transform(0, [0, 1], ['ZERO', 'ONE'], 'DEFAULT') AS result; +SELECT transform(0, [0, 1], ['ZERO', 'ONE'], NULL) AS result; + +SELECT CASE 1 + WHEN 0 THEN 'ZERO' + WHEN 1 THEN 'ONE' + ELSE 'NONE' +END AS result; + +SELECT CASE 1 + WHEN 0 THEN NULL + WHEN 1 THEN 'ONE' + ELSE 'NONE' +END AS result; + +select + case 1 + when 1 then 'a' + else 'b' + end value; + +select + case 1 + when 1 then 'a' + end value; + +SELECT + d, + toInt16OrNull(d), + caseWithExpression(d, 'a', 3, toInt16OrZero(d)) AS case_zero, + caseWithExpression(d, 'a', 3, toInt16OrNull(d)) AS case_null, + if(d = 'a', 3, toInt16OrZero(d)) AS if_zero, + if(d = 'a', 3, toInt16OrNull(d)) AS if_null +FROM +( + SELECT arrayJoin(['', '1', 'a']) AS d +) +ORDER BY + case_zero ASC, + d ASC; From a614aa3b03b365c769f955f22b788181a8cf36ab Mon Sep 17 00:00:00 2001 From: alesapin Date: Sat, 10 Jun 2023 16:06:37 +0200 Subject: [PATCH 0741/1072] More leftovers --- docker/test/upgrade/run.sh | 5 ----- 1 file changed, 5 deletions(-) diff --git a/docker/test/upgrade/run.sh b/docker/test/upgrade/run.sh index 8353d03fc69..6f7d3999f1d 100644 --- a/docker/test/upgrade/run.sh +++ b/docker/test/upgrade/run.sh @@ -86,11 +86,6 @@ export USE_S3_STORAGE_FOR_MERGE_TREE=1 export ZOOKEEPER_FAULT_INJECTION=0 configure -sudo cat /etc/clickhouse-server/config.d/storage_conf.xml \ - | sed "s|local_blob_storage|local|" \ - > /etc/clickhouse-server/config.d/storage_conf.xml.tmp -sudo mv /etc/clickhouse-server/config.d/storage_conf.xml.tmp /etc/clickhouse-server/config.d/storage_conf.xml - # it contains some new settings, but we can safely remove it rm /etc/clickhouse-server/config.d/merge_tree.xml From cb8c20722b8976fe0bc402498667b02c2585cc02 Mon Sep 17 00:00:00 2001 From: tpanetti Date: Sat, 10 Jun 2023 08:35:51 -0700 Subject: [PATCH 0742/1072] Rename setting and description for MySQL compatible types This renames the setting for MySQL compatible types from output_format_mysql_types to use_mysql_types_in_show_columns --- src/Core/Settings.h | 2 +- src/Storages/System/StorageSystemColumns.cpp | 4 ++-- .../0_stateless/02775_show_columns_mysql_compatibility.sh | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index b72fc037fbb..d47015ebb39 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -196,7 +196,7 @@ class IColumn; M(Bool, allow_experimental_inverted_index, false, "If it is set to true, allow to use experimental inverted index.", 0) \ \ M(UInt64, mysql_max_rows_to_insert, 65536, "The maximum number of rows in MySQL batch insertion of the MySQL storage engine", 0) \ - M(Bool, output_format_mysql_types, false, "Use MySQL converted types when connected via MySQL compatibility", 0) \ + M(Bool, use_mysql_types_in_show_columns, false, "Use MySQL converted types when connected via MySQL compatibility for show columns query", 0) \ \ M(UInt64, optimize_min_equality_disjunction_chain_length, 3, "The minimum length of the expression `expr = x1 OR ... expr = xN` for optimization ", 0) \ \ diff --git a/src/Storages/System/StorageSystemColumns.cpp b/src/Storages/System/StorageSystemColumns.cpp index 684c35709a4..e4ca6a15138 100644 --- a/src/Storages/System/StorageSystemColumns.cpp +++ b/src/Storages/System/StorageSystemColumns.cpp @@ -75,7 +75,7 @@ public: , columns_mask(std::move(columns_mask_)), max_block_size(max_block_size_) , databases(std::move(databases_)), tables(std::move(tables_)), storages(std::move(storages_)) , client_info_interface(context->getClientInfo().interface) - , use_mysql_types(context->getSettingsRef().output_format_mysql_types) + , use_mysql_types(context->getSettingsRef().use_mysql_types_in_show_columns) , total_tables(tables->size()), access(context->getAccess()) , query_id(context->getCurrentQueryId()), lock_acquire_timeout(context->getSettingsRef().lock_acquire_timeout) { @@ -133,7 +133,7 @@ protected: auto get_type_name = [this](const IDataType& type) -> std::string { - // Check if the output_format_mysql_types setting is enabled and client is connected via MySQL protocol + // Check if the use_mysql_types_in_show_columns setting is enabled and client is connected via MySQL protocol if (use_mysql_types && client_info_interface == DB::ClientInfo::Interface::MYSQL) { return type.getSQLCompatibleName(); diff --git a/tests/queries/0_stateless/02775_show_columns_mysql_compatibility.sh b/tests/queries/0_stateless/02775_show_columns_mysql_compatibility.sh index 6a546c47a38..51c9da2a842 100755 --- a/tests/queries/0_stateless/02775_show_columns_mysql_compatibility.sh +++ b/tests/queries/0_stateless/02775_show_columns_mysql_compatibility.sh @@ -127,7 +127,7 @@ TEMP_FILE=$(mktemp) cat < $TEMP_FILE SHOW COLUMNS FROM tab; -SET output_format_mysql_types=1; +SET use_mysql_types_in_show_columns=1; SHOW COLUMNS FROM tab; SHOW EXTENDED COLUMNS FROM tab; SHOW FULL COLUMNS FROM tab; From 7cb6f3c72279878db1ad65c5ea9670287cc42d16 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 10 Jun 2023 17:43:42 +0200 Subject: [PATCH 0743/1072] Rename Cpu to CPU and Cfs to CFS --- src/Common/AsynchronousMetrics.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Common/AsynchronousMetrics.cpp b/src/Common/AsynchronousMetrics.cpp index 6821647a180..531f0b04aa1 100644 --- a/src/Common/AsynchronousMetrics.cpp +++ b/src/Common/AsynchronousMetrics.cpp @@ -962,8 +962,8 @@ void AsynchronousMetrics::update(TimePoint update_time) period = std::stoull(field2); } - new_values["CGroupCpuCfsPeriod"] = { period, "The CFS period of CPU cgroup."}; - new_values["CGroupCpuCfsQuota"] = { quota, "The CFS quota of CPU cgroup. If stated zero, the quota is max."}; + new_values["CGroupCPUCFSPeriod"] = { period, "The CFS period of CPU cgroup."}; + new_values["CGroupCPUCFSQuota"] = { quota, "The CFS quota of CPU cgroup. If stated zero, the quota is max."}; } catch (...) { @@ -982,8 +982,8 @@ void AsynchronousMetrics::update(TimePoint update_time) tryReadText(quota, *cgroupcpu_cfs_quota); tryReadText(period, *cgroupcpu_cfs_period); - new_values["CGroupCpuCfsPeriod"] = { period, "The CFS period of CPU cgroup."}; - new_values["CGroupCpuCfsQuota"] = { quota, "The CFS quota of CPU cgroup. If stated zero, the quota is max."}; + new_values["CGroupCPUCFSPeriod"] = { period, "The CFS period of CPU cgroup."}; + new_values["CGroupCPUCFSQuota"] = { quota, "The CFS quota of CPU cgroup. If stated zero, the quota is max."}; } catch (...) { From e785c6796d8c1694e0f820e15772fa59bf31cbf0 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 10 Jun 2023 17:54:46 +0200 Subject: [PATCH 0744/1072] Replace CGroups CPU metrics to one --- src/Common/AsynchronousMetrics.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/Common/AsynchronousMetrics.cpp b/src/Common/AsynchronousMetrics.cpp index 531f0b04aa1..c610034a6b0 100644 --- a/src/Common/AsynchronousMetrics.cpp +++ b/src/Common/AsynchronousMetrics.cpp @@ -939,7 +939,8 @@ void AsynchronousMetrics::update(TimePoint update_time) if (cgroupcpu_max) { - try { + try + { cgroupcpu_max->rewind(); uint64_t quota = 0; @@ -962,8 +963,7 @@ void AsynchronousMetrics::update(TimePoint update_time) period = std::stoull(field2); } - new_values["CGroupCPUCFSPeriod"] = { period, "The CFS period of CPU cgroup."}; - new_values["CGroupCPUCFSQuota"] = { quota, "The CFS quota of CPU cgroup. If stated zero, the quota is max."}; + new_values["CGroupMaxCPU"] = { static_cast(quota) / period, "The maximum number of CPU cores according to CGroups."}; } catch (...) { @@ -972,7 +972,8 @@ void AsynchronousMetrics::update(TimePoint update_time) } else if (cgroupcpu_cfs_quota && cgroupcpu_cfs_period) { - try { + try + { cgroupcpu_cfs_quota->rewind(); cgroupcpu_cfs_period->rewind(); @@ -982,8 +983,7 @@ void AsynchronousMetrics::update(TimePoint update_time) tryReadText(quota, *cgroupcpu_cfs_quota); tryReadText(period, *cgroupcpu_cfs_period); - new_values["CGroupCPUCFSPeriod"] = { period, "The CFS period of CPU cgroup."}; - new_values["CGroupCPUCFSQuota"] = { quota, "The CFS quota of CPU cgroup. If stated zero, the quota is max."}; + new_values["CGroupMaxCPU"] = { static_cast(quota) / period, "The maximum number of CPU cores according to CGroups."}; } catch (...) { From a91b84e60c1904bdd6fe7bdfd82869ad07e6bb94 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 10 Jun 2023 18:00:47 +0200 Subject: [PATCH 0745/1072] Slightly better --- src/Common/AsynchronousMetrics.cpp | 116 +++++++++++++++-------------- 1 file changed, 62 insertions(+), 54 deletions(-) diff --git a/src/Common/AsynchronousMetrics.cpp b/src/Common/AsynchronousMetrics.cpp index c610034a6b0..de9800aa896 100644 --- a/src/Common/AsynchronousMetrics.cpp +++ b/src/Common/AsynchronousMetrics.cpp @@ -727,6 +727,68 @@ void AsynchronousMetrics::update(TimePoint update_time) } } + Float64 max_cpu_cgroups = 0; + if (cgroupcpu_max) + { + try + { + cgroupcpu_max->rewind(); + + uint64_t quota = 0; + uint64_t period = 0; + + std::string line; + readText(line, *cgroupcpu_max); + + auto space = line.find(' '); + + if (line.rfind("max", space) == std::string::npos) + { + auto field1 = line.substr(0, space); + quota = std::stoull(field1); + } + + if (space != std::string::npos) + { + auto field2 = line.substr(space + 1); + period = std::stoull(field2); + } + + if (quota > 0 && period > 0) + max_cpu_cgroups = static_cast(quota) / period; + } + catch (...) + { + tryLogCurrentException(__PRETTY_FUNCTION__); + } + } + else if (cgroupcpu_cfs_quota && cgroupcpu_cfs_period) + { + try + { + cgroupcpu_cfs_quota->rewind(); + cgroupcpu_cfs_period->rewind(); + + uint64_t quota = 0; + uint64_t period = 0; + + tryReadText(quota, *cgroupcpu_cfs_quota); + tryReadText(period, *cgroupcpu_cfs_period); + + if (quota > 0 && period > 0) + max_cpu_cgroups = static_cast(quota) / period; + } + catch (...) + { + tryLogCurrentException(__PRETTY_FUNCTION__); + } + } + + if (max_cpu_cgroups > 0) + { + new_values["CGroupMaxCPU"] = { max_cpu_cgroups, "The maximum number of CPU cores according to CGroups."}; + } + if (proc_stat) { try @@ -937,60 +999,6 @@ void AsynchronousMetrics::update(TimePoint update_time) } } - if (cgroupcpu_max) - { - try - { - cgroupcpu_max->rewind(); - - uint64_t quota = 0; - uint64_t period = 0; - - std::string line; - readText(line, *cgroupcpu_max); - - auto space = line.find(' '); - - if (line.rfind("max", space) == std::string::npos) - { - auto field1 = line.substr(0, space); - quota = std::stoull(field1); - } - - if (space != std::string::npos) - { - auto field2 = line.substr(space + 1); - period = std::stoull(field2); - } - - new_values["CGroupMaxCPU"] = { static_cast(quota) / period, "The maximum number of CPU cores according to CGroups."}; - } - catch (...) - { - tryLogCurrentException(__PRETTY_FUNCTION__); - } - } - else if (cgroupcpu_cfs_quota && cgroupcpu_cfs_period) - { - try - { - cgroupcpu_cfs_quota->rewind(); - cgroupcpu_cfs_period->rewind(); - - uint64_t quota = 0; - uint64_t period = 0; - - tryReadText(quota, *cgroupcpu_cfs_quota); - tryReadText(period, *cgroupcpu_cfs_period); - - new_values["CGroupMaxCPU"] = { static_cast(quota) / period, "The maximum number of CPU cores according to CGroups."}; - } - catch (...) - { - tryLogCurrentException(__PRETTY_FUNCTION__); - } - } - if (meminfo) { try From ddd2257cf51edf0cf0fb264b1d010e7436d7e94b Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 10 Jun 2023 18:03:17 +0200 Subject: [PATCH 0746/1072] Normalize with respect to CGroups --- src/Common/AsynchronousMetrics.cpp | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/src/Common/AsynchronousMetrics.cpp b/src/Common/AsynchronousMetrics.cpp index de9800aa896..36c87010fa5 100644 --- a/src/Common/AsynchronousMetrics.cpp +++ b/src/Common/AsynchronousMetrics.cpp @@ -933,36 +933,38 @@ void AsynchronousMetrics::update(TimePoint update_time) /// Also write values normalized to 0..1 by diving to the number of CPUs. /// These values are good to be averaged across the cluster of non-uniform servers. - if (num_cpus) + Float64 num_cpus_to_normalize = max_cpu_cgroups > 0 ? max_cpu_cgroups : num_cpus; + + if (num_cpus_to_normalize > 0) { - new_values["OSUserTimeNormalized"] = { delta_values_all_cpus.user * multiplier / num_cpus, + new_values["OSUserTimeNormalized"] = { delta_values_all_cpus.user * multiplier / num_cpus_to_normalize, "The value is similar to `OSUserTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores." " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."}; - new_values["OSNiceTimeNormalized"] = { delta_values_all_cpus.nice * multiplier / num_cpus, + new_values["OSNiceTimeNormalized"] = { delta_values_all_cpus.nice * multiplier / num_cpus_to_normalize, "The value is similar to `OSNiceTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores." " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."}; - new_values["OSSystemTimeNormalized"] = { delta_values_all_cpus.system * multiplier / num_cpus, + new_values["OSSystemTimeNormalized"] = { delta_values_all_cpus.system * multiplier / num_cpus_to_normalize, "The value is similar to `OSSystemTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores." " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."}; - new_values["OSIdleTimeNormalized"] = { delta_values_all_cpus.idle * multiplier / num_cpus, + new_values["OSIdleTimeNormalized"] = { delta_values_all_cpus.idle * multiplier / num_cpus_to_normalize, "The value is similar to `OSIdleTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores." " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."}; - new_values["OSIOWaitTimeNormalized"] = { delta_values_all_cpus.iowait * multiplier / num_cpus, + new_values["OSIOWaitTimeNormalized"] = { delta_values_all_cpus.iowait * multiplier / num_cpus_to_normalize, "The value is similar to `OSIOWaitTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores." " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."}; - new_values["OSIrqTimeNormalized"] = { delta_values_all_cpus.irq * multiplier / num_cpus, + new_values["OSIrqTimeNormalized"] = { delta_values_all_cpus.irq * multiplier / num_cpus_to_normalize, "The value is similar to `OSIrqTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores." " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."}; - new_values["OSSoftIrqTimeNormalized"] = { delta_values_all_cpus.softirq * multiplier / num_cpus, + new_values["OSSoftIrqTimeNormalized"] = { delta_values_all_cpus.softirq * multiplier / num_cpus_to_normalize, "The value is similar to `OSSoftIrqTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores." " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."}; - new_values["OSStealTimeNormalized"] = { delta_values_all_cpus.steal * multiplier / num_cpus, + new_values["OSStealTimeNormalized"] = { delta_values_all_cpus.steal * multiplier / num_cpus_to_normalize, "The value is similar to `OSStealTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores." " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."}; - new_values["OSGuestTimeNormalized"] = { delta_values_all_cpus.guest * multiplier / num_cpus, + new_values["OSGuestTimeNormalized"] = { delta_values_all_cpus.guest * multiplier / num_cpus_to_normalize, "The value is similar to `OSGuestTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores." " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."}; - new_values["OSGuestNiceTimeNormalized"] = { delta_values_all_cpus.guest_nice * multiplier / num_cpus, + new_values["OSGuestNiceTimeNormalized"] = { delta_values_all_cpus.guest_nice * multiplier / num_cpus_to_normalize, "The value is similar to `OSGuestNiceTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores." " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."}; } From 72b9d75a84476d7ee402de8a160f8a22b9ccdb59 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Sat, 10 Jun 2023 12:53:02 +0000 Subject: [PATCH 0747/1072] Add compat setting for non-const timezones SQL function toTimezone() converts a Date or DateTime into another timezone. The problem is that the timezone is part of the Date / DateTime type but not part of the internal representation (value). This led to the fact that toTimeZone() wqith non-const timezones produced wrong and misleading results until #48471 (shipped with v23.4) enforced a const timezone. Unfortunately, this PR also broke existing table definitions with non-const timezones, e.g. in ALIAS expressions. So while #48471 addressed the issue appropriately, it is really backwards-incompatible. This PR adds a setting to toggle the behavior and makes it also part of the compatibility profile. --- src/Core/Settings.h | 1 + src/Core/SettingsChangesHistory.h | 1 + .../FunctionDateOrDateTimeAddInterval.h | 4 +- .../FunctionDateOrDateTimeToDateOrDate32.h | 2 +- ...tionDateOrDateTimeToDateTimeOrDateTime64.h | 2 +- .../FunctionDateOrDateTimeToSomething.h | 4 +- src/Functions/FunctionSnowflake.h | 17 ++++-- src/Functions/FunctionUnixTimestamp64.h | 13 +++-- src/Functions/FunctionsConversion.h | 10 ++-- src/Functions/FunctionsTimeWindow.cpp | 4 +- src/Functions/date_trunc.cpp | 2 +- .../extractTimeZoneFromFunctionArguments.cpp | 5 +- .../extractTimeZoneFromFunctionArguments.h | 10 +++- src/Functions/fromUnixTimestamp64Micro.cpp | 4 +- src/Functions/fromUnixTimestamp64Milli.cpp | 4 +- src/Functions/fromUnixTimestamp64Nano.cpp | 4 +- src/Functions/now.cpp | 12 +++-- src/Functions/now64.cpp | 10 +++- src/Functions/nowInBlock.cpp | 12 +++-- src/Functions/snowflake.cpp | 8 +-- src/Functions/timeSlots.cpp | 4 +- src/Functions/toStartOfInterval.cpp | 4 +- src/Functions/toTimezone.cpp | 11 +++- .../00515_enhanced_time_zones.reference | 7 +++ .../0_stateless/00515_enhanced_time_zones.sql | 54 ++++++++++++++----- 25 files changed, 147 insertions(+), 62 deletions(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 464b9168a4c..d2e6a470a91 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -147,6 +147,7 @@ class IColumn; M(Bool, enable_memory_bound_merging_of_aggregation_results, true, "Enable memory bound merging strategy for aggregation.", 0) \ M(Bool, enable_positional_arguments, true, "Enable positional arguments in ORDER BY, GROUP BY and LIMIT BY", 0) \ M(Bool, enable_extended_results_for_datetime_functions, false, "Enable date functions like toLastDayOfMonth return Date32 results (instead of Date results) for Date32/DateTime64 arguments.", 0) \ + M(Bool, allow_nonconst_timezone_arguments, false, "Allow non-const timezone arguments in certain time-related functions like toTimeZone(), fromUnixTimestamp*(), snowflakeToDateTime*()", 0) \ \ M(Bool, group_by_use_nulls, false, "Treat columns mentioned in ROLLUP, CUBE or GROUPING SETS as Nullable", 0) \ \ diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index c0f10b13282..9fd45ac16d6 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -85,6 +85,7 @@ static std::map sett {"use_with_fill_by_sorting_prefix", false, true, "Columns preceding WITH FILL columns in ORDER BY clause form sorting prefix. Rows with different values in sorting prefix are filled independently"}, {"output_format_parquet_compliant_nested_types", false, true, "Change an internal field name in output Parquet file schema."}}}, {"23.4", {{"allow_suspicious_indices", true, false, "If true, index can defined with identical expressions"}, + {"allow_nonconst_timezone_arguments", true, false, "Allow non-const timezone arguments in certain time-related functions like toTimeZone(), fromUnixTimestamp*(), snowflakeToDateTime*()."}, {"connect_timeout_with_failover_ms", 50, 1000, "Increase default connect timeout because of async connect"}, {"connect_timeout_with_failover_secure_ms", 100, 1000, "Increase default secure connect timeout because of async connect"}, {"hedged_connection_timeout_ms", 100, 50, "Start new connection in hedged requests after 50 ms instead of 100 to correspond with previous connect timeout"}}}, diff --git a/src/Functions/FunctionDateOrDateTimeAddInterval.h b/src/Functions/FunctionDateOrDateTimeAddInterval.h index 507dc37e266..1546c24d30c 100644 --- a/src/Functions/FunctionDateOrDateTimeAddInterval.h +++ b/src/Functions/FunctionDateOrDateTimeAddInterval.h @@ -679,7 +679,7 @@ public: } else if constexpr (std::is_same_v) { - return std::make_shared(extractTimeZoneNameFromFunctionArguments(arguments, 2, 0)); + return std::make_shared(extractTimeZoneNameFromFunctionArguments(arguments, 2, 0, false)); } else if constexpr (std::is_same_v) { @@ -696,7 +696,7 @@ public: return {}; }); - auto timezone = extractTimeZoneNameFromFunctionArguments(arguments, 2, 0); + auto timezone = extractTimeZoneNameFromFunctionArguments(arguments, 2, 0, false); if (const auto* datetime64_type = typeid_cast(arguments[0].type.get())) { const auto from_scale = datetime64_type->getScale(); diff --git a/src/Functions/FunctionDateOrDateTimeToDateOrDate32.h b/src/Functions/FunctionDateOrDateTimeToDateOrDate32.h index 8e006b93b98..6eb3e534b62 100644 --- a/src/Functions/FunctionDateOrDateTimeToDateOrDate32.h +++ b/src/Functions/FunctionDateOrDateTimeToDateOrDate32.h @@ -36,7 +36,7 @@ public: /// If the time zone is specified but empty, throw an exception. /// only validate the time_zone part if the number of arguments is 2. if ((which.isDateTime() || which.isDateTime64()) && arguments.size() == 2 - && extractTimeZoneNameFromFunctionArguments(arguments, 1, 0).empty()) + && extractTimeZoneNameFromFunctionArguments(arguments, 1, 0, false).empty()) throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Function {} supports a 2nd argument (optional) that must be a valid time zone", this->getName()); diff --git a/src/Functions/FunctionDateOrDateTimeToDateTimeOrDateTime64.h b/src/Functions/FunctionDateOrDateTimeToDateTimeOrDateTime64.h index 3d1f0f192cf..9f1066fd687 100644 --- a/src/Functions/FunctionDateOrDateTimeToDateTimeOrDateTime64.h +++ b/src/Functions/FunctionDateOrDateTimeToDateTimeOrDateTime64.h @@ -34,7 +34,7 @@ public: WhichDataType which(from_type); - std::string time_zone = extractTimeZoneNameFromFunctionArguments(arguments, 1, 0); + std::string time_zone = extractTimeZoneNameFromFunctionArguments(arguments, 1, 0, false); /// If the time zone is specified but empty, throw an exception. /// only validate the time_zone part if the number of arguments is 2. diff --git a/src/Functions/FunctionDateOrDateTimeToSomething.h b/src/Functions/FunctionDateOrDateTimeToSomething.h index 47433d13e0b..82818cc3d2b 100644 --- a/src/Functions/FunctionDateOrDateTimeToSomething.h +++ b/src/Functions/FunctionDateOrDateTimeToSomething.h @@ -24,7 +24,7 @@ public: /// If the time zone is specified but empty, throw an exception. if constexpr (std::is_same_v) { - std::string time_zone = extractTimeZoneNameFromFunctionArguments(arguments, 1, 0); + std::string time_zone = extractTimeZoneNameFromFunctionArguments(arguments, 1, 0, false); /// only validate the time_zone part if the number of arguments is 2. This is mainly /// to accommodate functions like toStartOfDay(today()), toStartOfDay(yesterday()) etc. if (arguments.size() == 2 && time_zone.empty()) @@ -53,7 +53,7 @@ public: scale = std::max(source_scale, static_cast(9)); } - return std::make_shared(scale, extractTimeZoneNameFromFunctionArguments(arguments, 1, 0)); + return std::make_shared(scale, extractTimeZoneNameFromFunctionArguments(arguments, 1, 0, false)); } else return std::make_shared(); diff --git a/src/Functions/FunctionSnowflake.h b/src/Functions/FunctionSnowflake.h index 998db98890a..ce3a48269b4 100644 --- a/src/Functions/FunctionSnowflake.h +++ b/src/Functions/FunctionSnowflake.h @@ -6,6 +6,7 @@ #include #include #include +#include #include @@ -72,9 +73,13 @@ class FunctionSnowflakeToDateTime : public IFunction { private: const char * name; + const bool allow_nonconst_timezone_arguments; public: - explicit FunctionSnowflakeToDateTime(const char * name_) : name(name_) { } + explicit FunctionSnowflakeToDateTime(const char * name_, ContextPtr context) + : name(name_) + , allow_nonconst_timezone_arguments(context->getSettings().allow_nonconst_timezone_arguments) + {} String getName() const override { return name; } size_t getNumberOfArguments() const override { return 0; } @@ -92,7 +97,7 @@ public: std::string timezone; if (arguments.size() == 2) - timezone = extractTimeZoneNameFromFunctionArguments(arguments, 1, 0); + timezone = extractTimeZoneNameFromFunctionArguments(arguments, 1, 0, allow_nonconst_timezone_arguments); return std::make_shared(timezone); } @@ -162,9 +167,13 @@ class FunctionSnowflakeToDateTime64 : public IFunction { private: const char * name; + const bool allow_nonconst_timezone_arguments; public: - explicit FunctionSnowflakeToDateTime64(const char * name_) : name(name_) { } + explicit FunctionSnowflakeToDateTime64(const char * name_, ContextPtr context) + : name(name_) + , allow_nonconst_timezone_arguments(context->getSettings().allow_nonconst_timezone_arguments) + {} String getName() const override { return name; } size_t getNumberOfArguments() const override { return 0; } @@ -182,7 +191,7 @@ public: std::string timezone; if (arguments.size() == 2) - timezone = extractTimeZoneNameFromFunctionArguments(arguments, 1, 0); + timezone = extractTimeZoneNameFromFunctionArguments(arguments, 1, 0, allow_nonconst_timezone_arguments); return std::make_shared(3, timezone); } diff --git a/src/Functions/FunctionUnixTimestamp64.h b/src/Functions/FunctionUnixTimestamp64.h index 7519e46f9dc..58a23f7266e 100644 --- a/src/Functions/FunctionUnixTimestamp64.h +++ b/src/Functions/FunctionUnixTimestamp64.h @@ -6,6 +6,7 @@ #include #include #include +#include #include @@ -99,11 +100,13 @@ class FunctionFromUnixTimestamp64 : public IFunction private: size_t target_scale; const char * name; + const bool allow_nonconst_timezone_arguments; public: - FunctionFromUnixTimestamp64(size_t target_scale_, const char * name_) - : target_scale(target_scale_), name(name_) - { - } + FunctionFromUnixTimestamp64(size_t target_scale_, const char * name_, ContextPtr context) + : target_scale(target_scale_) + , name(name_) + , allow_nonconst_timezone_arguments(context->getSettings().allow_nonconst_timezone_arguments) + {} String getName() const override { return name; } size_t getNumberOfArguments() const override { return 0; } @@ -121,7 +124,7 @@ public: std::string timezone; if (arguments.size() == 2) - timezone = extractTimeZoneNameFromFunctionArguments(arguments, 1, 0); + timezone = extractTimeZoneNameFromFunctionArguments(arguments, 1, 0, allow_nonconst_timezone_arguments); return std::make_shared(target_scale, timezone); } diff --git a/src/Functions/FunctionsConversion.h b/src/Functions/FunctionsConversion.h index 940585d6d57..87229b8ad04 100644 --- a/src/Functions/FunctionsConversion.h +++ b/src/Functions/FunctionsConversion.h @@ -1796,13 +1796,13 @@ public: if (to_datetime64 || scale != 0) /// toDateTime('xxxx-xx-xx xx:xx:xx', 0) return DateTime return std::make_shared(scale, - extractTimeZoneNameFromFunctionArguments(arguments, timezone_arg_position, 0)); + extractTimeZoneNameFromFunctionArguments(arguments, timezone_arg_position, 0, false)); - return std::make_shared(extractTimeZoneNameFromFunctionArguments(arguments, timezone_arg_position, 0)); + return std::make_shared(extractTimeZoneNameFromFunctionArguments(arguments, timezone_arg_position, 0, false)); } if constexpr (std::is_same_v) - return std::make_shared(extractTimeZoneNameFromFunctionArguments(arguments, timezone_arg_position, 0)); + return std::make_shared(extractTimeZoneNameFromFunctionArguments(arguments, timezone_arg_position, 0, false)); else if constexpr (std::is_same_v) throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected branch in code of conversion function: it is a bug."); else @@ -2067,7 +2067,7 @@ public: UInt64 scale = to_datetime64 ? DataTypeDateTime64::default_scale : 0; if (arguments.size() > 1) scale = extractToDecimalScale(arguments[1]); - const auto timezone = extractTimeZoneNameFromFunctionArguments(arguments, 2, 0); + const auto timezone = extractTimeZoneNameFromFunctionArguments(arguments, 2, 0, false); res = scale == 0 ? res = std::make_shared(timezone) : std::make_shared(scale, timezone); } @@ -2117,7 +2117,7 @@ public: } if constexpr (std::is_same_v) - res = std::make_shared(extractTimeZoneNameFromFunctionArguments(arguments, 1, 0)); + res = std::make_shared(extractTimeZoneNameFromFunctionArguments(arguments, 1, 0, false)); else if constexpr (std::is_same_v) throw Exception(ErrorCodes::LOGICAL_ERROR, "LOGICAL ERROR: It is a bug."); else if constexpr (to_decimal) diff --git a/src/Functions/FunctionsTimeWindow.cpp b/src/Functions/FunctionsTimeWindow.cpp index 8a57a4da692..231e8b6fa77 100644 --- a/src/Functions/FunctionsTimeWindow.cpp +++ b/src/Functions/FunctionsTimeWindow.cpp @@ -138,7 +138,7 @@ struct TimeWindowImpl if (result_type_is_date) data_type = std::make_shared(); else - data_type = std::make_shared(extractTimeZoneNameFromFunctionArguments(arguments, 2, 0)); + data_type = std::make_shared(extractTimeZoneNameFromFunctionArguments(arguments, 2, 0, false)); return std::make_shared(DataTypes{data_type, data_type}); } @@ -322,7 +322,7 @@ struct TimeWindowImpl if (result_type_is_date) data_type = std::make_shared(); else - data_type = std::make_shared(extractTimeZoneNameFromFunctionArguments(arguments, 3, 0)); + data_type = std::make_shared(extractTimeZoneNameFromFunctionArguments(arguments, 3, 0, false)); return std::make_shared(DataTypes{data_type, data_type}); } diff --git a/src/Functions/date_trunc.cpp b/src/Functions/date_trunc.cpp index 016b8f4da5e..414512fc4f8 100644 --- a/src/Functions/date_trunc.cpp +++ b/src/Functions/date_trunc.cpp @@ -107,7 +107,7 @@ public: if (result_type_is_date) return std::make_shared(); else - return std::make_shared(extractTimeZoneNameFromFunctionArguments(arguments, 2, 1)); + return std::make_shared(extractTimeZoneNameFromFunctionArguments(arguments, 2, 1, false)); } bool useDefaultImplementationForConstants() const override { return true; } diff --git a/src/Functions/extractTimeZoneFromFunctionArguments.cpp b/src/Functions/extractTimeZoneFromFunctionArguments.cpp index 7ed240fdbcf..7168c68c9c9 100644 --- a/src/Functions/extractTimeZoneFromFunctionArguments.cpp +++ b/src/Functions/extractTimeZoneFromFunctionArguments.cpp @@ -30,10 +30,11 @@ std::string extractTimeZoneNameFromColumn(const IColumn * column, const String & } -std::string extractTimeZoneNameFromFunctionArguments(const ColumnsWithTypeAndName & arguments, size_t time_zone_arg_num, size_t datetime_arg_num) +std::string extractTimeZoneNameFromFunctionArguments(const ColumnsWithTypeAndName & arguments, size_t time_zone_arg_num, size_t datetime_arg_num, bool allow_nonconst_timezone_arguments) { /// Explicit time zone may be passed in last argument. - if (arguments.size() == time_zone_arg_num + 1) + if ((arguments.size() == time_zone_arg_num + 1) + && (!allow_nonconst_timezone_arguments || arguments[time_zone_arg_num].column)) { return extractTimeZoneNameFromColumn(arguments[time_zone_arg_num].column.get(), arguments[time_zone_arg_num].name); } diff --git a/src/Functions/extractTimeZoneFromFunctionArguments.h b/src/Functions/extractTimeZoneFromFunctionArguments.h index 858be40def7..3c012c64c49 100644 --- a/src/Functions/extractTimeZoneFromFunctionArguments.h +++ b/src/Functions/extractTimeZoneFromFunctionArguments.h @@ -16,8 +16,16 @@ std::string extractTimeZoneNameFromColumn(const IColumn * column, const String & /// Determine working timezone either from optional argument with time zone name or from time zone in DateTime type of argument. /// Returns empty string if default time zone should be used. +/// +/// Parameter allow_nonconst_timezone_arguments toggles if non-const timezone function arguments are accepted (legacy behavior) or not. The +/// problem with the old behavior is that the timezone is part of the type, and not part of the value. This lead to confusion and unexpected +/// results. +/// - For new functions, set allow_nonconst_timezone_arguments = false. +/// - For existing functions +/// - which disallow non-const timezone arguments anyways (e.g. getArgumentsThatAreAlwaysConstant()), set allow_nonconst_timezone_arguments = false, +/// - which allow non-const timezone arguments, set allow_nonconst_timezone_arguments according to the corresponding setting. std::string extractTimeZoneNameFromFunctionArguments( - const ColumnsWithTypeAndName & arguments, size_t time_zone_arg_num, size_t datetime_arg_num); + const ColumnsWithTypeAndName & arguments, size_t time_zone_arg_num, size_t datetime_arg_num, bool allow_nonconst_timezone_arguments); const DateLUTImpl & extractTimeZoneFromFunctionArguments( const ColumnsWithTypeAndName & arguments, size_t time_zone_arg_num, size_t datetime_arg_num); diff --git a/src/Functions/fromUnixTimestamp64Micro.cpp b/src/Functions/fromUnixTimestamp64Micro.cpp index 70dcbcd1d4b..191e2137a0d 100644 --- a/src/Functions/fromUnixTimestamp64Micro.cpp +++ b/src/Functions/fromUnixTimestamp64Micro.cpp @@ -7,8 +7,8 @@ namespace DB REGISTER_FUNCTION(FromUnixTimestamp64Micro) { factory.registerFunction("fromUnixTimestamp64Micro", - [](ContextPtr){ return std::make_unique( - std::make_shared(6, "fromUnixTimestamp64Micro")); }); + [](ContextPtr context){ return std::make_unique( + std::make_shared(6, "fromUnixTimestamp64Micro", context)); }); } } diff --git a/src/Functions/fromUnixTimestamp64Milli.cpp b/src/Functions/fromUnixTimestamp64Milli.cpp index 532013dfe5f..c6d4fcd30a2 100644 --- a/src/Functions/fromUnixTimestamp64Milli.cpp +++ b/src/Functions/fromUnixTimestamp64Milli.cpp @@ -7,8 +7,8 @@ namespace DB REGISTER_FUNCTION(FromUnixTimestamp64Milli) { factory.registerFunction("fromUnixTimestamp64Milli", - [](ContextPtr){ return std::make_unique( - std::make_shared(3, "fromUnixTimestamp64Milli")); }); + [](ContextPtr context){ return std::make_unique( + std::make_shared(3, "fromUnixTimestamp64Milli", context)); }); } } diff --git a/src/Functions/fromUnixTimestamp64Nano.cpp b/src/Functions/fromUnixTimestamp64Nano.cpp index 96afdda0fa8..2b5a7addbfc 100644 --- a/src/Functions/fromUnixTimestamp64Nano.cpp +++ b/src/Functions/fromUnixTimestamp64Nano.cpp @@ -7,8 +7,8 @@ namespace DB REGISTER_FUNCTION(FromUnixTimestamp64Nano) { factory.registerFunction("fromUnixTimestamp64Nano", - [](ContextPtr){ return std::make_unique( - std::make_shared(9, "fromUnixTimestamp64Nano")); }); + [](ContextPtr context){ return std::make_unique( + std::make_shared(9, "fromUnixTimestamp64Nano", context)); }); } } diff --git a/src/Functions/now.cpp b/src/Functions/now.cpp index 3c3bff1524f..d3a94379a61 100644 --- a/src/Functions/now.cpp +++ b/src/Functions/now.cpp @@ -5,6 +5,7 @@ #include #include #include +#include namespace DB { @@ -87,7 +88,10 @@ public: bool isVariadic() const override { return true; } size_t getNumberOfArguments() const override { return 0; } - static FunctionOverloadResolverPtr create(ContextPtr) { return std::make_unique(); } + static FunctionOverloadResolverPtr create(ContextPtr context) { return std::make_unique(context); } + explicit NowOverloadResolver(ContextPtr context) + : allow_nonconst_timezone_arguments(context->getSettings().allow_nonconst_timezone_arguments) + {} DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override { @@ -102,7 +106,7 @@ public: } if (arguments.size() == 1) { - return std::make_shared(extractTimeZoneNameFromFunctionArguments(arguments, 0, 0)); + return std::make_shared(extractTimeZoneNameFromFunctionArguments(arguments, 0, 0, allow_nonconst_timezone_arguments)); } return std::make_shared(); } @@ -121,10 +125,12 @@ public: if (arguments.size() == 1) return std::make_unique( time(nullptr), DataTypes{arguments.front().type}, - std::make_shared(extractTimeZoneNameFromFunctionArguments(arguments, 0, 0))); + std::make_shared(extractTimeZoneNameFromFunctionArguments(arguments, 0, 0, allow_nonconst_timezone_arguments))); return std::make_unique(time(nullptr), DataTypes(), std::make_shared()); } +private: + const bool allow_nonconst_timezone_arguments; }; } diff --git a/src/Functions/now64.cpp b/src/Functions/now64.cpp index f29b73061d9..349b8c71145 100644 --- a/src/Functions/now64.cpp +++ b/src/Functions/now64.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include @@ -115,7 +116,10 @@ public: bool isVariadic() const override { return true; } size_t getNumberOfArguments() const override { return 0; } - static FunctionOverloadResolverPtr create(ContextPtr) { return std::make_unique(); } + static FunctionOverloadResolverPtr create(ContextPtr context) { return std::make_unique(context); } + explicit Now64OverloadResolver(ContextPtr context) + : allow_nonconst_timezone_arguments(context->getSettings().allow_nonconst_timezone_arguments) + {} DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override { @@ -137,7 +141,7 @@ public: } if (arguments.size() == 2) { - timezone_name = extractTimeZoneNameFromFunctionArguments(arguments, 1, 0); + timezone_name = extractTimeZoneNameFromFunctionArguments(arguments, 1, 0, allow_nonconst_timezone_arguments); } return std::make_shared(scale, timezone_name); @@ -157,6 +161,8 @@ public: return std::make_unique(nowSubsecond(scale), std::move(arg_types), result_type); } +private: + const bool allow_nonconst_timezone_arguments; }; } diff --git a/src/Functions/nowInBlock.cpp b/src/Functions/nowInBlock.cpp index dfb3ed7c34a..0d5f9c45780 100644 --- a/src/Functions/nowInBlock.cpp +++ b/src/Functions/nowInBlock.cpp @@ -4,6 +4,7 @@ #include #include #include +#include namespace DB @@ -25,10 +26,13 @@ class FunctionNowInBlock : public IFunction { public: static constexpr auto name = "nowInBlock"; - static FunctionPtr create(ContextPtr) + static FunctionPtr create(ContextPtr context) { - return std::make_shared(); + return std::make_shared(context); } + explicit FunctionNowInBlock(ContextPtr context) + : allow_nonconst_timezone_arguments(context->getSettings().allow_nonconst_timezone_arguments) + {} String getName() const override { @@ -68,7 +72,7 @@ public: } if (arguments.size() == 1) { - return std::make_shared(extractTimeZoneNameFromFunctionArguments(arguments, 0, 0)); + return std::make_shared(extractTimeZoneNameFromFunctionArguments(arguments, 0, 0, allow_nonconst_timezone_arguments)); } return std::make_shared(); } @@ -77,6 +81,8 @@ public: { return ColumnDateTime::create(input_rows_count, static_cast(time(nullptr))); } +private: + const bool allow_nonconst_timezone_arguments; }; } diff --git a/src/Functions/snowflake.cpp b/src/Functions/snowflake.cpp index 4849d6512ca..ca78945acb9 100644 --- a/src/Functions/snowflake.cpp +++ b/src/Functions/snowflake.cpp @@ -21,14 +21,14 @@ REGISTER_FUNCTION(DateTime64ToSnowflake) REGISTER_FUNCTION(SnowflakeToDateTime) { factory.registerFunction("snowflakeToDateTime", - [](ContextPtr){ return std::make_unique( - std::make_shared("snowflakeToDateTime")); }); + [](ContextPtr context ){ return std::make_unique( + std::make_shared("snowflakeToDateTime", context)); }); } REGISTER_FUNCTION(SnowflakeToDateTime64) { factory.registerFunction("snowflakeToDateTime64", - [](ContextPtr){ return std::make_unique( - std::make_shared("snowflakeToDateTime64")); }); + [](ContextPtr context){ return std::make_unique( + std::make_shared("snowflakeToDateTime64", context)); }); } } diff --git a/src/Functions/timeSlots.cpp b/src/Functions/timeSlots.cpp index 568ab5e5a47..040495ab023 100644 --- a/src/Functions/timeSlots.cpp +++ b/src/Functions/timeSlots.cpp @@ -270,14 +270,14 @@ public: /// Note that there is no explicit time zone argument for this function (we specify 2 as an argument number with explicit time zone). if (WhichDataType(arguments[0].type).isDateTime()) { - return std::make_shared(std::make_shared(extractTimeZoneNameFromFunctionArguments(arguments, 3, 0))); + return std::make_shared(std::make_shared(extractTimeZoneNameFromFunctionArguments(arguments, 3, 0, false))); } else { auto start_time_scale = assert_cast(*arguments[0].type).getScale(); auto duration_scale = assert_cast(*arguments[1].type).getScale(); return std::make_shared( - std::make_shared(std::max(start_time_scale, duration_scale), extractTimeZoneNameFromFunctionArguments(arguments, 3, 0))); + std::make_shared(std::max(start_time_scale, duration_scale), extractTimeZoneNameFromFunctionArguments(arguments, 3, 0, false))); } } diff --git a/src/Functions/toStartOfInterval.cpp b/src/Functions/toStartOfInterval.cpp index c0220f1aed2..649242d0d86 100644 --- a/src/Functions/toStartOfInterval.cpp +++ b/src/Functions/toStartOfInterval.cpp @@ -384,7 +384,7 @@ public: if (result_type_is_date) return std::make_shared(); else if (result_type_is_datetime) - return std::make_shared(extractTimeZoneNameFromFunctionArguments(arguments, 2, 0)); + return std::make_shared(extractTimeZoneNameFromFunctionArguments(arguments, 2, 0, false)); else { auto scale = 0; @@ -396,7 +396,7 @@ public: else if (interval_type->getKind() == IntervalKind::Millisecond) scale = 3; - return std::make_shared(scale, extractTimeZoneNameFromFunctionArguments(arguments, 2, 0)); + return std::make_shared(scale, extractTimeZoneNameFromFunctionArguments(arguments, 2, 0, false)); } } diff --git a/src/Functions/toTimezone.cpp b/src/Functions/toTimezone.cpp index 0a54e5a86b7..a0d90351898 100644 --- a/src/Functions/toTimezone.cpp +++ b/src/Functions/toTimezone.cpp @@ -5,6 +5,8 @@ #include #include +#include + #include #include @@ -84,7 +86,10 @@ public: String getName() const override { return name; } size_t getNumberOfArguments() const override { return 2; } - static FunctionOverloadResolverPtr create(ContextPtr) { return std::make_unique(); } + static FunctionOverloadResolverPtr create(ContextPtr context) { return std::make_unique(context); } + explicit ToTimeZoneOverloadResolver(ContextPtr context) + : allow_nonconst_timezone_arguments(context->getSettings().allow_nonconst_timezone_arguments) + {} DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override { @@ -98,7 +103,7 @@ public: throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument of function {}. " "Should be DateTime or DateTime64", arguments[0].type->getName(), getName()); - String time_zone_name = extractTimeZoneNameFromFunctionArguments(arguments, 1, 0); + String time_zone_name = extractTimeZoneNameFromFunctionArguments(arguments, 1, 0, allow_nonconst_timezone_arguments); if (which_type.isDateTime()) return std::make_shared(time_zone_name); @@ -119,6 +124,8 @@ public: return std::make_unique(is_constant_timezone, data_types, result_type); } +private: + const bool allow_nonconst_timezone_arguments; }; } diff --git a/tests/queries/0_stateless/00515_enhanced_time_zones.reference b/tests/queries/0_stateless/00515_enhanced_time_zones.reference index ad0dae35c45..2ee2c3eac81 100644 --- a/tests/queries/0_stateless/00515_enhanced_time_zones.reference +++ b/tests/queries/0_stateless/00515_enhanced_time_zones.reference @@ -16,3 +16,10 @@ 2017-11-05 08:07:47 2017-11-05 10:37:47 2017-11-05 10:37:47 +-- Test const/non-const timezone arguments -- +Asia/Kolkata 2017-11-05 08:07:47 +42 Asia/Kolkata 1970-01-01 00:00:00.042 +42 Asia/Kolkata 1970-01-01 00:00:00.000042 +42 Asia/Kolkata 1970-01-01 00:00:00.000000042 +42 Asia/Kolkata 2010-11-04 01:42:54 +42 Asia/Kolkata 2010-11-04 01:42:54.657 diff --git a/tests/queries/0_stateless/00515_enhanced_time_zones.sql b/tests/queries/0_stateless/00515_enhanced_time_zones.sql index f719ff70d7a..7659b6e4603 100644 --- a/tests/queries/0_stateless/00515_enhanced_time_zones.sql +++ b/tests/queries/0_stateless/00515_enhanced_time_zones.sql @@ -21,16 +21,46 @@ SELECT toString(toDateTime('2017-11-05 08:07:47', 'Asia/Istanbul')); SELECT toString(toTimeZone(toDateTime('2017-11-05 08:07:47', 'Asia/Istanbul'), 'Asia/Kolkata')); SELECT toString(toDateTime('2017-11-05 08:07:47', 'Asia/Istanbul'), 'Asia/Kolkata'); -SELECT toTimeZone(dt, tz) FROM ( - SELECT toDateTime('2017-11-05 08:07:47', 'Asia/Istanbul') AS dt, arrayJoin(['Asia/Kolkata', 'UTC']) AS tz -); -- { serverError ILLEGAL_COLUMN } -SELECT materialize('Asia/Kolkata') t, toTimeZone(toDateTime('2017-11-05 08:07:47', 'Asia/Istanbul'), t); -- { serverError ILLEGAL_COLUMN } +SELECT '-- Test const/non-const timezone arguments --'; -CREATE TEMPORARY TABLE tmp AS SELECT arrayJoin(['Europe/Istanbul', 'Asia/Istanbul']); -SELECT toTimeZone(now(), (*,).1) FROM tmp; -- { serverError ILLEGAL_COLUMN } -SELECT now((*,).1) FROM tmp; -- { serverError ILLEGAL_COLUMN } -SELECT now64(1, (*,).1) FROM tmp; -- { serverError ILLEGAL_COLUMN } -SELECT toStartOfInterval(now(), INTERVAL 3 HOUR, (*,).1) FROM tmp; -- { serverError ILLEGAL_COLUMN } -SELECT snowflakeToDateTime(toInt64(123), (*,).1) FROM tmp; -- { serverError ILLEGAL_COLUMN } -SELECT toUnixTimestamp(now(), (*,).1) FROM tmp; -- { serverError ILLEGAL_COLUMN } -SELECT toDateTimeOrDefault('2023-04-12 16:43:32', (*,).1, now()) FROM tmp; -- { serverError ILLEGAL_COLUMN } +SELECT materialize('Asia/Kolkata') tz, toTimeZone(toDateTime('2017-11-05 08:07:47', 'Asia/Istanbul'), tz) SETTINGS allow_nonconst_timezone_arguments = 0; -- { serverError ILLEGAL_COLUMN } +SELECT materialize('Asia/Kolkata') tz, toTimeZone(toDateTime('2017-11-05 08:07:47', 'Asia/Istanbul'), tz) SETTINGS allow_nonconst_timezone_arguments = 1; + +SELECT materialize('Asia/Kolkata') tz, now(tz) SETTINGS allow_nonconst_timezone_arguments = 0; -- { serverError ILLEGAL_COLUMN } +-- SELECT materialize('Asia/Kolkata') tz, now(tz) SETTINGS allow_nonconst_timezone_arguments = 1; + +SELECT materialize('Asia/Kolkata') tz, now64(9, tz) SETTINGS allow_nonconst_timezone_arguments = 0; -- { serverError ILLEGAL_COLUMN } +-- SELECT materialize('Asia/Kolkata') tz, now64(9, tz) SETTINGS allow_nonconst_timezone_arguments = 1; + +SELECT materialize('Asia/Kolkata') tz, nowInBlock(tz) SETTINGS allow_nonconst_timezone_arguments = 0; -- { serverError ILLEGAL_COLUMN } +-- SELECT materialize('Asia/Kolkata') tz, nowInBlock(tz) SETTINGS allow_nonconst_timezone_arguments = 1; + +SELECT materialize(42::Int64) ts, materialize('Asia/Kolkata') tz, fromUnixTimestamp64Milli(ts, tz) SETTINGS allow_nonconst_timezone_arguments = 0; -- { serverError ILLEGAL_COLUMN } +SELECT materialize(42::Int64) ts, materialize('Asia/Kolkata') tz, fromUnixTimestamp64Milli(ts, tz) SETTINGS allow_nonconst_timezone_arguments = 1; + +SELECT materialize(42::Int64) ts, materialize('Asia/Kolkata') tz, fromUnixTimestamp64Micro(ts, tz) SETTINGS allow_nonconst_timezone_arguments = 0; -- { serverError ILLEGAL_COLUMN } +SELECT materialize(42::Int64) ts, materialize('Asia/Kolkata') tz, fromUnixTimestamp64Micro(ts, tz) SETTINGS allow_nonconst_timezone_arguments = 1; + +SELECT materialize(42::Int64) ts, materialize('Asia/Kolkata') tz, fromUnixTimestamp64Nano(ts, tz) SETTINGS allow_nonconst_timezone_arguments = 0; -- { serverError ILLEGAL_COLUMN } +SELECT materialize(42::Int64) ts, materialize('Asia/Kolkata') tz, fromUnixTimestamp64Nano(ts, tz) SETTINGS allow_nonconst_timezone_arguments = 1; + +SELECT materialize(42::Int64) ts, materialize('Asia/Kolkata') tz, snowflakeToDateTime(ts, tz) settings allow_nonconst_timezone_arguments = 0; -- { serverError ILLEGAL_COLUMN } +SELECT materialize(42::Int64) ts, materialize('Asia/Kolkata') tz, snowflakeToDateTime(ts, tz) settings allow_nonconst_timezone_arguments = 1; + +SELECT materialize(42::Int64) ts, materialize('Asia/Kolkata') tz, snowflakeToDateTime64(ts, tz) settings allow_nonconst_timezone_arguments = 0; -- { serverError ILLEGAL_COLUMN } +SELECT materialize(42::Int64) ts, materialize('Asia/Kolkata') tz, snowflakeToDateTime64(ts, tz) settings allow_nonconst_timezone_arguments = 1; + +-- test for a related bug: + +DROP TABLE IF EXISTS tab; + +SET allow_nonconst_timezone_arguments = 1; + +CREATE TABLE tab (`country` LowCardinality(FixedString(7)) DEFAULT 'unknown', `city` LowCardinality(String) DEFAULT 'unknown', `region` LowCardinality(String) DEFAULT 'unknown', `continent` LowCardinality(FixedString(7)) DEFAULT 'unknown', `is_eu_country` Bool, `date` DateTime CODEC(DoubleDelta, LZ4), `viewer_date` DateTime ALIAS toTimezone(date, timezone), `device_browser` LowCardinality(String) DEFAULT 'unknown', `metro_code` LowCardinality(String) DEFAULT 'unknown', `domain` String DEFAULT 'unknown', `device_platform` LowCardinality(String) DEFAULT 'unknown', `device_type` LowCardinality(String) DEFAULT 'unknown', `device_vendor` LowCardinality(String) DEFAULT 'unknown', `ip` FixedString(39) DEFAULT 'unknown', `lat` Decimal(8, 6) CODEC(T64), `lng` Decimal(9, 6) CODEC(T64), `asset_id` String DEFAULT 'unknown', `is_personalized` Bool, `metric` String, `origin` String DEFAULT 'unknown', `product_id` UInt64 CODEC(T64), `referer` String DEFAULT 'unknown', `server_side` Int8 CODEC(T64), `third_party_id` String DEFAULT 'unknown', `partner_slug` LowCardinality(FixedString(10)) DEFAULT 'unknown', `user_agent` String DEFAULT 'unknown', `user_id` UUID, `zip` FixedString(10) DEFAULT 'unknown', `timezone` LowCardinality(String), `as_organization` LowCardinality(String) DEFAULT 'unknown', `content_cat` Array(String), `playback_method` LowCardinality(String) DEFAULT 'unknown', `store_id` LowCardinality(String) DEFAULT 'unknown', `store_url` String DEFAULT 'unknown', `timestamp` Nullable(DateTime), `ad_count` Int8 CODEC(T64), `ad_type` LowCardinality(FixedString(10)) DEFAULT 'unknown', `ad_categories` Array(FixedString(8)), `blocked_ad_categories` Array(FixedString(8)), `break_max_ad_length` Int8 CODEC(T64), `break_max_ads` Int8 CODEC(T64), `break_max_duration` Int8 CODEC(T64), `break_min_ad_length` Int8 CODEC(T64), `break_position` LowCardinality(FixedString(18)) DEFAULT 'unknown', `media_playhead` String DEFAULT 'unknown', `placement_type` Int8 CODEC(T64), `transaction_id` String, `universal_ad_id` Array(String), `client_ua` LowCardinality(String) DEFAULT 'unknown', `device_ip` FixedString(39) DEFAULT 'unknown', `device_ua` LowCardinality(String) DEFAULT 'unknown', `ifa` String, `ifa_type` LowCardinality(String) DEFAULT 'unknown', `vast_lat` Decimal(8, 6) CODEC(T64), `vast_long` Decimal(9, 6) CODEC(T64), `server_ua` String DEFAULT 'unknown', `app_bundle` String DEFAULT 'unknown', `page_url` String DEFAULT 'unknown', `api_framework` Array(UInt8), `click_type` LowCardinality(String), `extensions` Array(String), `media_mime` Array(String), `om_id_partner` LowCardinality(String) DEFAULT 'unknown', `player_capabilities` Array(FixedString(12)), `vast_versions` Array(UInt8), `verification_vendors` Array(String), `ad_play_head` String DEFAULT 'unknown', `ad_serving_id` String DEFAULT 'unknown', `asset_uri` String DEFAULT 'unknown', `content_id` String DEFAULT 'unknown', `content_uri` String DEFAULT 'unknown', `inventory_state` Array(FixedString(14)), `player_size` Array(UInt8), `player_state` Array(FixedString(12)), `pod_sequence` Int8 CODEC(T64), `click_position` Array(UInt32), `error_code` Int16 CODEC(T64), `error_reason` Int8 CODEC(T64), `gdpr_consent` String DEFAULT 'unknown', `limited_tracking` Bool, `regulations` String DEFAULT 'unknown', `content_category` Array(String), PROJECTION projection_TPAG_VAST_date (SELECT * ORDER BY toYYYYMMDD(date), metric, product_id, asset_id)) ENGINE = MergeTree ORDER BY (product_id, metric, asset_id, toYYYYMMDD(date)); + +DETACH TABLE tab; + +ATTACH TABLE tab SETTINGS allow_nonconst_timezone_arguments = 0; -- { serverError ILLEGAL_COLUMN } +ATTACH TABLE tab SETTINGS allow_nonconst_timezone_arguments = 1; + +DROP TABLE tab; From ffb941624bc971886212e0745716e79688a154a1 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Sat, 10 Jun 2023 17:01:44 +0000 Subject: [PATCH 0748/1072] Exclude some tests with QPL from fasttest --- .../0_stateless/00804_test_alter_compression_codecs.sql | 3 +++ .../0_stateless/00804_test_custom_compression_codecs.sql | 3 +++ .../00804_test_custom_compression_codes_log_storages.sql | 3 +++ .../0_stateless/00804_test_deflate_qpl_codec_compression.sql | 3 +++ 4 files changed, 12 insertions(+) diff --git a/tests/queries/0_stateless/00804_test_alter_compression_codecs.sql b/tests/queries/0_stateless/00804_test_alter_compression_codecs.sql index fd9855e82d3..eb1abda9a21 100644 --- a/tests/queries/0_stateless/00804_test_alter_compression_codecs.sql +++ b/tests/queries/0_stateless/00804_test_alter_compression_codecs.sql @@ -1,3 +1,6 @@ +--Tags: no-fasttest +-- no-fasttest because DEFLATE_QPL isn't available in fasttest + SET send_logs_level = 'fatal'; DROP TABLE IF EXISTS alter_compression_codec; diff --git a/tests/queries/0_stateless/00804_test_custom_compression_codecs.sql b/tests/queries/0_stateless/00804_test_custom_compression_codecs.sql index 89e77f758a7..df74620a201 100644 --- a/tests/queries/0_stateless/00804_test_custom_compression_codecs.sql +++ b/tests/queries/0_stateless/00804_test_custom_compression_codecs.sql @@ -1,3 +1,6 @@ +--Tags: no-fasttest +-- no-fasttest because DEFLATE_QPL isn't available in fasttest + SET send_logs_level = 'fatal'; SET allow_suspicious_codecs = 1; SET enable_deflate_qpl_codec = 1; diff --git a/tests/queries/0_stateless/00804_test_custom_compression_codes_log_storages.sql b/tests/queries/0_stateless/00804_test_custom_compression_codes_log_storages.sql index a629df2666d..67c0074c58f 100644 --- a/tests/queries/0_stateless/00804_test_custom_compression_codes_log_storages.sql +++ b/tests/queries/0_stateless/00804_test_custom_compression_codes_log_storages.sql @@ -1,3 +1,6 @@ +--Tags: no-fasttest +-- no-fasttest because DEFLATE_QPL isn't available in fasttest + SET send_logs_level = 'fatal'; SET allow_suspicious_codecs = 1; SET enable_deflate_qpl_codec = 1; diff --git a/tests/queries/0_stateless/00804_test_deflate_qpl_codec_compression.sql b/tests/queries/0_stateless/00804_test_deflate_qpl_codec_compression.sql index 5a56fc0d576..a46272112a9 100644 --- a/tests/queries/0_stateless/00804_test_deflate_qpl_codec_compression.sql +++ b/tests/queries/0_stateless/00804_test_deflate_qpl_codec_compression.sql @@ -1,3 +1,6 @@ +--Tags: no-fasttest +-- no-fasttest because DEFLATE_QPL isn't available in fasttest + SET send_logs_level = 'fatal'; SET enable_deflate_qpl_codec = 1; From a3da7c8ebe33977cacf6f67f5b1a75833de9aa64 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Sat, 10 Jun 2023 17:20:29 +0000 Subject: [PATCH 0749/1072] Merged NuRaft --- contrib/NuRaft | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/NuRaft b/contrib/NuRaft index 8f267da1a91..491eaf592d9 160000 --- a/contrib/NuRaft +++ b/contrib/NuRaft @@ -1 +1 @@ -Subproject commit 8f267da1a91310bd152af755b0178cfd38c646c7 +Subproject commit 491eaf592d950e0e37accbe8b3f217e068c9fecf From 296b11a1aeaa069b56fb2befd3b933c984b20a1b Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Sat, 10 Jun 2023 20:56:54 +0000 Subject: [PATCH 0750/1072] Update version_date.tsv and changelogs after v23.5.2.7-stable --- docker/keeper/Dockerfile | 2 +- docker/server/Dockerfile.alpine | 2 +- docker/server/Dockerfile.ubuntu | 2 +- docs/changelogs/v23.5.2.7-stable.md | 18 ++++++++++++++++++ utils/list-versions/version_date.tsv | 1 + 5 files changed, 22 insertions(+), 3 deletions(-) create mode 100644 docs/changelogs/v23.5.2.7-stable.md diff --git a/docker/keeper/Dockerfile b/docker/keeper/Dockerfile index 7190ef4d649..44967af4b32 100644 --- a/docker/keeper/Dockerfile +++ b/docker/keeper/Dockerfile @@ -32,7 +32,7 @@ RUN arch=${TARGETARCH:-amd64} \ esac ARG REPOSITORY="https://s3.amazonaws.com/clickhouse-builds/22.4/31c367d3cd3aefd316778601ff6565119fe36682/package_release" -ARG VERSION="23.5.1.3174" +ARG VERSION="23.5.2.7" ARG PACKAGES="clickhouse-keeper" # user/group precreated explicitly with fixed uid/gid on purpose. diff --git a/docker/server/Dockerfile.alpine b/docker/server/Dockerfile.alpine index ca966b16a2d..8ab9bf7b077 100644 --- a/docker/server/Dockerfile.alpine +++ b/docker/server/Dockerfile.alpine @@ -33,7 +33,7 @@ RUN arch=${TARGETARCH:-amd64} \ # lts / testing / prestable / etc ARG REPO_CHANNEL="stable" ARG REPOSITORY="https://packages.clickhouse.com/tgz/${REPO_CHANNEL}" -ARG VERSION="23.5.1.3174" +ARG VERSION="23.5.2.7" ARG PACKAGES="clickhouse-client clickhouse-server clickhouse-common-static" # user/group precreated explicitly with fixed uid/gid on purpose. diff --git a/docker/server/Dockerfile.ubuntu b/docker/server/Dockerfile.ubuntu index c82ac592120..b3b0cfe1510 100644 --- a/docker/server/Dockerfile.ubuntu +++ b/docker/server/Dockerfile.ubuntu @@ -22,7 +22,7 @@ RUN sed -i "s|http://archive.ubuntu.com|${apt_archive}|g" /etc/apt/sources.list ARG REPO_CHANNEL="stable" ARG REPOSITORY="deb https://packages.clickhouse.com/deb ${REPO_CHANNEL} main" -ARG VERSION="23.5.1.3174" +ARG VERSION="23.5.2.7" ARG PACKAGES="clickhouse-client clickhouse-server clickhouse-common-static" # set non-empty deb_location_url url to create a docker image diff --git a/docs/changelogs/v23.5.2.7-stable.md b/docs/changelogs/v23.5.2.7-stable.md new file mode 100644 index 00000000000..2e4931c64e0 --- /dev/null +++ b/docs/changelogs/v23.5.2.7-stable.md @@ -0,0 +1,18 @@ +--- +sidebar_position: 1 +sidebar_label: 2023 +--- + +# 2023 Changelog + +### ClickHouse release v23.5.2.7-stable (5751aa1ab9f) FIXME as compared to v23.5.1.3174-stable (2fec796e73e) + +#### Bug Fix (user-visible misbehavior in an official stable release) + +* Do not read all the columns from right GLOBAL JOIN table. [#50721](https://github.com/ClickHouse/ClickHouse/pull/50721) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). + +#### NOT FOR CHANGELOG / INSIGNIFICANT + +* Fix build for aarch64 (temporary disable azure) [#50770](https://github.com/ClickHouse/ClickHouse/pull/50770) ([alesapin](https://github.com/alesapin)). +* Rename azure_blob_storage to azureBlobStorage [#50812](https://github.com/ClickHouse/ClickHouse/pull/50812) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). + diff --git a/utils/list-versions/version_date.tsv b/utils/list-versions/version_date.tsv index 2aeeb5db35c..4647bcb4af1 100644 --- a/utils/list-versions/version_date.tsv +++ b/utils/list-versions/version_date.tsv @@ -1,3 +1,4 @@ +v23.5.2.7-stable 2023-06-10 v23.5.1.3174-stable 2023-06-09 v23.4.2.11-stable 2023-05-02 v23.4.1.1943-stable 2023-04-27 From 78c32a204ce656c278433fed92fb535584b8ee3b Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Sat, 10 Jun 2023 23:00:59 +0200 Subject: [PATCH 0751/1072] Updated docs for azureBlobStorage Table function & engine --- .../integrations/azureBlobStorage.md | 28 ++++++++ .../table-functions/azureBlobStorage.md | 70 +++++++++++++++++++ .../table-functions/azure_blob_storage.md | 11 --- 3 files changed, 98 insertions(+), 11 deletions(-) create mode 100644 docs/en/engines/table-engines/integrations/azureBlobStorage.md create mode 100644 docs/en/sql-reference/table-functions/azureBlobStorage.md delete mode 100644 docs/en/sql-reference/table-functions/azure_blob_storage.md diff --git a/docs/en/engines/table-engines/integrations/azureBlobStorage.md b/docs/en/engines/table-engines/integrations/azureBlobStorage.md new file mode 100644 index 00000000000..b1c6169592b --- /dev/null +++ b/docs/en/engines/table-engines/integrations/azureBlobStorage.md @@ -0,0 +1,28 @@ +--- +slug: /en/engines/table-engines/integrations/azureBlobStorage +sidebar_position: 7 +sidebar_label: AzureBlobStorage +--- + +# AzureBlobStorage Table Engine + +This engine provides integration with [Azure Blob Storage](https://azure.microsoft.com/en-us/products/storage/blobs) ecosystem. + +## Create Table {#creating-a-table} + +``` sql +CREATE TABLE azure_blob_storage_table (name String, value UInt32) + ENGINE = AzureBlobStorage(connection_string|storage_account_url, container_name, blobpath, [account_name, account_key, format, compression]) + [PARTITION BY expr] + [SETTINGS ...] +``` + +**Engine parameters** + +- `connection_string|storage_account_url` — connection_string includes account name & key ([Create connection string](https://learn.microsoft.com/en-us/azure/storage/common/storage-configure-connection-string?toc=%2Fazure%2Fstorage%2Fblobs%2Ftoc.json&bc=%2Fazure%2Fstorage%2Fblobs%2Fbreadcrumb%2Ftoc.json#configure-a-connection-string-for-an-azure-storage-account)) or you could also provide the storage account url here and account name & account key as separate parameters (see parameters account_name & account_key) +- `container_name` - Container name +- `blobpath` - file path. Supports following wildcards in readonly mode: `*`, `?`, `{abc,def}` and `{N..M}` where `N`, `M` — numbers, `'abc'`, `'def'` — strings. +- `account_name` - if storage_account_url is used, then account name can be specified here +- `account_key` - if storage_account_url is used, then account key can be specified here +- `format` — The [format](../../interfaces/formats.md#formats) of the file. +- `compression` — Supported values: `none`, `gzip/gz`, `brotli/br`, `xz/LZMA`, `zstd/zst`. By default, it will autodetect compression by file extension. (same as setting to `auto`). diff --git a/docs/en/sql-reference/table-functions/azureBlobStorage.md b/docs/en/sql-reference/table-functions/azureBlobStorage.md new file mode 100644 index 00000000000..f8a9016bd15 --- /dev/null +++ b/docs/en/sql-reference/table-functions/azureBlobStorage.md @@ -0,0 +1,70 @@ +--- +slug: /en/sql-reference/table-functions/azure_blob_storage +sidebar_position: 45 +sidebar_label: azure_blob_storage +keywords: [azure blob storage] +--- + +# azure\_blob\_storage Table Function + +Provides a table-like interface to select/insert files in [Azure Blob Storage](https://azure.microsoft.com/en-us/products/storage/blobs). This table function is similar to the [s3 function](../../sql-reference/table-functions/s3.md). + +**Syntax** + +``` sql +azureBlobStorage(- connection_string|storage_account_url, container_name, blobpath, [account_name, account_key, format, compression, structure]) +``` + +**Arguments** + +- `connection_string|storage_account_url` — connection_string includes account name & key ([Create connection string](https://learn.microsoft.com/en-us/azure/storage/common/storage-configure-connection-string?toc=%2Fazure%2Fstorage%2Fblobs%2Ftoc.json&bc=%2Fazure%2Fstorage%2Fblobs%2Fbreadcrumb%2Ftoc.json#configure-a-connection-string-for-an-azure-storage-account)) or you could also provide the storage account url here and account name & account key as separate parameters (see parameters account_name & account_key) +- `container_name` - Container name +- `blobpath` - file path. Supports following wildcards in readonly mode: `*`, `?`, `{abc,def}` and `{N..M}` where `N`, `M` — numbers, `'abc'`, `'def'` — strings. +- `account_name` - if storage_account_url is used, then account name can be specified here +- `account_key` - if storage_account_url is used, then account key can be specified here +- `format` — The [format](../../interfaces/formats.md#formats) of the file. +- `compression` — Supported values: `none`, `gzip/gz`, `brotli/br`, `xz/LZMA`, `zstd/zst`. By default, it will autodetect compression by file extension. (same as setting to `auto`). +- `structure` — Structure of the table. Format `'column1_name column1_type, column2_name column2_type, ...'`. + +**Returned value** + +A table with the specified structure for reading or writing data in the specified file. + +**Examples** + +Write data into azure blob storage using the following : + +```sql +INSERT INTO TABLE FUNCTION azureBlobStorage('http://azurite1:10000/devstoreaccount1', + 'test_container', 'test_{_partition_id}.csv', 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', + 'CSV', 'auto', 'column1 UInt32, column2 UInt32, column3 UInt32') PARTITION BY column3 VALUES (1, 2, 3), (3, 2, 1), (78, 43, 3); +``` + +And then it can be read using + +```sql +SELECT * FROM azureBlobStorage('http://azurite1:10000/devstoreaccount1', + 'test_container', 'test_1.csv', 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', + 'CSV', 'auto', 'column1 UInt32, column2 UInt32, column3 UInt32'); +``` + +```response +┌───column1─┬────column2─┬───column3─┐ +│ 3 │ 2 │ 1 │ +└───────────┴────────────┴───────────┘ +``` + +or with storage_account_url + +```sql +SELECT count(*) FROM azureBlobStorage('DefaultEndpointsProtocol=https;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;EndPointSuffix=core.windows.net', + 'test_container', 'test_3.csv', 'CSV', 'auto' , 'column1 UInt32, column2 UInt32, column3 UInt32'); +``` + +``` text +┌─count()─┐ +│ 2 │ +└─────────┘ +``` + + \ No newline at end of file diff --git a/docs/en/sql-reference/table-functions/azure_blob_storage.md b/docs/en/sql-reference/table-functions/azure_blob_storage.md deleted file mode 100644 index 6091aab5f9d..00000000000 --- a/docs/en/sql-reference/table-functions/azure_blob_storage.md +++ /dev/null @@ -1,11 +0,0 @@ ---- -slug: /en/sql-reference/table-functions/azure_blob_storage -sidebar_position: 45 -sidebar_label: azure_blob_storage -keywords: [azure blob storage] ---- - -# azureBlobStorage Table Function - -Provides a table-like interface to select/insert files in [Azure Blob Storage](https://azure.microsoft.com/en-us/products/storage/blobs). This table function is similar to the [s3 function](../../sql-reference/table-functions/s3.md). - From 05d4baf1e7186c902c9e44fe0f16e9cbbc18e5c0 Mon Sep 17 00:00:00 2001 From: DanRoscigno Date: Sat, 10 Jun 2023 18:20:39 -0400 Subject: [PATCH 0752/1072] edits --- .../integrations/azureBlobStorage.md | 15 +++++++------ .../table-engines/integrations/index.md | 21 +------------------ .../table-functions/azureBlobStorage.md | 5 +++-- 3 files changed, 13 insertions(+), 28 deletions(-) diff --git a/docs/en/engines/table-engines/integrations/azureBlobStorage.md b/docs/en/engines/table-engines/integrations/azureBlobStorage.md index b1c6169592b..82b26e79579 100644 --- a/docs/en/engines/table-engines/integrations/azureBlobStorage.md +++ b/docs/en/engines/table-engines/integrations/azureBlobStorage.md @@ -1,14 +1,13 @@ --- slug: /en/engines/table-engines/integrations/azureBlobStorage -sidebar_position: 7 -sidebar_label: AzureBlobStorage +sidebar_label: Azure Blob Storage --- # AzureBlobStorage Table Engine -This engine provides integration with [Azure Blob Storage](https://azure.microsoft.com/en-us/products/storage/blobs) ecosystem. +This engine provides an integration with [Azure Blob Storage](https://azure.microsoft.com/en-us/products/storage/blobs) ecosystem. -## Create Table {#creating-a-table} +## Create Table ``` sql CREATE TABLE azure_blob_storage_table (name String, value UInt32) @@ -17,12 +16,16 @@ CREATE TABLE azure_blob_storage_table (name String, value UInt32) [SETTINGS ...] ``` -**Engine parameters** +### Engine parameters - `connection_string|storage_account_url` — connection_string includes account name & key ([Create connection string](https://learn.microsoft.com/en-us/azure/storage/common/storage-configure-connection-string?toc=%2Fazure%2Fstorage%2Fblobs%2Ftoc.json&bc=%2Fazure%2Fstorage%2Fblobs%2Fbreadcrumb%2Ftoc.json#configure-a-connection-string-for-an-azure-storage-account)) or you could also provide the storage account url here and account name & account key as separate parameters (see parameters account_name & account_key) - `container_name` - Container name - `blobpath` - file path. Supports following wildcards in readonly mode: `*`, `?`, `{abc,def}` and `{N..M}` where `N`, `M` — numbers, `'abc'`, `'def'` — strings. - `account_name` - if storage_account_url is used, then account name can be specified here - `account_key` - if storage_account_url is used, then account key can be specified here -- `format` — The [format](../../interfaces/formats.md#formats) of the file. +- `format` — The [format](/docs/en/interfaces/formats.md) of the file. - `compression` — Supported values: `none`, `gzip/gz`, `brotli/br`, `xz/LZMA`, `zstd/zst`. By default, it will autodetect compression by file extension. (same as setting to `auto`). + +## See also + +[Azure Blob Storage Table Function](/docs/en/sql-reference/table-functions/azureBlobStorage.md) diff --git a/docs/en/engines/table-engines/integrations/index.md b/docs/en/engines/table-engines/integrations/index.md index b321a644d32..b9171d9033b 100644 --- a/docs/en/engines/table-engines/integrations/index.md +++ b/docs/en/engines/table-engines/integrations/index.md @@ -6,24 +6,5 @@ sidebar_label: Integrations # Table Engines for Integrations -ClickHouse provides various means for integrating with external systems, including table engines. Like with all other table engines, the configuration is done using `CREATE TABLE` or `ALTER TABLE` queries. Then from a user perspective, the configured integration looks like a normal table, but queries to it are proxied to the external system. This transparent querying is one of the key advantages of this approach over alternative integration methods, like dictionaries or table functions, which require to use custom query methods on each use. +ClickHouse provides various means for integrating with external systems, including table engines. Like with all other table engines, the configuration is done using `CREATE TABLE` or `ALTER TABLE` queries. Then from a user perspective, the configured integration looks like a normal table, but queries to it are proxied to the external system. This transparent querying is one of the key advantages of this approach over alternative integration methods, like dictionaries or table functions, which require the use of custom query methods on each use. -List of supported integrations: - -- [ODBC](../../../engines/table-engines/integrations/odbc.md) -- [JDBC](../../../engines/table-engines/integrations/jdbc.md) -- [MySQL](../../../engines/table-engines/integrations/mysql.md) -- [MongoDB](../../../engines/table-engines/integrations/mongodb.md) -- [HDFS](../../../engines/table-engines/integrations/hdfs.md) -- [S3](../../../engines/table-engines/integrations/s3.md) -- [Kafka](../../../engines/table-engines/integrations/kafka.md) -- [EmbeddedRocksDB](../../../engines/table-engines/integrations/embedded-rocksdb.md) -- [RabbitMQ](../../../engines/table-engines/integrations/rabbitmq.md) -- [PostgreSQL](../../../engines/table-engines/integrations/postgresql.md) -- [SQLite](../../../engines/table-engines/integrations/sqlite.md) -- [Hive](../../../engines/table-engines/integrations/hive.md) -- [ExternalDistributed](../../../engines/table-engines/integrations/ExternalDistributed.md) -- [MaterializedPostgreSQL](../../../engines/table-engines/integrations/materialized-postgresql.md) -- [NATS](../../../engines/table-engines/integrations/nats.md) -- [DeltaLake](../../../engines/table-engines/integrations/deltalake.md) -- [Hudi](../../../engines/table-engines/integrations/hudi.md) diff --git a/docs/en/sql-reference/table-functions/azureBlobStorage.md b/docs/en/sql-reference/table-functions/azureBlobStorage.md index f8a9016bd15..b79137cb786 100644 --- a/docs/en/sql-reference/table-functions/azureBlobStorage.md +++ b/docs/en/sql-reference/table-functions/azureBlobStorage.md @@ -1,6 +1,5 @@ --- slug: /en/sql-reference/table-functions/azure_blob_storage -sidebar_position: 45 sidebar_label: azure_blob_storage keywords: [azure blob storage] --- @@ -67,4 +66,6 @@ SELECT count(*) FROM azureBlobStorage('DefaultEndpointsProtocol=https;AccountNam └─────────┘ ``` - \ No newline at end of file +**See Also** + +- [AzureBlogStorage Table Engine](/docs/en/engines/table-engines/integrations/azureBlobStorage.md) From 2146c356081fbe1b43da41ad4c739262f1db60c1 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Sun, 11 Jun 2023 07:40:39 +0000 Subject: [PATCH 0753/1072] Fix style --- src/Functions/snowflake.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Functions/snowflake.cpp b/src/Functions/snowflake.cpp index ca78945acb9..c18f1c03332 100644 --- a/src/Functions/snowflake.cpp +++ b/src/Functions/snowflake.cpp @@ -21,7 +21,7 @@ REGISTER_FUNCTION(DateTime64ToSnowflake) REGISTER_FUNCTION(SnowflakeToDateTime) { factory.registerFunction("snowflakeToDateTime", - [](ContextPtr context ){ return std::make_unique( + [](ContextPtr context){ return std::make_unique( std::make_shared("snowflakeToDateTime", context)); }); } REGISTER_FUNCTION(SnowflakeToDateTime64) From e6c2a6d13db4604442f84e8e2cacf6bc617fc42e Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Sun, 11 Jun 2023 09:55:20 +0200 Subject: [PATCH 0754/1072] Added example for table engine and fixed typos --- .../integrations/azureBlobStorage.md | 20 +++++++++++++++++++ .../table-functions/azureBlobStorage.md | 4 ++-- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/docs/en/engines/table-engines/integrations/azureBlobStorage.md b/docs/en/engines/table-engines/integrations/azureBlobStorage.md index 82b26e79579..b8e621fd513 100644 --- a/docs/en/engines/table-engines/integrations/azureBlobStorage.md +++ b/docs/en/engines/table-engines/integrations/azureBlobStorage.md @@ -26,6 +26,26 @@ CREATE TABLE azure_blob_storage_table (name String, value UInt32) - `format` — The [format](/docs/en/interfaces/formats.md) of the file. - `compression` — Supported values: `none`, `gzip/gz`, `brotli/br`, `xz/LZMA`, `zstd/zst`. By default, it will autodetect compression by file extension. (same as setting to `auto`). +**Example** + +``` sql +CREATE TABLE test_table (key UInt64, data String) + ENGINE = AzureBlobStorage('DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://azurite1:10000/devstoreaccount1/;', + 'test_container', 'test_table', 'CSV'); + +INSERT INTO test_table VALUES (1, 'a'), (2, 'b'), (3, 'c'); + +SELECT * FROM test_table; +``` + +```text +┌─key──┬─data──┐ +│ 1 │ a │ +│ 2 │ b │ +│ 3 │ c │ +└──────┴───────┘ +``` + ## See also [Azure Blob Storage Table Function](/docs/en/sql-reference/table-functions/azureBlobStorage.md) diff --git a/docs/en/sql-reference/table-functions/azureBlobStorage.md b/docs/en/sql-reference/table-functions/azureBlobStorage.md index b79137cb786..369bf7a964d 100644 --- a/docs/en/sql-reference/table-functions/azureBlobStorage.md +++ b/docs/en/sql-reference/table-functions/azureBlobStorage.md @@ -53,7 +53,7 @@ SELECT * FROM azureBlobStorage('http://azurite1:10000/devstoreaccount1', └───────────┴────────────┴───────────┘ ``` -or with storage_account_url +or using connection_string ```sql SELECT count(*) FROM azureBlobStorage('DefaultEndpointsProtocol=https;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;EndPointSuffix=core.windows.net', @@ -68,4 +68,4 @@ SELECT count(*) FROM azureBlobStorage('DefaultEndpointsProtocol=https;AccountNam **See Also** -- [AzureBlogStorage Table Engine](/docs/en/engines/table-engines/integrations/azureBlobStorage.md) +- [AzureBlobStorage Table Engine](/docs/en/engines/table-engines/integrations/azureBlobStorage.md) From e9d539f4bd72d94cef27ed7f1a8a34cd6fa08322 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Sun, 11 Jun 2023 10:05:52 +0200 Subject: [PATCH 0755/1072] Updated changelog with azureBlobStorage table function & engine entry --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7142ad26e15..72372c8fac4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,7 @@ * Setting `enable_memory_bound_merging_of_aggregation_results` is enabled by default. If you update from version prior to 22.12, we recommend to set this flag to `false` until update is finished. [#50319](https://github.com/ClickHouse/ClickHouse/pull/50319) ([Nikita Taranov](https://github.com/nickitat)). #### New Feature +* Added storage engine AzureBlobStorage and azureBlobStorage table function. The supported set of features is very similar to storage/table function S3 [#50604] (https://github.com/ClickHouse/ClickHouse/pull/50604) ([alesapin](https://github.com/alesapin)) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni). * Added native ClickHouse Keeper CLI Client, it is available as `clickhouse keeper-client` [#47414](https://github.com/ClickHouse/ClickHouse/pull/47414) ([pufit](https://github.com/pufit)). * Add `urlCluster` table function. Refactor all *Cluster table functions to reduce code duplication. Make schema inference work for all possible *Cluster function signatures and for named collections. Closes [#38499](https://github.com/ClickHouse/ClickHouse/issues/38499). [#45427](https://github.com/ClickHouse/ClickHouse/pull/45427) ([attack204](https://github.com/attack204)), Pavel Kruglov. * The query cache can now be used for production workloads. [#47977](https://github.com/ClickHouse/ClickHouse/pull/47977) ([Robert Schulze](https://github.com/rschu1ze)). The query cache can now support queries with totals and extremes modifier. [#48853](https://github.com/ClickHouse/ClickHouse/pull/48853) ([Robert Schulze](https://github.com/rschu1ze)). Make `allow_experimental_query_cache` setting as obsolete for backward-compatibility. It was removed in https://github.com/ClickHouse/ClickHouse/pull/47977. [#49934](https://github.com/ClickHouse/ClickHouse/pull/49934) ([Timur Solodovnikov](https://github.com/tsolodov)). From b4bc28b6de86c569bcdfe8d2de0e21ce1717a8c7 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 11 Jun 2023 16:48:29 +0300 Subject: [PATCH 0756/1072] Update easy_tasks_sorted_ru.md --- tests/instructions/easy_tasks_sorted_ru.md | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/tests/instructions/easy_tasks_sorted_ru.md b/tests/instructions/easy_tasks_sorted_ru.md index a98a5766ffe..09ea48d0bd9 100644 --- a/tests/instructions/easy_tasks_sorted_ru.md +++ b/tests/instructions/easy_tasks_sorted_ru.md @@ -6,7 +6,7 @@ Делаем `chmod 000 /etc/clickhouse-client/config.xml` и смотрим, что получится. -## Уменьшать max_memory_usage, если на сервере мало оперативки. +## + Уменьшать max_memory_usage, если на сервере мало оперативки. Смотрим, сколько на сервере оперативки. Если `max_memory_usage`, `max_memory_usage_for_all_queries` ограничены, но больше 90% (настройка) от имеющейся оперативки, то уменьшать их и выводить предупреждение в лог.. @@ -42,10 +42,12 @@ void memoryBitAnd(const char * a, const char * b, char * result, size_t size); В ClickHouse есть возможность указать collation для сортировки строк. Это не работает для `Nullable(String)`. -## Запретить чтение значений типа AggregateFunction по-умолчанию и добавить настройку. +## + Запретить чтение значений типа AggregateFunction по-умолчанию и добавить настройку. Состояния агрегатных функций могут быть записаны в дамп и считаны из него. Но десериализация состояний агрегатных функций небезопасна. Аккуратно выбранные пользовательские данные могут привести к segfault или порче памяти. Поэтому нужно просто сделать настройку, которая запрещает читать AggregateFunction из пользовательских данных. +Upd: сделали по-другому: теперь всё безопасно. + ## + В статистику jemalloc добавить информацию по arenas. В `system.asynchronous_metrics` - суммарный размер арен. @@ -56,9 +58,9 @@ void memoryBitAnd(const char * a, const char * b, char * result, size_t size); Как cache, но без кэша — всегда прямой запрос в источник. -## Функции randomFixedString, randomBinaryString, fuzzBits, fuzzBytes. +## + Функции randomFixedString, randomBinaryString, fuzzBits, fuzzBytes. -## Агрегатные функции для статистических тестов (e.g. тест нормальности распределения) и статистик. +## + Агрегатные функции для статистических тестов (e.g. тест нормальности распределения) и статистик. ## + Функции создания и обновления состояния агрегатной функции по одному кортежу аргументов. @@ -119,11 +121,11 @@ position с конца строки. Добавляем счётчики всех ошибок (ErrorCodes) по аналогии с ProfileEvents. Кроме количества запоминаем также время последней ошибки, стек трейс, сообщение. Добавляем системную таблицу system.errors. Отправка в Graphite. -## Добавить Lizard, LZSSE и density в качестве вариантов алгоритмов сжатия. +## + Добавить Lizard, LZSSE и density в качестве вариантов алгоритмов сжатия. Экспериментальные алгоритмы сжатия. Сейчас ClickHouse поддерживает только lz4 и zstd. -## Запрос CREATE OR REPLACE TABLE +## + Запрос CREATE OR REPLACE TABLE Атомарно удаляет таблицу перед созданием новой, если такая была. @@ -149,12 +151,16 @@ https://clickhouse.com/docs/en/query_language/create/#create-table Запретить модификацию данных в партиции. На партицию ставится флаг, что она заблокирована. В неё нельзя делать INSERT и ALTER. С файлов снимается доступ на запись. +Upd: не нужно. + ## Настройка join_use_nulls: поддержка для LEFT ARRAY JOIN. -## Внешние словари из Aerospike/Couchbase/Cassandra (на выбор). +## + Внешние словари из Aerospike/Couchbase/Cassandra (на выбор). Подключить одну из key-value БД как источник. +Upd: сделали Redis, Cassandra, MongoDB. + ## + Движок таблиц Mongo, табличная функция mongo. Возможность легко импортировать данные из MongoDB. @@ -181,7 +187,7 @@ https://clickhouse.com/docs/en/operations/table_engines/external_data/ Не работает, если открыть clickhouse-client в интерактивном режиме и делать несколько запросов. -## Настройка для возможности получить частичный результат при cancel-е. +## + Настройка для возможности получить частичный результат при cancel-е. Хотим по Ctrl+C получить те данные, которые успели обработаться. From 6bdbcd3f436f7f55e0ab71e24d0c96df072d0003 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Sun, 11 Jun 2023 14:26:13 +0000 Subject: [PATCH 0757/1072] Stabilize tests --- .../00515_enhanced_time_zones.reference | 26 ++++++++--- .../0_stateless/00515_enhanced_time_zones.sql | 45 ++++++++++--------- 2 files changed, 44 insertions(+), 27 deletions(-) diff --git a/tests/queries/0_stateless/00515_enhanced_time_zones.reference b/tests/queries/0_stateless/00515_enhanced_time_zones.reference index 2ee2c3eac81..2555c885558 100644 --- a/tests/queries/0_stateless/00515_enhanced_time_zones.reference +++ b/tests/queries/0_stateless/00515_enhanced_time_zones.reference @@ -16,10 +16,22 @@ 2017-11-05 08:07:47 2017-11-05 10:37:47 2017-11-05 10:37:47 --- Test const/non-const timezone arguments -- -Asia/Kolkata 2017-11-05 08:07:47 -42 Asia/Kolkata 1970-01-01 00:00:00.042 -42 Asia/Kolkata 1970-01-01 00:00:00.000042 -42 Asia/Kolkata 1970-01-01 00:00:00.000000042 -42 Asia/Kolkata 2010-11-04 01:42:54 -42 Asia/Kolkata 2010-11-04 01:42:54.657 +-- Test const timezone arguments -- +42 +43 +42 +43 +42 +43 +42 +43 +42 +43 +42 +43 +42 +43 +42 +43 +42 +43 diff --git a/tests/queries/0_stateless/00515_enhanced_time_zones.sql b/tests/queries/0_stateless/00515_enhanced_time_zones.sql index 7659b6e4603..5f40cfb53c1 100644 --- a/tests/queries/0_stateless/00515_enhanced_time_zones.sql +++ b/tests/queries/0_stateless/00515_enhanced_time_zones.sql @@ -21,38 +21,43 @@ SELECT toString(toDateTime('2017-11-05 08:07:47', 'Asia/Istanbul')); SELECT toString(toTimeZone(toDateTime('2017-11-05 08:07:47', 'Asia/Istanbul'), 'Asia/Kolkata')); SELECT toString(toDateTime('2017-11-05 08:07:47', 'Asia/Istanbul'), 'Asia/Kolkata'); -SELECT '-- Test const/non-const timezone arguments --'; +SELECT '-- Test const timezone arguments --'; -SELECT materialize('Asia/Kolkata') tz, toTimeZone(toDateTime('2017-11-05 08:07:47', 'Asia/Istanbul'), tz) SETTINGS allow_nonconst_timezone_arguments = 0; -- { serverError ILLEGAL_COLUMN } -SELECT materialize('Asia/Kolkata') tz, toTimeZone(toDateTime('2017-11-05 08:07:47', 'Asia/Istanbul'), tz) SETTINGS allow_nonconst_timezone_arguments = 1; +DROP TABLE IF EXISTS tab; -SELECT materialize('Asia/Kolkata') tz, now(tz) SETTINGS allow_nonconst_timezone_arguments = 0; -- { serverError ILLEGAL_COLUMN } --- SELECT materialize('Asia/Kolkata') tz, now(tz) SETTINGS allow_nonconst_timezone_arguments = 1; +CREATE TABLE tab (val Int64, tz String) engine=Log; +INSERT INTO tab VALUES (42, 'Asia/Singapore') (43, 'Asia/Tokyo'); -SELECT materialize('Asia/Kolkata') tz, now64(9, tz) SETTINGS allow_nonconst_timezone_arguments = 0; -- { serverError ILLEGAL_COLUMN } --- SELECT materialize('Asia/Kolkata') tz, now64(9, tz) SETTINGS allow_nonconst_timezone_arguments = 1; +SELECT val FROM tab WHERE now(tz) != toDateTime('2000-01-01 00:00:00') ORDER BY val SETTINGS allow_nonconst_timezone_arguments = 0; -- { serverError ILLEGAL_COLUMN } +SELECT val FROM tab WHERE now(tz) != toDateTime('2000-01-01 00:00:00') ORDER BY val SETTINGS allow_nonconst_timezone_arguments = 1; -SELECT materialize('Asia/Kolkata') tz, nowInBlock(tz) SETTINGS allow_nonconst_timezone_arguments = 0; -- { serverError ILLEGAL_COLUMN } --- SELECT materialize('Asia/Kolkata') tz, nowInBlock(tz) SETTINGS allow_nonconst_timezone_arguments = 1; +SELECT val FROM tab WHERE now64(9, tz) != toDateTime64('2000-01-01 00:00:00', 6) ORDER BY val SETTINGS allow_nonconst_timezone_arguments = 0; -- { serverError ILLEGAL_COLUMN } +SELECT val FROM tab WHERE now64(9, tz) != toDateTime64('2000-01-01 00:00:00', 6) ORDER BY val SETTINGS allow_nonconst_timezone_arguments = 1; -SELECT materialize(42::Int64) ts, materialize('Asia/Kolkata') tz, fromUnixTimestamp64Milli(ts, tz) SETTINGS allow_nonconst_timezone_arguments = 0; -- { serverError ILLEGAL_COLUMN } -SELECT materialize(42::Int64) ts, materialize('Asia/Kolkata') tz, fromUnixTimestamp64Milli(ts, tz) SETTINGS allow_nonconst_timezone_arguments = 1; +SELECT val FROM tab WHERE nowInBlock(tz) != toDateTime('2000-01-01 00:00:00') ORDER BY val SETTINGS allow_nonconst_timezone_arguments = 0; -- { serverError ILLEGAL_COLUMN } +SELECT val FROM tab WHERE nowInBlock(tz) != toDateTime('2000-01-01 00:00:00') ORDER BY val SETTINGS allow_nonconst_timezone_arguments = 1; -SELECT materialize(42::Int64) ts, materialize('Asia/Kolkata') tz, fromUnixTimestamp64Micro(ts, tz) SETTINGS allow_nonconst_timezone_arguments = 0; -- { serverError ILLEGAL_COLUMN } -SELECT materialize(42::Int64) ts, materialize('Asia/Kolkata') tz, fromUnixTimestamp64Micro(ts, tz) SETTINGS allow_nonconst_timezone_arguments = 1; +SELECT val FROM tab WHERE toTimeZone(toDateTime(val), tz) != toDateTime('2023-06-11 14:14:14') ORDER BY val SETTINGS allow_nonconst_timezone_arguments = 0; -- { serverError ILLEGAL_COLUMN } +SELECT val FROM tab WHERE toTimeZone(toDateTime(val), tz) != toDateTime('2023-06-11 14:14:14') ORDER BY val SETTINGS allow_nonconst_timezone_arguments = 1; -SELECT materialize(42::Int64) ts, materialize('Asia/Kolkata') tz, fromUnixTimestamp64Nano(ts, tz) SETTINGS allow_nonconst_timezone_arguments = 0; -- { serverError ILLEGAL_COLUMN } -SELECT materialize(42::Int64) ts, materialize('Asia/Kolkata') tz, fromUnixTimestamp64Nano(ts, tz) SETTINGS allow_nonconst_timezone_arguments = 1; +SELECT val FROM tab WHERE fromUnixTimestamp64Milli(val, tz) != toDateTime64('2023-06-11 14:14:14', 6) ORDER BY val SETTINGS allow_nonconst_timezone_arguments = 0; -- { serverError ILLEGAL_COLUMN } +SELECT val FROM tab WHERE fromUnixTimestamp64Milli(val, tz) != toDateTime64('2023-06-11 14:14:14', 6) ORDER BY val SETTINGS allow_nonconst_timezone_arguments = 1; -SELECT materialize(42::Int64) ts, materialize('Asia/Kolkata') tz, snowflakeToDateTime(ts, tz) settings allow_nonconst_timezone_arguments = 0; -- { serverError ILLEGAL_COLUMN } -SELECT materialize(42::Int64) ts, materialize('Asia/Kolkata') tz, snowflakeToDateTime(ts, tz) settings allow_nonconst_timezone_arguments = 1; +SELECT val FROM tab WHERE fromUnixTimestamp64Micro(val, tz) != toDateTime64('2023-06-11 14:14:14', 6) ORDER BY val SETTINGS allow_nonconst_timezone_arguments = 0; -- { serverError ILLEGAL_COLUMN } +SELECT val FROM tab WHERE fromUnixTimestamp64Micro(val, tz) != toDateTime64('2023-06-11 14:14:14', 6) ORDER BY val SETTINGS allow_nonconst_timezone_arguments = 1; -SELECT materialize(42::Int64) ts, materialize('Asia/Kolkata') tz, snowflakeToDateTime64(ts, tz) settings allow_nonconst_timezone_arguments = 0; -- { serverError ILLEGAL_COLUMN } -SELECT materialize(42::Int64) ts, materialize('Asia/Kolkata') tz, snowflakeToDateTime64(ts, tz) settings allow_nonconst_timezone_arguments = 1; +SELECT val FROM tab WHERE fromUnixTimestamp64Nano(val, tz) != toDateTime64('2023-06-11 14:14:14', 6) ORDER BY val SETTINGS allow_nonconst_timezone_arguments = 0; -- { serverError ILLEGAL_COLUMN } +SELECT val FROM tab WHERE fromUnixTimestamp64Nano(val, tz) != toDateTime64('2023-06-11 14:14:14', 6) ORDER BY val SETTINGS allow_nonconst_timezone_arguments = 1; + +SELECT val FROM tab WHERE snowflakeToDateTime(val, tz) != toDateTime('2023-06-11 14:14:14') ORDER BY val SETTINGS allow_nonconst_timezone_arguments = 0; -- { serverError ILLEGAL_COLUMN } +SELECT val FROM tab WHERE snowflakeToDateTime(val, tz) != toDateTime('2023-06-11 14:14:14') ORDER BY val SETTINGS allow_nonconst_timezone_arguments = 1; + +SELECT val FROM tab WHERE snowflakeToDateTime64(val, tz) != toDateTime64('2023-06-11 14:14:14', 6) ORDER BY val SETTINGS allow_nonconst_timezone_arguments = 0; -- { serverError ILLEGAL_COLUMN } +SELECT val FROM tab WHERE snowflakeToDateTime64(val, tz) != toDateTime64('2023-06-11 14:14:14', 6) ORDER BY val SETTINGS allow_nonconst_timezone_arguments = 1; -- test for a related bug: -DROP TABLE IF EXISTS tab; +DROP TABLE tab; SET allow_nonconst_timezone_arguments = 1; From 48e03ac92a457d612dd8b2e4838dce1e47e51109 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Sun, 11 Jun 2023 14:33:21 +0000 Subject: [PATCH 0758/1072] Revert "Exclude some tests with QPL from fasttest" This reverts commit ffb941624bc971886212e0745716e79688a154a1. --- .../0_stateless/00804_test_alter_compression_codecs.sql | 3 --- .../0_stateless/00804_test_custom_compression_codecs.sql | 3 --- .../00804_test_custom_compression_codes_log_storages.sql | 3 --- .../0_stateless/00804_test_deflate_qpl_codec_compression.sql | 3 --- 4 files changed, 12 deletions(-) diff --git a/tests/queries/0_stateless/00804_test_alter_compression_codecs.sql b/tests/queries/0_stateless/00804_test_alter_compression_codecs.sql index eb1abda9a21..fd9855e82d3 100644 --- a/tests/queries/0_stateless/00804_test_alter_compression_codecs.sql +++ b/tests/queries/0_stateless/00804_test_alter_compression_codecs.sql @@ -1,6 +1,3 @@ ---Tags: no-fasttest --- no-fasttest because DEFLATE_QPL isn't available in fasttest - SET send_logs_level = 'fatal'; DROP TABLE IF EXISTS alter_compression_codec; diff --git a/tests/queries/0_stateless/00804_test_custom_compression_codecs.sql b/tests/queries/0_stateless/00804_test_custom_compression_codecs.sql index df74620a201..89e77f758a7 100644 --- a/tests/queries/0_stateless/00804_test_custom_compression_codecs.sql +++ b/tests/queries/0_stateless/00804_test_custom_compression_codecs.sql @@ -1,6 +1,3 @@ ---Tags: no-fasttest --- no-fasttest because DEFLATE_QPL isn't available in fasttest - SET send_logs_level = 'fatal'; SET allow_suspicious_codecs = 1; SET enable_deflate_qpl_codec = 1; diff --git a/tests/queries/0_stateless/00804_test_custom_compression_codes_log_storages.sql b/tests/queries/0_stateless/00804_test_custom_compression_codes_log_storages.sql index 67c0074c58f..a629df2666d 100644 --- a/tests/queries/0_stateless/00804_test_custom_compression_codes_log_storages.sql +++ b/tests/queries/0_stateless/00804_test_custom_compression_codes_log_storages.sql @@ -1,6 +1,3 @@ ---Tags: no-fasttest --- no-fasttest because DEFLATE_QPL isn't available in fasttest - SET send_logs_level = 'fatal'; SET allow_suspicious_codecs = 1; SET enable_deflate_qpl_codec = 1; diff --git a/tests/queries/0_stateless/00804_test_deflate_qpl_codec_compression.sql b/tests/queries/0_stateless/00804_test_deflate_qpl_codec_compression.sql index a46272112a9..5a56fc0d576 100644 --- a/tests/queries/0_stateless/00804_test_deflate_qpl_codec_compression.sql +++ b/tests/queries/0_stateless/00804_test_deflate_qpl_codec_compression.sql @@ -1,6 +1,3 @@ ---Tags: no-fasttest --- no-fasttest because DEFLATE_QPL isn't available in fasttest - SET send_logs_level = 'fatal'; SET enable_deflate_qpl_codec = 1; From d228411f41eabf7e443fbbb2f4148880a3da78fa Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Sun, 11 Jun 2023 14:39:15 +0000 Subject: [PATCH 0759/1072] Reset modified tests --- ...04_test_alter_compression_codecs.reference | 17 ++----- .../00804_test_alter_compression_codecs.sql | 22 +++------- ...4_test_custom_compression_codecs.reference | 8 ++-- .../00804_test_custom_compression_codecs.sql | 44 ++++++++----------- ...m_compression_codes_log_storages.reference | 20 ++++----- ..._custom_compression_codes_log_storages.sql | 41 ++++++++--------- ...804_test_deflate_qpl_codec_compression.sql | 4 ++ 7 files changed, 63 insertions(+), 93 deletions(-) diff --git a/tests/queries/0_stateless/00804_test_alter_compression_codecs.reference b/tests/queries/0_stateless/00804_test_alter_compression_codecs.reference index 5c77a102740..cfbfadf1e67 100644 --- a/tests/queries/0_stateless/00804_test_alter_compression_codecs.reference +++ b/tests/queries/0_stateless/00804_test_alter_compression_codecs.reference @@ -18,7 +18,7 @@ CODEC(NONE) 2018-01-01 4 4 2018-01-01 5 5 2018-01-01 6 6 -CODEC(DEFLATE_QPL) +CODEC(ZSTD(1), LZ4HC(0), LZ4, LZ4, NONE) 2018-01-01 1 default_value 2018-01-01 2 default_value 2018-01-01 3 3 @@ -27,18 +27,7 @@ CODEC(DEFLATE_QPL) 2018-01-01 6 6 2018-01-01 7 7 2018-01-01 8 8 -CODEC(ZSTD(1), LZ4HC(0), LZ4, LZ4, DEFLATE_QPL, NONE) -2018-01-01 1 default_value -2018-01-01 2 default_value -2018-01-01 3 3 -2018-01-01 4 4 -2018-01-01 5 5 -2018-01-01 6 6 -2018-01-01 7 7 -2018-01-01 8 8 -2018-01-01 9 9 -2018-01-01 10 10 -CODEC(ZSTD(1), LZ4HC(0), LZ4, LZ4, DEFLATE_QPL, NONE) -CODEC(NONE, LZ4, LZ4HC(0), ZSTD(1), DEFLATE_QPL) +CODEC(ZSTD(1), LZ4HC(0), LZ4, LZ4, NONE) +CODEC(NONE, LZ4, LZ4HC(0), ZSTD(1)) 2 1 diff --git a/tests/queries/0_stateless/00804_test_alter_compression_codecs.sql b/tests/queries/0_stateless/00804_test_alter_compression_codecs.sql index fd9855e82d3..85e5f8b63ad 100644 --- a/tests/queries/0_stateless/00804_test_alter_compression_codecs.sql +++ b/tests/queries/0_stateless/00804_test_alter_compression_codecs.sql @@ -28,20 +28,12 @@ SELECT * FROM alter_compression_codec ORDER BY id; OPTIMIZE TABLE alter_compression_codec FINAL; SELECT * FROM alter_compression_codec ORDER BY id; -SET enable_deflate_qpl_codec = 1; -ALTER TABLE alter_compression_codec MODIFY COLUMN alter_column CODEC(DEFLATE_QPL); +SET allow_suspicious_codecs = 1; +ALTER TABLE alter_compression_codec MODIFY COLUMN alter_column CODEC(ZSTD, LZ4HC, LZ4, LZ4, NONE); SELECT compression_codec FROM system.columns WHERE database = currentDatabase() AND table = 'alter_compression_codec' AND name = 'alter_column'; INSERT INTO alter_compression_codec VALUES('2018-01-01', 7, '7'); INSERT INTO alter_compression_codec VALUES('2018-01-01', 8, '8'); -SELECT * FROM alter_compression_codec ORDER BY id; - -SET allow_suspicious_codecs = 1; -ALTER TABLE alter_compression_codec MODIFY COLUMN alter_column CODEC(ZSTD, LZ4HC, LZ4, LZ4, DEFLATE_QPL, NONE); -SELECT compression_codec FROM system.columns WHERE database = currentDatabase() AND table = 'alter_compression_codec' AND name = 'alter_column'; - -INSERT INTO alter_compression_codec VALUES('2018-01-01', 9, '9'); -INSERT INTO alter_compression_codec VALUES('2018-01-01', 10, '10'); OPTIMIZE TABLE alter_compression_codec FINAL; SELECT * FROM alter_compression_codec ORDER BY id; @@ -62,17 +54,15 @@ ALTER TABLE alter_bad_codec ADD COLUMN alter_column DateTime DEFAULT '2019-01-01 ALTER TABLE alter_bad_codec ADD COLUMN alter_column DateTime DEFAULT '2019-01-01 00:00:00' CODEC(ZSTD(100)); -- { serverError 433 } -ALTER TABLE alter_bad_codec ADD COLUMN alter_column DateTime DEFAULT '2019-01-01 00:00:00' CODEC(DEFLATE_QPL(100)); -- { serverError DATA_TYPE_CANNOT_HAVE_ARGUMENTS } - DROP TABLE IF EXISTS alter_bad_codec; DROP TABLE IF EXISTS large_alter_table_00804; DROP TABLE IF EXISTS store_of_hash_00804; CREATE TABLE large_alter_table_00804 ( - somedate Date CODEC(ZSTD, ZSTD, ZSTD(12), LZ4HC(12), DEFLATE_QPL), - id UInt64 CODEC(LZ4, ZSTD, NONE, LZ4HC, DEFLATE_QPL), - data String CODEC(ZSTD(2), LZ4HC, NONE, LZ4, LZ4, DEFLATE_QPL) + somedate Date CODEC(ZSTD, ZSTD, ZSTD(12), LZ4HC(12)), + id UInt64 CODEC(LZ4, ZSTD, NONE, LZ4HC), + data String CODEC(ZSTD(2), LZ4HC, NONE, LZ4, LZ4) ) ENGINE = MergeTree() PARTITION BY somedate ORDER BY id SETTINGS index_granularity = 2, index_granularity_bytes = '10Mi', min_bytes_for_wide_part = 0; INSERT INTO large_alter_table_00804 SELECT toDate('2019-01-01'), number, toString(number + rand()) FROM system.numbers LIMIT 300000; @@ -81,7 +71,7 @@ CREATE TABLE store_of_hash_00804 (hash UInt64) ENGINE = Memory(); INSERT INTO store_of_hash_00804 SELECT sum(cityHash64(*)) FROM large_alter_table_00804; -ALTER TABLE large_alter_table_00804 MODIFY COLUMN data CODEC(NONE, LZ4, LZ4HC, ZSTD, DEFLATE_QPL); +ALTER TABLE large_alter_table_00804 MODIFY COLUMN data CODEC(NONE, LZ4, LZ4HC, ZSTD); OPTIMIZE TABLE large_alter_table_00804; diff --git a/tests/queries/0_stateless/00804_test_custom_compression_codecs.reference b/tests/queries/0_stateless/00804_test_custom_compression_codecs.reference index 8b51d65004a..7bd91e5a69b 100644 --- a/tests/queries/0_stateless/00804_test_custom_compression_codecs.reference +++ b/tests/queries/0_stateless/00804_test_custom_compression_codecs.reference @@ -1,6 +1,6 @@ -1 hello 2018-12-14 2018-12-14 1.1 aaa 5 -2 world 2018-12-15 2018-12-15 2.2 bbb 6 -3 ! 2018-12-16 2018-12-16 3.3 ccc 7 +1 hello 2018-12-14 1.1 aaa 5 +2 world 2018-12-15 2.2 bbb 6 +3 ! 2018-12-16 3.3 ccc 7 2 1 world 2018-10-05 1.1 2 hello 2018-10-01 2.2 @@ -9,7 +9,7 @@ 10003 274972506.6 9175437371954010821 -CREATE TABLE default.compression_codec_multiple_more_types\n(\n `id` Decimal(38, 13) CODEC(ZSTD(1), LZ4, ZSTD(1), ZSTD(1), Delta(2), Delta(4), Delta(1), LZ4HC(0), DEFLATE_QPL),\n `data` FixedString(12) CODEC(ZSTD(1), ZSTD(1), NONE, NONE, NONE, LZ4HC(0), DEFLATE_QPL),\n `ddd.age` Array(UInt8) CODEC(LZ4, LZ4HC(0), NONE, NONE, NONE, ZSTD(1), Delta(8), DEFLATE_QPL),\n `ddd.Name` Array(String) CODEC(LZ4, LZ4HC(0), NONE, NONE, NONE, ZSTD(1), Delta(8), DEFLATE_QPL)\n)\nENGINE = MergeTree\nORDER BY tuple()\nSETTINGS index_granularity = 8192 +CREATE TABLE default.compression_codec_multiple_more_types\n(\n `id` Decimal(38, 13) CODEC(ZSTD(1), LZ4, ZSTD(1), ZSTD(1), Delta(2), Delta(4), Delta(1), LZ4HC(0)),\n `data` FixedString(12) CODEC(ZSTD(1), ZSTD(1), NONE, NONE, NONE, LZ4HC(0)),\n `ddd.age` Array(UInt8) CODEC(LZ4, LZ4HC(0), NONE, NONE, NONE, ZSTD(1), Delta(8)),\n `ddd.Name` Array(String) CODEC(LZ4, LZ4HC(0), NONE, NONE, NONE, ZSTD(1), Delta(8))\n)\nENGINE = MergeTree\nORDER BY tuple()\nSETTINGS index_granularity = 8192 1.5555555555555 hello world! [77] ['John'] 7.1 xxxxxxxxxxxx [127] ['Henry'] ! diff --git a/tests/queries/0_stateless/00804_test_custom_compression_codecs.sql b/tests/queries/0_stateless/00804_test_custom_compression_codecs.sql index 89e77f758a7..c080c2fc98e 100644 --- a/tests/queries/0_stateless/00804_test_custom_compression_codecs.sql +++ b/tests/queries/0_stateless/00804_test_custom_compression_codecs.sql @@ -1,6 +1,5 @@ SET send_logs_level = 'fatal'; SET allow_suspicious_codecs = 1; -SET enable_deflate_qpl_codec = 1; DROP TABLE IF EXISTS compression_codec; @@ -8,21 +7,20 @@ CREATE TABLE compression_codec( id UInt64 CODEC(LZ4), data String CODEC(ZSTD), ddd Date CODEC(NONE), - ddd32 Date32 CODEC(DEFLATE_QPL), somenum Float64 CODEC(ZSTD(2)), somestr FixedString(3) CODEC(LZ4HC(7)), - othernum Int64 CODEC(Delta), + othernum Int64 CODEC(Delta) ) ENGINE = MergeTree() ORDER BY tuple(); -INSERT INTO compression_codec VALUES(1, 'hello', toDate('2018-12-14'), toDate32('2018-12-14'), 1.1, 'aaa', 5); -INSERT INTO compression_codec VALUES(2, 'world', toDate('2018-12-15'), toDate32('2018-12-15'), 2.2, 'bbb', 6); -INSERT INTO compression_codec VALUES(3, '!', toDate('2018-12-16'), toDate32('2018-12-16'), 3.3, 'ccc', 7); +INSERT INTO compression_codec VALUES(1, 'hello', toDate('2018-12-14'), 1.1, 'aaa', 5); +INSERT INTO compression_codec VALUES(2, 'world', toDate('2018-12-15'), 2.2, 'bbb', 6); +INSERT INTO compression_codec VALUES(3, '!', toDate('2018-12-16'), 3.3, 'ccc', 7); SELECT * FROM compression_codec ORDER BY id; OPTIMIZE TABLE compression_codec FINAL; -INSERT INTO compression_codec VALUES(2, '', toDate('2018-12-13'), toDate32('2018-12-13'), 4.4, 'ddd', 8); +INSERT INTO compression_codec VALUES(2, '', toDate('2018-12-13'), 4.4, 'ddd', 8); DETACH TABLE compression_codec; ATTACH TABLE compression_codec; @@ -33,31 +31,25 @@ DROP TABLE IF EXISTS compression_codec; DROP TABLE IF EXISTS bad_codec; DROP TABLE IF EXISTS params_when_no_params; -DROP TABLE IF EXISTS params_when_no_params2; DROP TABLE IF EXISTS too_many_params; DROP TABLE IF EXISTS codec_multiple_direct_specification_1; DROP TABLE IF EXISTS codec_multiple_direct_specification_2; -DROP TABLE IF EXISTS codec_multiple_direct_specification_3; DROP TABLE IF EXISTS delta_bad_params1; DROP TABLE IF EXISTS delta_bad_params2; CREATE TABLE bad_codec(id UInt64 CODEC(adssadads)) ENGINE = MergeTree() order by tuple(); -- { serverError 432 } CREATE TABLE too_many_params(id UInt64 CODEC(ZSTD(2,3,4,5))) ENGINE = MergeTree() order by tuple(); -- { serverError 431 } CREATE TABLE params_when_no_params(id UInt64 CODEC(LZ4(1))) ENGINE = MergeTree() ORDER BY tuple(); -- { serverError 378 } -CREATE TABLE params_when_no_params2(id UInt64 CODEC(DEFLATE_QPL(1))) ENGINE = MergeTree() ORDER BY tuple(); -- { serverError 378 } CREATE TABLE codec_multiple_direct_specification_1(id UInt64 CODEC(MULTIPLE(LZ4, ZSTD))) ENGINE = MergeTree() ORDER BY tuple(); -- { serverError 432 } CREATE TABLE codec_multiple_direct_specification_2(id UInt64 CODEC(multiple(LZ4, ZSTD))) ENGINE = MergeTree() ORDER BY tuple(); -- { serverError 432 } -CREATE TABLE codec_multiple_direct_specification_3(id UInt64 CODEC(multiple(LZ4, DEFLATE_QPL))) ENGINE = MergeTree() ORDER BY tuple(); -- { serverError 432 } CREATE TABLE delta_bad_params1(id UInt64 CODEC(Delta(3))) ENGINE = MergeTree() ORDER BY tuple(); -- { serverError 433 } CREATE TABLE delta_bad_params2(id UInt64 CODEC(Delta(16))) ENGINE = MergeTree() ORDER BY tuple(); -- { serverError 433 } DROP TABLE IF EXISTS bad_codec; DROP TABLE IF EXISTS params_when_no_params; -DROP TABLE IF EXISTS params_when_no_params2; DROP TABLE IF EXISTS too_many_params; DROP TABLE IF EXISTS codec_multiple_direct_specification_1; DROP TABLE IF EXISTS codec_multiple_direct_specification_2; -DROP TABLE IF EXISTS codec_multiple_direct_specification_3; DROP TABLE IF EXISTS delta_bad_params1; DROP TABLE IF EXISTS delta_bad_params2; @@ -66,10 +58,10 @@ DROP TABLE IF EXISTS compression_codec_multiple; SET network_compression_method = 'lz4hc'; CREATE TABLE compression_codec_multiple ( - id UInt64 CODEC(LZ4, ZSTD, NONE, LZ4HC, Delta(4), DEFLATE_QPL), - data String CODEC(ZSTD(2), NONE, Delta(2), LZ4HC, LZ4, LZ4, Delta(8), DEFLATE_QPL), - ddd Date CODEC(NONE, NONE, NONE, Delta(1), LZ4, ZSTD, LZ4HC, LZ4HC, DEFLATE_QPL), - somenum Float64 CODEC(Delta(4), LZ4, LZ4, ZSTD(2), LZ4HC(5), ZSTD(3), ZSTD, DEFLATE_QPL) + id UInt64 CODEC(LZ4, ZSTD, NONE, LZ4HC, Delta(4)), + data String CODEC(ZSTD(2), NONE, Delta(2), LZ4HC, LZ4, LZ4, Delta(8)), + ddd Date CODEC(NONE, NONE, NONE, Delta(1), LZ4, ZSTD, LZ4HC, LZ4HC), + somenum Float64 CODEC(Delta(4), LZ4, LZ4, ZSTD(2), LZ4HC(5), ZSTD(3), ZSTD) ) ENGINE = MergeTree() ORDER BY tuple(); INSERT INTO compression_codec_multiple VALUES (1, 'world', toDate('2018-10-05'), 1.1), (2, 'hello', toDate('2018-10-01'), 2.2), (3, 'buy', toDate('2018-10-11'), 3.3); @@ -93,15 +85,15 @@ SELECT sum(cityHash64(*)) FROM compression_codec_multiple; DROP TABLE IF EXISTS compression_codec_multiple_more_types; CREATE TABLE compression_codec_multiple_more_types ( - id Decimal128(13) CODEC(ZSTD, LZ4, ZSTD, ZSTD, Delta(2), Delta(4), Delta(1), LZ4HC, DEFLATE_QPL), - data FixedString(12) CODEC(ZSTD, ZSTD, Delta, Delta, Delta, NONE, NONE, NONE, LZ4HC, DEFLATE_QPL), - ddd Nested (age UInt8, Name String) CODEC(LZ4, LZ4HC, NONE, NONE, NONE, ZSTD, Delta(8), DEFLATE_QPL) + id Decimal128(13) CODEC(ZSTD, LZ4, ZSTD, ZSTD, Delta(2), Delta(4), Delta(1), LZ4HC), + data FixedString(12) CODEC(ZSTD, ZSTD, Delta, Delta, Delta, NONE, NONE, NONE, LZ4HC), + ddd Nested (age UInt8, Name String) CODEC(LZ4, LZ4HC, NONE, NONE, NONE, ZSTD, Delta(8)) ) ENGINE = MergeTree() ORDER BY tuple(); -- { serverError 36 } CREATE TABLE compression_codec_multiple_more_types ( - id Decimal128(13) CODEC(ZSTD, LZ4, ZSTD, ZSTD, Delta(2), Delta(4), Delta(1), LZ4HC, DEFLATE_QPL), - data FixedString(12) CODEC(ZSTD, ZSTD, NONE, NONE, NONE, LZ4HC, DEFLATE_QPL), - ddd Nested (age UInt8, Name String) CODEC(LZ4, LZ4HC, NONE, NONE, NONE, ZSTD, Delta(8), DEFLATE_QPL) + id Decimal128(13) CODEC(ZSTD, LZ4, ZSTD, ZSTD, Delta(2), Delta(4), Delta(1), LZ4HC), + data FixedString(12) CODEC(ZSTD, ZSTD, NONE, NONE, NONE, LZ4HC), + ddd Nested (age UInt8, Name String) CODEC(LZ4, LZ4HC, NONE, NONE, NONE, ZSTD, Delta(8)) ) ENGINE = MergeTree() ORDER BY tuple(); SHOW CREATE TABLE compression_codec_multiple_more_types; @@ -117,9 +109,9 @@ SET network_compression_method = 'zstd'; SET network_zstd_compression_level = 5; CREATE TABLE compression_codec_multiple_with_key ( - somedate Date CODEC(ZSTD, ZSTD, ZSTD(12), LZ4HC(12), Delta, Delta, DEFLATE_QPL), - id UInt64 CODEC(LZ4, ZSTD, Delta, NONE, LZ4HC, Delta, DEFLATE_QPL), - data String CODEC(ZSTD(2), Delta(1), LZ4HC, NONE, LZ4, LZ4, DEFLATE_QPL) + somedate Date CODEC(ZSTD, ZSTD, ZSTD(12), LZ4HC(12), Delta, Delta), + id UInt64 CODEC(LZ4, ZSTD, Delta, NONE, LZ4HC, Delta), + data String CODEC(ZSTD(2), Delta(1), LZ4HC, NONE, LZ4, LZ4) ) ENGINE = MergeTree() PARTITION BY somedate ORDER BY id SETTINGS index_granularity = 2, index_granularity_bytes = '10Mi'; diff --git a/tests/queries/0_stateless/00804_test_custom_compression_codes_log_storages.reference b/tests/queries/0_stateless/00804_test_custom_compression_codes_log_storages.reference index d64b8a77eed..8145ca99829 100644 --- a/tests/queries/0_stateless/00804_test_custom_compression_codes_log_storages.reference +++ b/tests/queries/0_stateless/00804_test_custom_compression_codes_log_storages.reference @@ -1,9 +1,9 @@ -CREATE TABLE default.compression_codec_log\n(\n `id` UInt64 CODEC(LZ4),\n `data` String CODEC(ZSTD(1)),\n `ddd` Date CODEC(NONE),\n `somenum` Float64 CODEC(ZSTD(2)),\n `somestr` FixedString(3) CODEC(LZ4HC(7)),\n `othernum` Int64 CODEC(Delta(8)),\n `qplstr` String CODEC(DEFLATE_QPL),\n `qplnum` UInt32 CODEC(DEFLATE_QPL)\n)\nENGINE = Log -1 hello 2018-12-14 1.1 aaa 5 qpl11 11 -2 world 2018-12-15 2.2 bbb 6 qpl22 22 -3 ! 2018-12-16 3.3 ccc 7 qpl33 33 +CREATE TABLE default.compression_codec_log\n(\n `id` UInt64 CODEC(LZ4),\n `data` String CODEC(ZSTD(1)),\n `ddd` Date CODEC(NONE),\n `somenum` Float64 CODEC(ZSTD(2)),\n `somestr` FixedString(3) CODEC(LZ4HC(7)),\n `othernum` Int64 CODEC(Delta(8))\n)\nENGINE = Log +1 hello 2018-12-14 1.1 aaa 5 +2 world 2018-12-15 2.2 bbb 6 +3 ! 2018-12-16 3.3 ccc 7 2 -CREATE TABLE default.compression_codec_multiple_log\n(\n `id` UInt64 CODEC(LZ4, ZSTD(1), NONE, LZ4HC(0), Delta(4), DEFLATE_QPL),\n `data` String CODEC(ZSTD(2), NONE, Delta(2), LZ4HC(0), LZ4, LZ4, Delta(8), DEFLATE_QPL),\n `ddd` Date CODEC(NONE, NONE, NONE, Delta(1), LZ4, ZSTD(1), LZ4HC(0), LZ4HC(0), DEFLATE_QPL),\n `somenum` Float64 CODEC(Delta(4), LZ4, LZ4, ZSTD(2), LZ4HC(5), ZSTD(3), ZSTD(1), DEFLATE_QPL)\n)\nENGINE = Log +CREATE TABLE default.compression_codec_multiple_log\n(\n `id` UInt64 CODEC(LZ4, ZSTD(1), NONE, LZ4HC(0), Delta(4)),\n `data` String CODEC(ZSTD(2), NONE, Delta(2), LZ4HC(0), LZ4, LZ4, Delta(8)),\n `ddd` Date CODEC(NONE, NONE, NONE, Delta(1), LZ4, ZSTD(1), LZ4HC(0), LZ4HC(0)),\n `somenum` Float64 CODEC(Delta(4), LZ4, LZ4, ZSTD(2), LZ4HC(5), ZSTD(3), ZSTD(1))\n)\nENGINE = Log 1 world 2018-10-05 1.1 2 hello 2018-10-01 2.2 3 buy 2018-10-11 3.3 @@ -11,12 +11,12 @@ CREATE TABLE default.compression_codec_multiple_log\n(\n `id` UInt64 CODEC(LZ 10003 274972506.6 9175437371954010821 -CREATE TABLE default.compression_codec_tiny_log\n(\n `id` UInt64 CODEC(LZ4),\n `data` String CODEC(ZSTD(1)),\n `ddd` Date CODEC(NONE),\n `somenum` Float64 CODEC(ZSTD(2)),\n `somestr` FixedString(3) CODEC(LZ4HC(7)),\n `othernum` Int64 CODEC(Delta(8)),\n `qplstr` String CODEC(DEFLATE_QPL),\n `qplnum` UInt32 CODEC(DEFLATE_QPL)\n)\nENGINE = TinyLog -1 hello 2018-12-14 1.1 aaa 5 qpl11 11 -2 world 2018-12-15 2.2 bbb 6 qpl22 22 -3 ! 2018-12-16 3.3 ccc 7 qpl33 33 +CREATE TABLE default.compression_codec_tiny_log\n(\n `id` UInt64 CODEC(LZ4),\n `data` String CODEC(ZSTD(1)),\n `ddd` Date CODEC(NONE),\n `somenum` Float64 CODEC(ZSTD(2)),\n `somestr` FixedString(3) CODEC(LZ4HC(7)),\n `othernum` Int64 CODEC(Delta(8))\n)\nENGINE = TinyLog +1 hello 2018-12-14 1.1 aaa 5 +2 world 2018-12-15 2.2 bbb 6 +3 ! 2018-12-16 3.3 ccc 7 2 -CREATE TABLE default.compression_codec_multiple_tiny_log\n(\n `id` UInt64 CODEC(LZ4, ZSTD(1), NONE, LZ4HC(0), Delta(4), DEFLATE_QPL),\n `data` String CODEC(ZSTD(2), NONE, Delta(2), LZ4HC(0), LZ4, LZ4, Delta(8), DEFLATE_QPL),\n `ddd` Date CODEC(NONE, NONE, NONE, Delta(1), LZ4, ZSTD(1), LZ4HC(0), LZ4HC(0), DEFLATE_QPL),\n `somenum` Float64 CODEC(Delta(4), LZ4, LZ4, ZSTD(2), LZ4HC(5), ZSTD(3), ZSTD(1), DEFLATE_QPL)\n)\nENGINE = TinyLog +CREATE TABLE default.compression_codec_multiple_tiny_log\n(\n `id` UInt64 CODEC(LZ4, ZSTD(1), NONE, LZ4HC(0), Delta(4)),\n `data` String CODEC(ZSTD(2), NONE, Delta(2), LZ4HC(0), LZ4, LZ4, Delta(8)),\n `ddd` Date CODEC(NONE, NONE, NONE, Delta(1), LZ4, ZSTD(1), LZ4HC(0), LZ4HC(0)),\n `somenum` Float64 CODEC(Delta(4), LZ4, LZ4, ZSTD(2), LZ4HC(5), ZSTD(3), ZSTD(1))\n)\nENGINE = TinyLog 1 world 2018-10-05 1.1 2 hello 2018-10-01 2.2 3 buy 2018-10-11 3.3 diff --git a/tests/queries/0_stateless/00804_test_custom_compression_codes_log_storages.sql b/tests/queries/0_stateless/00804_test_custom_compression_codes_log_storages.sql index a629df2666d..fba6a216762 100644 --- a/tests/queries/0_stateless/00804_test_custom_compression_codes_log_storages.sql +++ b/tests/queries/0_stateless/00804_test_custom_compression_codes_log_storages.sql @@ -1,6 +1,5 @@ SET send_logs_level = 'fatal'; SET allow_suspicious_codecs = 1; -SET enable_deflate_qpl_codec = 1; -- copy-paste for storage log @@ -12,20 +11,18 @@ CREATE TABLE compression_codec_log( ddd Date CODEC(NONE), somenum Float64 CODEC(ZSTD(2)), somestr FixedString(3) CODEC(LZ4HC(7)), - othernum Int64 CODEC(Delta), - qplstr String CODEC(DEFLATE_QPL), - qplnum UInt32 CODEC(DEFLATE_QPL), + othernum Int64 CODEC(Delta) ) ENGINE = Log(); SHOW CREATE TABLE compression_codec_log; -INSERT INTO compression_codec_log VALUES(1, 'hello', toDate('2018-12-14'), 1.1, 'aaa', 5, 'qpl11', 11); -INSERT INTO compression_codec_log VALUES(2, 'world', toDate('2018-12-15'), 2.2, 'bbb', 6,'qpl22', 22); -INSERT INTO compression_codec_log VALUES(3, '!', toDate('2018-12-16'), 3.3, 'ccc', 7, 'qpl33', 33); +INSERT INTO compression_codec_log VALUES(1, 'hello', toDate('2018-12-14'), 1.1, 'aaa', 5); +INSERT INTO compression_codec_log VALUES(2, 'world', toDate('2018-12-15'), 2.2, 'bbb', 6); +INSERT INTO compression_codec_log VALUES(3, '!', toDate('2018-12-16'), 3.3, 'ccc', 7); SELECT * FROM compression_codec_log ORDER BY id; -INSERT INTO compression_codec_log VALUES(2, '', toDate('2018-12-13'), 4.4, 'ddd', 8, 'qpl44', 44); +INSERT INTO compression_codec_log VALUES(2, '', toDate('2018-12-13'), 4.4, 'ddd', 8); DETACH TABLE compression_codec_log; ATTACH TABLE compression_codec_log; @@ -37,10 +34,10 @@ DROP TABLE IF EXISTS compression_codec_log; DROP TABLE IF EXISTS compression_codec_multiple_log; CREATE TABLE compression_codec_multiple_log ( - id UInt64 CODEC(LZ4, ZSTD, NONE, LZ4HC, Delta(4), DEFLATE_QPL), - data String CODEC(ZSTD(2), NONE, Delta(2), LZ4HC, LZ4, LZ4, Delta(8), DEFLATE_QPL), - ddd Date CODEC(NONE, NONE, NONE, Delta(1), LZ4, ZSTD, LZ4HC, LZ4HC, DEFLATE_QPL), - somenum Float64 CODEC(Delta(4), LZ4, LZ4, ZSTD(2), LZ4HC(5), ZSTD(3), ZSTD, DEFLATE_QPL) + id UInt64 CODEC(LZ4, ZSTD, NONE, LZ4HC, Delta(4)), + data String CODEC(ZSTD(2), NONE, Delta(2), LZ4HC, LZ4, LZ4, Delta(8)), + ddd Date CODEC(NONE, NONE, NONE, Delta(1), LZ4, ZSTD, LZ4HC, LZ4HC), + somenum Float64 CODEC(Delta(4), LZ4, LZ4, ZSTD(2), LZ4HC(5), ZSTD(3), ZSTD) ) ENGINE = Log(); SHOW CREATE TABLE compression_codec_multiple_log; @@ -72,20 +69,18 @@ CREATE TABLE compression_codec_tiny_log( ddd Date CODEC(NONE), somenum Float64 CODEC(ZSTD(2)), somestr FixedString(3) CODEC(LZ4HC(7)), - othernum Int64 CODEC(Delta), - qplstr String CODEC(DEFLATE_QPL), - qplnum UInt32 CODEC(DEFLATE_QPL), + othernum Int64 CODEC(Delta) ) ENGINE = TinyLog(); SHOW CREATE TABLE compression_codec_tiny_log; -INSERT INTO compression_codec_tiny_log VALUES(1, 'hello', toDate('2018-12-14'), 1.1, 'aaa', 5, 'qpl11', 11); -INSERT INTO compression_codec_tiny_log VALUES(2, 'world', toDate('2018-12-15'), 2.2, 'bbb', 6, 'qpl22', 22); -INSERT INTO compression_codec_tiny_log VALUES(3, '!', toDate('2018-12-16'), 3.3, 'ccc', 7, 'qpl33', 33); +INSERT INTO compression_codec_tiny_log VALUES(1, 'hello', toDate('2018-12-14'), 1.1, 'aaa', 5); +INSERT INTO compression_codec_tiny_log VALUES(2, 'world', toDate('2018-12-15'), 2.2, 'bbb', 6); +INSERT INTO compression_codec_tiny_log VALUES(3, '!', toDate('2018-12-16'), 3.3, 'ccc', 7); SELECT * FROM compression_codec_tiny_log ORDER BY id; -INSERT INTO compression_codec_tiny_log VALUES(2, '', toDate('2018-12-13'), 4.4, 'ddd', 8, 'qpl44', 44); +INSERT INTO compression_codec_tiny_log VALUES(2, '', toDate('2018-12-13'), 4.4, 'ddd', 8); DETACH TABLE compression_codec_tiny_log; ATTACH TABLE compression_codec_tiny_log; @@ -97,10 +92,10 @@ DROP TABLE IF EXISTS compression_codec_tiny_log; DROP TABLE IF EXISTS compression_codec_multiple_tiny_log; CREATE TABLE compression_codec_multiple_tiny_log ( - id UInt64 CODEC(LZ4, ZSTD, NONE, LZ4HC, Delta(4), DEFLATE_QPL), - data String CODEC(ZSTD(2), NONE, Delta(2), LZ4HC, LZ4, LZ4, Delta(8), DEFLATE_QPL), - ddd Date CODEC(NONE, NONE, NONE, Delta(1), LZ4, ZSTD, LZ4HC, LZ4HC, DEFLATE_QPL), - somenum Float64 CODEC(Delta(4), LZ4, LZ4, ZSTD(2), LZ4HC(5), ZSTD(3), ZSTD, DEFLATE_QPL) + id UInt64 CODEC(LZ4, ZSTD, NONE, LZ4HC, Delta(4)), + data String CODEC(ZSTD(2), NONE, Delta(2), LZ4HC, LZ4, LZ4, Delta(8)), + ddd Date CODEC(NONE, NONE, NONE, Delta(1), LZ4, ZSTD, LZ4HC, LZ4HC), + somenum Float64 CODEC(Delta(4), LZ4, LZ4, ZSTD(2), LZ4HC(5), ZSTD(3), ZSTD) ) ENGINE = TinyLog(); SHOW CREATE TABLE compression_codec_multiple_tiny_log; diff --git a/tests/queries/0_stateless/00804_test_deflate_qpl_codec_compression.sql b/tests/queries/0_stateless/00804_test_deflate_qpl_codec_compression.sql index 5a56fc0d576..78c57013eeb 100644 --- a/tests/queries/0_stateless/00804_test_deflate_qpl_codec_compression.sql +++ b/tests/queries/0_stateless/00804_test_deflate_qpl_codec_compression.sql @@ -1,3 +1,7 @@ +--Tags: no-fasttest, no-cpu-aarch64 +-- no-fasttest because DEFLATE_QPL isn't available in fasttest +-- no-cpu-aarch64 because DEFLATE_QPL is x86-only + SET send_logs_level = 'fatal'; SET enable_deflate_qpl_codec = 1; From d72751be27ba5f69337a0039f41e577c05a3ae7f Mon Sep 17 00:00:00 2001 From: alekseygolub Date: Sun, 11 Jun 2023 15:01:45 +0000 Subject: [PATCH 0760/1072] Added cache invalidation; Fix issues --- src/Databases/DatabaseFactory.cpp | 5 +-- src/Databases/DatabaseFilesystem.cpp | 40 ++++++++++++++----- src/Databases/DatabaseFilesystem.h | 8 +++- src/Databases/DatabaseHDFS.cpp | 6 +++ src/Databases/DatabaseHDFS.h | 5 ++- src/Databases/DatabaseS3.cpp | 14 +++---- src/Databases/DatabaseS3.h | 5 ++- .../0_stateless/02724_database_s3.reference | 4 -- .../queries/0_stateless/02724_database_s3.sh | 6 --- 9 files changed, 59 insertions(+), 34 deletions(-) diff --git a/src/Databases/DatabaseFactory.cpp b/src/Databases/DatabaseFactory.cpp index 26952cc574e..9d90c61bb41 100644 --- a/src/Databases/DatabaseFactory.cpp +++ b/src/Databases/DatabaseFactory.cpp @@ -15,10 +15,9 @@ #include #include #include -#include -#include -#include #include +#include +#include #include "config.h" diff --git a/src/Databases/DatabaseFilesystem.cpp b/src/Databases/DatabaseFilesystem.cpp index 8de609f0ca2..cf45240a5f0 100644 --- a/src/Databases/DatabaseFilesystem.cpp +++ b/src/Databases/DatabaseFilesystem.cpp @@ -94,14 +94,32 @@ bool DatabaseFilesystem::checkTableFilePath(const std::string & table_path, Cont return true; } -bool DatabaseFilesystem::isTableExist(const String & name, ContextPtr context_) const +StoragePtr DatabaseFilesystem::tryGetTableFromCache(const std::string & name) const { + StoragePtr table = nullptr; { std::lock_guard lock(mutex); - if (loaded_tables.find(name) != loaded_tables.end()) - return true; + auto it = loaded_tables.find(name); + if (it != loaded_tables.end()) + table = it->second; } + // invalidate cache if file no longer exists + if (table && !fs::exists(getTablePath(name))) + { + std::lock_guard lock(mutex); + loaded_tables.erase(name); + return nullptr; + } + + return table; +} + +bool DatabaseFilesystem::isTableExist(const String & name, ContextPtr context_) const +{ + if (tryGetTableFromCache(name)) + return true; + fs::path table_file_path(getTablePath(name)); return checkTableFilePath(table_file_path, context_, false); @@ -109,13 +127,9 @@ bool DatabaseFilesystem::isTableExist(const String & name, ContextPtr context_) StoragePtr DatabaseFilesystem::getTableImpl(const String & name, ContextPtr context_) const { - // Check if the table exists in the loaded tables map - { - std::lock_guard lock(mutex); - auto it = loaded_tables.find(name); - if (it != loaded_tables.end()) - return it->second; - } + // Check if table exists in loaded tables map + if (auto table = tryGetTableFromCache(name)) + return table; auto table_path = getTablePath(name); @@ -165,6 +179,12 @@ StoragePtr DatabaseFilesystem::tryGetTable(const String & name, ContextPtr conte } } +bool DatabaseFilesystem::empty() const +{ + std::lock_guard lock(mutex); + return loaded_tables.empty(); +} + ASTPtr DatabaseFilesystem::getCreateDatabaseQuery() const { auto settings = getContext()->getSettingsRef(); diff --git a/src/Databases/DatabaseFilesystem.h b/src/Databases/DatabaseFilesystem.h index 3d2ad695cc6..350ebfe37a3 100644 --- a/src/Databases/DatabaseFilesystem.h +++ b/src/Databases/DatabaseFilesystem.h @@ -31,7 +31,10 @@ public: StoragePtr tryGetTable(const String & name, ContextPtr context) const override; - bool empty() const override { return true; } + // Contains only temporary tables + bool shouldBeEmptyOnDetach() const override { return false; } + + bool empty() const override; bool isReadOnly() const override { return true; } @@ -45,13 +48,14 @@ public: protected: StoragePtr getTableImpl(const String & name, ContextPtr context) const; + StoragePtr tryGetTableFromCache(const std::string & name) const; + std::string getTablePath(const std::string & table_name) const; void addTable(const std::string & table_name, StoragePtr table_storage) const; bool checkTableFilePath(const std::string & table_path, ContextPtr context_, bool throw_on_error) const; - private: String path; mutable Tables loaded_tables TSA_GUARDED_BY(mutex); diff --git a/src/Databases/DatabaseHDFS.cpp b/src/Databases/DatabaseHDFS.cpp index 39c3f955bf5..34cb337cdbe 100644 --- a/src/Databases/DatabaseHDFS.cpp +++ b/src/Databases/DatabaseHDFS.cpp @@ -170,6 +170,12 @@ StoragePtr DatabaseHDFS::tryGetTable(const String & name, ContextPtr context_) c } } +bool DatabaseHDFS::empty() const +{ + std::lock_guard lock(mutex); + return loaded_tables.empty(); +} + ASTPtr DatabaseHDFS::getCreateDatabaseQuery() const { auto settings = getContext()->getSettingsRef(); diff --git a/src/Databases/DatabaseHDFS.h b/src/Databases/DatabaseHDFS.h index 9a506c5c8ac..c7071370b5e 100644 --- a/src/Databases/DatabaseHDFS.h +++ b/src/Databases/DatabaseHDFS.h @@ -33,7 +33,10 @@ public: StoragePtr tryGetTable(const String & name, ContextPtr context) const override; - bool empty() const override { return true; } + // Contains only temporary tables + bool shouldBeEmptyOnDetach() const override { return false; } + + bool empty() const override; bool isReadOnly() const override { return true; } diff --git a/src/Databases/DatabaseS3.cpp b/src/Databases/DatabaseS3.cpp index 96616426475..46f8a67687d 100644 --- a/src/Databases/DatabaseS3.cpp +++ b/src/Databases/DatabaseS3.cpp @@ -67,14 +67,8 @@ void DatabaseS3::addTable(const std::string & table_name, StoragePtr table_stora std::string DatabaseS3::getFullUrl(const std::string & name) const { - try - { - S3::URI uri(name); - } - catch (...) - { + if (!config.url_prefix.empty()) return (fs::path(config.url_prefix) / name).string(); - } return name; } @@ -181,6 +175,12 @@ StoragePtr DatabaseS3::tryGetTable(const String & name, ContextPtr context_) con } } +bool DatabaseS3::empty() const +{ + std::lock_guard lock(mutex); + return loaded_tables.empty(); +} + ASTPtr DatabaseS3::getCreateDatabaseQuery() const { auto settings = getContext()->getSettingsRef(); diff --git a/src/Databases/DatabaseS3.h b/src/Databases/DatabaseS3.h index 4e6910566df..f494925b09b 100644 --- a/src/Databases/DatabaseS3.h +++ b/src/Databases/DatabaseS3.h @@ -43,7 +43,10 @@ public: StoragePtr tryGetTable(const String & name, ContextPtr context) const override; - bool empty() const override { return true; } + // Contains only temporary tables + bool shouldBeEmptyOnDetach() const override { return false; } + + bool empty() const override; bool isReadOnly() const override { return true; } diff --git a/tests/queries/0_stateless/02724_database_s3.reference b/tests/queries/0_stateless/02724_database_s3.reference index 811e38b7f2b..425cca6a077 100644 --- a/tests/queries/0_stateless/02724_database_s3.reference +++ b/tests/queries/0_stateless/02724_database_s3.reference @@ -12,10 +12,6 @@ test1 13 14 15 16 17 18 0 0 0 -1 2 3 -4 5 6 -7 8 9 -0 0 0 10 11 12 13 14 15 16 17 18 diff --git a/tests/queries/0_stateless/02724_database_s3.sh b/tests/queries/0_stateless/02724_database_s3.sh index ac1b97beecf..79199b43571 100755 --- a/tests/queries/0_stateless/02724_database_s3.sh +++ b/tests/queries/0_stateless/02724_database_s3.sh @@ -32,12 +32,6 @@ USE test4; SELECT * FROM \"b.tsv\" """ -# check that database url_prefix is ignored if pass full url as table name -${CLICKHOUSE_CLIENT} --multiline --multiquery -q """ -USE test4; -SELECT * FROM \"http://localhost:11111/test/a.tsv\" -""" - # Check named collection loading ${CLICKHOUSE_CLIENT} --multiline --multiquery -q """ DROP DATABASE IF EXISTS test5; From 2419a7b90fd1effd8ebf8b5b4741a0325447cdec Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Sun, 11 Jun 2023 15:16:52 +0000 Subject: [PATCH 0761/1072] Fix tests --- .../00804_test_deflate_qpl_codec_compression.reference | 2 ++ .../00804_test_deflate_qpl_codec_compression.sql | 9 ++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/00804_test_deflate_qpl_codec_compression.reference b/tests/queries/0_stateless/00804_test_deflate_qpl_codec_compression.reference index 276747f8233..a2178f5eda7 100644 --- a/tests/queries/0_stateless/00804_test_deflate_qpl_codec_compression.reference +++ b/tests/queries/0_stateless/00804_test_deflate_qpl_codec_compression.reference @@ -1,4 +1,6 @@ +CREATE TABLE default.compression_codec\n(\n `id` UInt64 CODEC(DEFLATE_QPL),\n `data` String CODEC(DEFLATE_QPL),\n `ddd` Date CODEC(DEFLATE_QPL),\n `ddd32` Date32 CODEC(DEFLATE_QPL),\n `somenum` Float64 CODEC(DEFLATE_QPL),\n `somestr` FixedString(3) CODEC(DEFLATE_QPL),\n `othernum` Int64 CODEC(DEFLATE_QPL),\n `somearray` Array(UInt8) CODEC(DEFLATE_QPL),\n `somemap` Map(String, UInt32) CODEC(DEFLATE_QPL),\n `sometuple` Tuple(UInt16, UInt64) CODEC(DEFLATE_QPL)\n)\nENGINE = MergeTree\nORDER BY tuple()\nSETTINGS index_granularity = 8192 1 hello 2018-12-14 2018-12-14 1.1 aaa 5 [1,2,3] {'k1':1,'k2':2} (1,2) 2 world 2018-12-15 2018-12-15 2.2 bbb 6 [4,5,6] {'k3':3,'k4':4} (3,4) 3 ! 2018-12-16 2018-12-16 3.3 ccc 7 [7,8,9] {'k5':5,'k6':6} (5,6) 2 +10001 diff --git a/tests/queries/0_stateless/00804_test_deflate_qpl_codec_compression.sql b/tests/queries/0_stateless/00804_test_deflate_qpl_codec_compression.sql index 78c57013eeb..ff3c1812c86 100644 --- a/tests/queries/0_stateless/00804_test_deflate_qpl_codec_compression.sql +++ b/tests/queries/0_stateless/00804_test_deflate_qpl_codec_compression.sql @@ -2,7 +2,8 @@ -- no-fasttest because DEFLATE_QPL isn't available in fasttest -- no-cpu-aarch64 because DEFLATE_QPL is x86-only -SET send_logs_level = 'fatal'; +-- A bunch of random DDLs to test the DEFLATE_QPL codec. + SET enable_deflate_qpl_codec = 1; DROP TABLE IF EXISTS compression_codec; @@ -20,6 +21,8 @@ CREATE TABLE compression_codec( sometuple Tuple(UInt16, UInt64) CODEC(DEFLATE_QPL), ) ENGINE = MergeTree() ORDER BY tuple(); +SHOW CREATE TABLE compression_codec; + INSERT INTO compression_codec VALUES(1, 'hello', toDate('2018-12-14'), toDate32('2018-12-14'), 1.1, 'aaa', 5, [1,2,3], map('k1',1,'k2',2), tuple(1,2)); INSERT INTO compression_codec VALUES(2, 'world', toDate('2018-12-15'), toDate32('2018-12-15'), 2.2, 'bbb', 6, [4,5,6], map('k3',3,'k4',4), tuple(3,4)); INSERT INTO compression_codec VALUES(3, '!', toDate('2018-12-16'), toDate32('2018-12-16'), 3.3, 'ccc', 7, [7,8,9], map('k5',5,'k6',6), tuple(5,6)); @@ -35,4 +38,8 @@ ATTACH TABLE compression_codec; SELECT count(*) FROM compression_codec WHERE id = 2 GROUP BY id; +INSERT INTO compression_codec SELECT 3, '!', toDate('2018-12-16'), toDate32('2018-12-16'), 3.3, 'ccc', 7, [7,8,9], map('k5',5,'k6',6), tuple(5,6) FROM system.numbers LIMIT 10000; + +SELECT count(*) FROM compression_codec WHERE id = 3 GROUP BY id; + DROP TABLE IF EXISTS compression_codec; From 598501011f5cbedb42188b2f828c055d44a0fcd8 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 11 Jun 2023 17:51:54 +0200 Subject: [PATCH 0762/1072] Relax "too many parts" further --- programs/server/Server.cpp | 1 - src/Loggers/OwnPatternFormatter.cpp | 1 - src/Storages/MergeTree/MergeTreeData.cpp | 10 +++++----- src/Storages/MergeTree/MergeTreeData.h | 2 +- src/Storages/MergeTree/MergeTreeSettings.h | 6 +++--- src/Storages/MergeTree/MergeTreeSink.cpp | 9 +++++++-- src/Storages/MergeTree/MergeTreeSink.h | 3 ++- src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp | 9 +++++++-- src/Storages/MergeTree/ReplicatedMergeTreeSink.h | 1 + 9 files changed, 26 insertions(+), 16 deletions(-) diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index d0fc8aca5e8..cfef7f0a94a 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -1705,7 +1705,6 @@ try #endif /// Must be done after initialization of `servers`, because async_metrics will access `servers` variable from its thread. - async_metrics.start(); { diff --git a/src/Loggers/OwnPatternFormatter.cpp b/src/Loggers/OwnPatternFormatter.cpp index ccf6c479b80..0c2256aaa1b 100644 --- a/src/Loggers/OwnPatternFormatter.cpp +++ b/src/Loggers/OwnPatternFormatter.cpp @@ -4,7 +4,6 @@ #include #include #include -#include #include diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 9cca471fddb..b42d130bf62 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -4315,14 +4315,14 @@ std::optional MergeTreeData::getMinPartDataVersion() const } -void MergeTreeData::delayInsertOrThrowIfNeeded(Poco::Event * until, const ContextPtr & query_context) const +void MergeTreeData::delayInsertOrThrowIfNeeded(Poco::Event * until, const ContextPtr & query_context, bool allow_throw) const { const auto settings = getSettings(); const auto & query_settings = query_context->getSettingsRef(); const size_t parts_count_in_total = getActivePartsCount(); - /// check if have too many parts in total - if (parts_count_in_total >= settings->max_parts_in_total) + /// Check if we have too many parts in total + if (allow_throw && parts_count_in_total >= settings->max_parts_in_total) { ProfileEvents::increment(ProfileEvents::RejectedInserts); throw Exception( @@ -4338,7 +4338,7 @@ void MergeTreeData::delayInsertOrThrowIfNeeded(Poco::Event * until, const Contex if (settings->inactive_parts_to_throw_insert > 0 || settings->inactive_parts_to_delay_insert > 0) outdated_parts_count_in_partition = getMaxOutdatedPartsCountForPartition(); - if (settings->inactive_parts_to_throw_insert > 0 && outdated_parts_count_in_partition >= settings->inactive_parts_to_throw_insert) + if (allow_throw && settings->inactive_parts_to_throw_insert > 0 && outdated_parts_count_in_partition >= settings->inactive_parts_to_throw_insert) { ProfileEvents::increment(ProfileEvents::RejectedInserts); throw Exception( @@ -4362,7 +4362,7 @@ void MergeTreeData::delayInsertOrThrowIfNeeded(Poco::Event * until, const Contex bool parts_are_large_enough_in_average = settings->max_avg_part_size_for_too_many_parts && average_part_size > settings->max_avg_part_size_for_too_many_parts; - if (parts_count_in_partition >= active_parts_to_throw_insert && !parts_are_large_enough_in_average) + if (allow_throw && parts_count_in_partition >= active_parts_to_throw_insert && !parts_are_large_enough_in_average) { ProfileEvents::increment(ProfileEvents::RejectedInserts); throw Exception( diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index b1e1e43bd0b..ebda82eeaed 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -557,7 +557,7 @@ public: /// If the table contains too many active parts, sleep for a while to give them time to merge. /// If until is non-null, wake up from the sleep earlier if the event happened. /// The decision to delay or throw is made according to settings 'parts_to_delay_insert' and 'parts_to_throw_insert'. - void delayInsertOrThrowIfNeeded(Poco::Event * until, const ContextPtr & query_context) const; + void delayInsertOrThrowIfNeeded(Poco::Event * until, const ContextPtr & query_context, bool allow_throw) const; /// If the table contains too many unfinished mutations, sleep for a while to give them time to execute. /// If until is non-null, wake up from the sleep earlier if the event happened. diff --git a/src/Storages/MergeTree/MergeTreeSettings.h b/src/Storages/MergeTree/MergeTreeSettings.h index 33aea358078..082b84be575 100644 --- a/src/Storages/MergeTree/MergeTreeSettings.h +++ b/src/Storages/MergeTree/MergeTreeSettings.h @@ -73,11 +73,11 @@ struct Settings; M(UInt64, max_delay_to_mutate_ms, 1000, "Max delay of mutating MergeTree table in milliseconds, if there are a lot of unfinished mutations", 0) \ \ /** Inserts settings. */ \ - M(UInt64, parts_to_delay_insert, 150, "If table contains at least that many active parts in single partition, artificially slow down insert into table. Disabled if set to 0", 0) \ + M(UInt64, parts_to_delay_insert, 1000, "If table contains at least that many active parts in single partition, artificially slow down insert into table. Disabled if set to 0", 0) \ M(UInt64, inactive_parts_to_delay_insert, 0, "If table contains at least that many inactive parts in single partition, artificially slow down insert into table.", 0) \ - M(UInt64, parts_to_throw_insert, 300, "If more than this number active parts in single partition, throw 'Too many parts ...' exception.", 0) \ + M(UInt64, parts_to_throw_insert, 3000, "If more than this number active parts in single partition, throw 'Too many parts ...' exception.", 0) \ M(UInt64, inactive_parts_to_throw_insert, 0, "If more than this number inactive parts in single partition, throw 'Too many inactive parts ...' exception.", 0) \ - M(UInt64, max_avg_part_size_for_too_many_parts, 10ULL * 1024 * 1024 * 1024, "The 'too many parts' check according to 'parts_to_delay_insert' and 'parts_to_throw_insert' will be active only if the average part size (in the relevant partition) is not larger than the specified threshold. If it is larger than the specified threshold, the INSERTs will be neither delayed or rejected. This allows to have hundreds of terabytes in a single table on a single server if the parts are successfully merged to larger parts. This does not affect the thresholds on inactive parts or total parts.", 0) \ + M(UInt64, max_avg_part_size_for_too_many_parts, 1ULL * 1024 * 1024 * 1024, "The 'too many parts' check according to 'parts_to_delay_insert' and 'parts_to_throw_insert' will be active only if the average part size (in the relevant partition) is not larger than the specified threshold. If it is larger than the specified threshold, the INSERTs will be neither delayed or rejected. This allows to have hundreds of terabytes in a single table on a single server if the parts are successfully merged to larger parts. This does not affect the thresholds on inactive parts or total parts.", 0) \ M(UInt64, max_delay_to_insert, 1, "Max delay of inserting data into MergeTree table in seconds, if there are a lot of unmerged parts in single partition.", 0) \ M(UInt64, min_delay_to_insert_ms, 10, "Min delay of inserting data into MergeTree table in milliseconds, if there are a lot of unmerged parts in single partition.", 0) \ M(UInt64, max_parts_in_total, 100000, "If more than this number active parts in all partitions in total, throw 'Too many parts ...' exception.", 0) \ diff --git a/src/Storages/MergeTree/MergeTreeSink.cpp b/src/Storages/MergeTree/MergeTreeSink.cpp index d62fe5024f4..36816904a81 100644 --- a/src/Storages/MergeTree/MergeTreeSink.cpp +++ b/src/Storages/MergeTree/MergeTreeSink.cpp @@ -45,9 +45,9 @@ MergeTreeSink::MergeTreeSink( void MergeTreeSink::onStart() { - /// Only check "too many parts" before write, + /// It's only allowed to throw "too many parts" before write, /// because interrupting long-running INSERT query in the middle is not convenient for users. - storage.delayInsertOrThrowIfNeeded(nullptr, context); + storage.delayInsertOrThrowIfNeeded(nullptr, context, true); } void MergeTreeSink::onFinish() @@ -57,6 +57,9 @@ void MergeTreeSink::onFinish() void MergeTreeSink::consume(Chunk chunk) { + if (num_blocks_processed > 0) + storage.delayInsertOrThrowIfNeeded(nullptr, context, false); + auto block = getHeader().cloneWithColumns(chunk.detachColumns()); if (!storage_snapshot->object_columns.empty()) convertDynamicColumnsToTuples(block, storage_snapshot); @@ -136,6 +139,8 @@ void MergeTreeSink::consume(Chunk chunk) finishDelayedChunk(); delayed_chunk = std::make_unique(); delayed_chunk->partitions = std::move(partitions); + + ++num_blocks_processed; } void MergeTreeSink::finishDelayedChunk() diff --git a/src/Storages/MergeTree/MergeTreeSink.h b/src/Storages/MergeTree/MergeTreeSink.h index 68f11d86a25..07ab3850df2 100644 --- a/src/Storages/MergeTree/MergeTreeSink.h +++ b/src/Storages/MergeTree/MergeTreeSink.h @@ -35,7 +35,8 @@ private: size_t max_parts_per_block; ContextPtr context; StorageSnapshotPtr storage_snapshot; - uint64_t chunk_dedup_seqnum = 0; /// input chunk ordinal number in case of dedup token + UInt64 chunk_dedup_seqnum = 0; /// input chunk ordinal number in case of dedup token + UInt64 num_blocks_processed = 0; /// We can delay processing for previous chunk and start writing a new one. struct DelayedChunk; diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp index 28dad454afe..5fbd72ccddc 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp @@ -367,6 +367,9 @@ size_t ReplicatedMergeTreeSinkImpl::checkQuorumPrecondition(const template void ReplicatedMergeTreeSinkImpl::consume(Chunk chunk) { + if (num_blocks_processed > 0) + storage.delayInsertOrThrowIfNeeded(&storage.partial_shutdown_event, context, false); + auto block = getHeader().cloneWithColumns(chunk.detachColumns()); const auto & settings = context->getSettingsRef(); @@ -512,6 +515,8 @@ void ReplicatedMergeTreeSinkImpl::consume(Chunk chunk) /// TODO: we can also delay commit if there is no MVs. if (!settings.deduplicate_blocks_in_dependent_materialized_views) finishDelayedChunk(zookeeper); + + ++num_blocks_processed; } template<> @@ -1136,9 +1141,9 @@ std::pair, bool> ReplicatedMergeTreeSinkImpl:: template void ReplicatedMergeTreeSinkImpl::onStart() { - /// Only check "too many parts" before write, + /// It's only allowed to throw "too many parts" before write, /// because interrupting long-running INSERT query in the middle is not convenient for users. - storage.delayInsertOrThrowIfNeeded(&storage.partial_shutdown_event, context); + storage.delayInsertOrThrowIfNeeded(&storage.partial_shutdown_event, context, true); } template diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeSink.h b/src/Storages/MergeTree/ReplicatedMergeTreeSink.h index 8d9e2e14129..868590efa25 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeSink.h +++ b/src/Storages/MergeTree/ReplicatedMergeTreeSink.h @@ -123,6 +123,7 @@ private: bool quorum_parallel = false; const bool deduplicate = true; bool last_block_is_duplicate = false; + UInt64 num_blocks_processed = 0; using Logger = Poco::Logger; Poco::Logger * log; From 61fa6944145f851e156943db678e0657a0d6fb42 Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Sun, 11 Jun 2023 19:03:57 +0000 Subject: [PATCH 0763/1072] Fix boundery -> boundary in docs --- .../sql-reference/functions/type-conversion-functions.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 28db7e6e677..9258b6d6026 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -317,7 +317,7 @@ SELECT ## toDateOrZero -The same as [toDate](#todate) but returns lower boundery of [Date](/docs/en/sql-reference/data-types/date.md) if an invalid argument is received. Only [String](/docs/en/sql-reference/data-types/string.md) argument is supported. +The same as [toDate](#todate) but returns lower boundary of [Date](/docs/en/sql-reference/data-types/date.md) if an invalid argument is received. Only [String](/docs/en/sql-reference/data-types/string.md) argument is supported. **Example** @@ -359,7 +359,7 @@ Result: ## toDateOrDefault -Like [toDate](#todate) but if unsuccessful, returns a default value which is either the second argument (if specified), or otherwise the lower boundery of [Date](/docs/en/sql-reference/data-types/date.md). +Like [toDate](#todate) but if unsuccessful, returns a default value which is either the second argument (if specified), or otherwise the lower boundary of [Date](/docs/en/sql-reference/data-types/date.md). **Syntax** @@ -424,7 +424,7 @@ Result: ## toDateTimeOrZero -The same as [toDateTime](#todatetime) but returns lower boundery of [DateTime](/docs/en/sql-reference/data-types/datetime.md) if an invalid argument is received. Only [String](/docs/en/sql-reference/data-types/string.md) argument is supported. +The same as [toDateTime](#todatetime) but returns lower boundary of [DateTime](/docs/en/sql-reference/data-types/datetime.md) if an invalid argument is received. Only [String](/docs/en/sql-reference/data-types/string.md) argument is supported. **Example** @@ -466,7 +466,7 @@ Result: ## toDateTimeOrDefault -Like [toDateTime](#todatetime) but if unsuccessful, returns a default value which is either the third argument (if specified), or otherwise the lower boundery of [DateTime](/docs/en/sql-reference/data-types/datetime.md). +Like [toDateTime](#todatetime) but if unsuccessful, returns a default value which is either the third argument (if specified), or otherwise the lower boundary of [DateTime](/docs/en/sql-reference/data-types/datetime.md). **Syntax** From 3797a4202cb3da6bf20684149d24931e72fbd239 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Sun, 11 Jun 2023 23:38:31 +0300 Subject: [PATCH 0764/1072] Update MergeTreeData.cpp --- src/Storages/MergeTree/MergeTreeData.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 23351423d49..8a69c8ff75c 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -1252,6 +1252,10 @@ MergeTreeData::LoadPartResult MergeTreeData::loadDataPart( mark_broken(); return res; } + catch (const Poco::NetException &) + { + throw; + } catch (const Poco::TimeoutException &) { throw; From 7bcaf8b233e91a11fe0d82daaf265d20c8279906 Mon Sep 17 00:00:00 2001 From: JackyWoo Date: Mon, 12 Jun 2023 10:15:32 +0800 Subject: [PATCH 0765/1072] fix build error --- src/Storages/StorageRedis.cpp | 6 +++++- src/Storages/StorageRedis.h | 3 ++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/Storages/StorageRedis.cpp b/src/Storages/StorageRedis.cpp index 97f1dbce6da..68c71cac508 100644 --- a/src/Storages/StorageRedis.cpp +++ b/src/Storages/StorageRedis.cpp @@ -461,7 +461,11 @@ Block StorageRedis::getSampleBlock(const Names &) const return getInMemoryMetadataPtr()->getSampleBlock(); } -SinkToStoragePtr StorageRedis::write(const ASTPtr & /*query*/, const StorageMetadataPtr & metadata_snapshot, ContextPtr /*context*/) +SinkToStoragePtr StorageRedis::write( + const ASTPtr & /*query*/, + const StorageMetadataPtr & metadata_snapshot, + ContextPtr /*context*/, + bool /*async_insert*/) { return std::make_shared(*this, metadata_snapshot); } diff --git a/src/Storages/StorageRedis.h b/src/Storages/StorageRedis.h index a4ab9a6aa4e..a525a4ed7de 100644 --- a/src/Storages/StorageRedis.h +++ b/src/Storages/StorageRedis.h @@ -36,7 +36,8 @@ public: SinkToStoragePtr write( const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, - ContextPtr context) override; + ContextPtr context, + bool /*async_insert*/) override; void truncate(const ASTPtr &, const StorageMetadataPtr & metadata_snapshot, From a9d4d5194972daa1dcecf80b388c6ccb127f92d0 Mon Sep 17 00:00:00 2001 From: JackyWoo Date: Mon, 12 Jun 2023 10:16:02 +0800 Subject: [PATCH 0766/1072] add word redis to aspell-dict --- utils/check-style/aspell-ignore/en/aspell-dict.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt index a01b67b26b1..e594962ec44 100644 --- a/utils/check-style/aspell-ignore/en/aspell-dict.txt +++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt @@ -2044,6 +2044,7 @@ reconnection recurse redash reddit +redis redisstreams refcounter regexpExtract From ef40c029a5617b9abfee6fb4525de9aeca2fcf73 Mon Sep 17 00:00:00 2001 From: JackyWoo Date: Mon, 12 Jun 2023 11:54:42 +0800 Subject: [PATCH 0767/1072] fix style --- src/Storages/StorageRedis.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/StorageRedis.cpp b/src/Storages/StorageRedis.cpp index 68c71cac508..71c84443d8e 100644 --- a/src/Storages/StorageRedis.cpp +++ b/src/Storages/StorageRedis.cpp @@ -462,7 +462,7 @@ Block StorageRedis::getSampleBlock(const Names &) const } SinkToStoragePtr StorageRedis::write( - const ASTPtr & /*query*/, + const ASTPtr & /*query*/, const StorageMetadataPtr & metadata_snapshot, ContextPtr /*context*/, bool /*async_insert*/) From 911f8ad8dc68d126cedebd3e990ea185ed3c41b1 Mon Sep 17 00:00:00 2001 From: kevinyhzou Date: Mon, 12 Jun 2023 11:57:52 +0800 Subject: [PATCH 0768/1072] use whitespace or tab as field delimiter --- docs/en/interfaces/formats.md | 1 + .../operations/settings/settings-formats.md | 32 +++++++++++++++++ src/Core/Settings.h | 2 +- src/Formats/FormatFactory.cpp | 2 +- src/Formats/FormatSettings.h | 2 +- .../Formats/Impl/CSVRowInputFormat.cpp | 34 +++++++++---------- ...h_whitespace_tab_field_delimiter.reference | 4 +-- ...ext_with_whitespace_tab_field_delimiter.sh | 4 +-- 8 files changed, 57 insertions(+), 24 deletions(-) diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index f19fd94dcd8..57962c1d730 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -468,6 +468,7 @@ The CSV format supports the output of totals and extremes the same way as `TabSe - [input_format_csv_skip_first_lines](/docs/en/operations/settings/settings-formats.md/#input_format_csv_skip_first_lines) - skip the specified number of lines at the beginning of data. Default value - `0`. - [input_format_csv_detect_header](/docs/en/operations/settings/settings-formats.md/#input_format_csv_detect_header) - automatically detect header with names and types in CSV format. Default value - `true`. - [input_format_csv_trim_whitespaces](/docs/en/operations/settings/settings-formats.md/#input_format_csv_trim_whitespaces) - trim spaces and tabs in non-quoted CSV strings. Default value - `true`. +- [input_format_csv_use_whitespace_tab_as_delimiter](/docs/en/operations/settings/settings-formats.md/# input_format_csv_use_whitespace_tab_as_delimiter) - use whitespace or tab as field delimiter in CSV strings. Default value - `false`. ## CSVWithNames {#csvwithnames} diff --git a/docs/en/operations/settings/settings-formats.md b/docs/en/operations/settings/settings-formats.md index 025e9f889f3..0e30c8f319e 100644 --- a/docs/en/operations/settings/settings-formats.md +++ b/docs/en/operations/settings/settings-formats.md @@ -914,6 +914,38 @@ Result " string " ``` +### input_format_csv_use_whitespace_tab_as_delimiter {#input_format_csv_use_whitespace_tab_as_delimiter} + +Use whitespace or tab as field delimiter in CSV strings. + +Default value: `false`. + +**Examples** + +Query + +```bash +echo 'a b' | ./clickhouse local -q "select * from table FORMAT CSV" --input-format="CSV" --input_format_csv_use_whitespace_tab_as_delimiter=true --format_csv_delimiter=' ' +``` + +Result + +```text +a b +``` + +Query + +```bash +echo 'a b' | ./clickhouse local -q "select * from table FORMAT CSV" --input-format="CSV" --input_format_csv_use_whitespace_tab_as_delimiter=true --format_csv_delimiter='\t' +``` + +Result + +```text +a b +``` + ## Values format settings {#values-format-settings} ### input_format_values_interpret_expressions {#input_format_values_interpret_expressions} diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 45641e76689..4306ac855a3 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -850,7 +850,7 @@ class IColumn; M(Bool, input_format_csv_use_best_effort_in_schema_inference, true, "Use some tweaks and heuristics to infer schema in CSV format", 0) \ M(Bool, input_format_tsv_use_best_effort_in_schema_inference, true, "Use some tweaks and heuristics to infer schema in TSV format", 0) \ M(Bool, input_format_csv_detect_header, true, "Automatically detect header with names and types in CSV format", 0) \ - M(Bool, input_format_csv_skip_whitespaces_tabs, true, "Skips spaces and tabs(\\t) characters in the CSV strings", 0) \ + M(Bool, input_format_csv_use_whitespace_tab_as_delimiter, false, "Use spaces and tabs(\\t) as field delimiter in the CSV strings", 0) \ M(Bool, input_format_csv_trim_whitespaces, true, "Trims spaces and tabs (\\t) characters at the beginning and end in CSV strings", 0) \ M(Bool, input_format_tsv_detect_header, true, "Automatically detect header with names and types in TSV format", 0) \ M(Bool, input_format_custom_detect_header, true, "Automatically detect header with names and types in CustomSeparated format", 0) \ diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index 73a7d4f73f2..33ecddfc223 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -70,7 +70,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.csv.skip_first_lines = settings.input_format_csv_skip_first_lines; format_settings.csv.try_detect_header = settings.input_format_csv_detect_header; format_settings.csv.trim_whitespaces = settings.input_format_csv_trim_whitespaces; - format_settings.csv.skip_whitespaces_tabs = settings.input_format_csv_skip_whitespaces_tabs; + format_settings.csv.use_whitespace_tab_as_delimiter = settings.input_format_csv_use_whitespace_tab_as_delimiter; format_settings.hive_text.fields_delimiter = settings.input_format_hive_text_fields_delimiter; format_settings.hive_text.collection_items_delimiter = settings.input_format_hive_text_collection_items_delimiter; format_settings.hive_text.map_keys_delimiter = settings.input_format_hive_text_map_keys_delimiter; diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index 434389e31a1..72d60e8423e 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -137,7 +137,7 @@ struct FormatSettings String custom_delimiter; bool try_detect_header = true; bool trim_whitespaces = true; - bool skip_whitespaces_tabs = true; + bool use_whitespace_tab_as_delimiter = false; } csv; struct HiveText diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp index 4094285e1ad..b8d3413f863 100644 --- a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp @@ -25,10 +25,10 @@ namespace ErrorCodes namespace { - void checkBadDelimiter(char delimiter, bool skip_whitespaces_tabs) + void checkBadDelimiter(char delimiter, bool use_whitespace_tab_as_delimiter) { constexpr std::string_view bad_delimiters = " \t\"'.UL"; - if (bad_delimiters.find(delimiter) != std::string_view::npos && skip_whitespaces_tabs) + if (bad_delimiters.find(delimiter) != std::string_view::npos && !use_whitespace_tab_as_delimiter) throw Exception( ErrorCodes::BAD_ARGUMENTS, "CSV format may not work correctly with delimiter '{}'. Try use CustomSeparated format instead", @@ -68,7 +68,7 @@ CSVRowInputFormat::CSVRowInputFormat( format_settings_.csv.try_detect_header), buf(std::move(in_)) { - checkBadDelimiter(format_settings_.csv.delimiter, format_settings_.csv.skip_whitespaces_tabs); + checkBadDelimiter(format_settings_.csv.delimiter, format_settings_.csv.use_whitespace_tab_as_delimiter); } CSVRowInputFormat::CSVRowInputFormat( @@ -90,7 +90,7 @@ CSVRowInputFormat::CSVRowInputFormat( format_settings_.csv.try_detect_header), buf(std::move(in_)) { - checkBadDelimiter(format_settings_.csv.delimiter, format_settings_.csv.skip_whitespaces_tabs); + checkBadDelimiter(format_settings_.csv.delimiter, format_settings_.csv.use_whitespace_tab_as_delimiter); } void CSVRowInputFormat::syncAfterError() @@ -134,9 +134,9 @@ static void skipEndOfLine(ReadBuffer & in) } /// Skip `whitespace` symbols allowed in CSV. -static inline void skipWhitespacesAndTabs(ReadBuffer & in, const bool & skip_whitespaces_tabs) +static inline void skipWhitespacesAndTabs(ReadBuffer & in, const bool & use_whitespace_tab_as_delimiter) { - if (!skip_whitespaces_tabs) + if (use_whitespace_tab_as_delimiter) { return; } @@ -150,7 +150,7 @@ CSVFormatReader::CSVFormatReader(PeekableReadBuffer & buf_, const FormatSettings void CSVFormatReader::skipFieldDelimiter() { - skipWhitespacesAndTabs(*buf, format_settings.csv.skip_whitespaces_tabs); + skipWhitespacesAndTabs(*buf, format_settings.csv.use_whitespace_tab_as_delimiter); assertChar(format_settings.csv.delimiter, *buf); } @@ -158,7 +158,7 @@ template String CSVFormatReader::readCSVFieldIntoString() { if (format_settings.csv.trim_whitespaces) [[likely]] - skipWhitespacesAndTabs(*buf, format_settings.csv.skip_whitespaces_tabs); + skipWhitespacesAndTabs(*buf, format_settings.csv.use_whitespace_tab_as_delimiter); String field; if constexpr (read_string) @@ -170,14 +170,14 @@ String CSVFormatReader::readCSVFieldIntoString() void CSVFormatReader::skipField() { - skipWhitespacesAndTabs(*buf, format_settings.csv.skip_whitespaces_tabs); + skipWhitespacesAndTabs(*buf, format_settings.csv.use_whitespace_tab_as_delimiter); NullOutput out; readCSVStringInto(out, *buf, format_settings.csv); } void CSVFormatReader::skipRowEndDelimiter() { - skipWhitespacesAndTabs(*buf, format_settings.csv.skip_whitespaces_tabs); + skipWhitespacesAndTabs(*buf, format_settings.csv.use_whitespace_tab_as_delimiter); if (buf->eof()) return; @@ -186,7 +186,7 @@ void CSVFormatReader::skipRowEndDelimiter() if (*buf->position() == format_settings.csv.delimiter) ++buf->position(); - skipWhitespacesAndTabs(*buf, format_settings.csv.skip_whitespaces_tabs); + skipWhitespacesAndTabs(*buf, format_settings.csv.use_whitespace_tab_as_delimiter); if (buf->eof()) return; @@ -198,7 +198,7 @@ void CSVFormatReader::skipHeaderRow() do { skipField(); - skipWhitespacesAndTabs(*buf, format_settings.csv.skip_whitespaces_tabs); + skipWhitespacesAndTabs(*buf, format_settings.csv.use_whitespace_tab_as_delimiter); } while (checkChar(format_settings.csv.delimiter, *buf)); skipRowEndDelimiter(); @@ -211,7 +211,7 @@ std::vector CSVFormatReader::readRowImpl() do { fields.push_back(readCSVFieldIntoString()); - skipWhitespacesAndTabs(*buf, format_settings.csv.skip_whitespaces_tabs); + skipWhitespacesAndTabs(*buf, format_settings.csv.use_whitespace_tab_as_delimiter); } while (checkChar(format_settings.csv.delimiter, *buf)); skipRowEndDelimiter(); @@ -224,7 +224,7 @@ bool CSVFormatReader::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) try { - skipWhitespacesAndTabs(*buf, format_settings.csv.skip_whitespaces_tabs); + skipWhitespacesAndTabs(*buf, format_settings.csv.use_whitespace_tab_as_delimiter); assertChar(delimiter, *buf); } catch (const DB::Exception &) @@ -250,7 +250,7 @@ bool CSVFormatReader::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) bool CSVFormatReader::parseRowEndWithDiagnosticInfo(WriteBuffer & out) { - skipWhitespacesAndTabs(*buf, format_settings.csv.skip_whitespaces_tabs); + skipWhitespacesAndTabs(*buf, format_settings.csv.use_whitespace_tab_as_delimiter); if (buf->eof()) return true; @@ -259,7 +259,7 @@ bool CSVFormatReader::parseRowEndWithDiagnosticInfo(WriteBuffer & out) if (*buf->position() == format_settings.csv.delimiter) { ++buf->position(); - skipWhitespacesAndTabs(*buf, format_settings.csv.skip_whitespaces_tabs); + skipWhitespacesAndTabs(*buf, format_settings.csv.use_whitespace_tab_as_delimiter); if (buf->eof()) return true; } @@ -287,7 +287,7 @@ bool CSVFormatReader::readField( const String & /*column_name*/) { if (format_settings.csv.trim_whitespaces || !isStringOrFixedString(removeNullable(type))) [[likely]] - skipWhitespacesAndTabs(*buf, format_settings.csv.skip_whitespaces_tabs); + skipWhitespacesAndTabs(*buf, format_settings.csv.use_whitespace_tab_as_delimiter); const bool at_delimiter = !buf->eof() && *buf->position() == format_settings.csv.delimiter; const bool at_last_column_line_end = is_last_file_column && (buf->eof() || *buf->position() == '\n' || *buf->position() == '\r'); diff --git a/tests/queries/0_stateless/02785_text_with_whitespace_tab_field_delimiter.reference b/tests/queries/0_stateless/02785_text_with_whitespace_tab_field_delimiter.reference index 531391394a7..228436130dc 100644 --- a/tests/queries/0_stateless/02785_text_with_whitespace_tab_field_delimiter.reference +++ b/tests/queries/0_stateless/02785_text_with_whitespace_tab_field_delimiter.reference @@ -1,2 +1,2 @@ -1 a b -2 c d +1 a b +2 c d diff --git a/tests/queries/0_stateless/02785_text_with_whitespace_tab_field_delimiter.sh b/tests/queries/0_stateless/02785_text_with_whitespace_tab_field_delimiter.sh index e3f61262674..deb6e317aac 100755 --- a/tests/queries/0_stateless/02785_text_with_whitespace_tab_field_delimiter.sh +++ b/tests/queries/0_stateless/02785_text_with_whitespace_tab_field_delimiter.sh @@ -10,8 +10,8 @@ $CLICKHOUSE_CLIENT -q "drop table if exists test_whitespace" $CLICKHOUSE_CLIENT -q "drop table if exists test_tab" $CLICKHOUSE_CLIENT -q "create table test_whitespace (x UInt32, y String, z String) engine=MergeTree order by x" $CLICKHOUSE_CLIENT -q "create table test_tab (x UInt32, y String, z String) engine=MergeTree order by x" -cat $CURDIR/data_csv/csv_with_space_delimiter.csv | ${CLICKHOUSE_CLIENT} -q "INSERT INTO test_whitespace SETTINGS format_csv_delimiter=' ', input_format_csv_skip_whitespaces_tabs=false FORMAT CSV" -cat $CURDIR/data_csv/csv_with_tab_delimiter.csv | ${CLICKHOUSE_CLIENT} -q "INSERT INTO test_tab SETTINGS format_csv_delimiter='\t', input_format_csv_skip_whitespaces_tabs=false FORMAT CSV" +cat $CURDIR/data_csv/csv_with_space_delimiter.csv | ${CLICKHOUSE_CLIENT} -q "INSERT INTO test_whitespace SETTINGS format_csv_delimiter=' ', input_format_csv_use_whitespace_tab_as_delimiter=true FORMAT CSV" +cat $CURDIR/data_csv/csv_with_tab_delimiter.csv | ${CLICKHOUSE_CLIENT} -q "INSERT INTO test_tab SETTINGS format_csv_delimiter='\t', input_format_csv_use_whitespace_tab_as_delimiter=true FORMAT CSV" $CLICKHOUSE_CLIENT -q "select * from test_whitespace" $CLICKHOUSE_CLIENT -q "select * from test_tab" $CLICKHOUSE_CLIENT -q "drop table test_whitespace" From 1af062a53214168345838f79cba53ecb11cbc41e Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 12 Jun 2023 08:04:55 +0000 Subject: [PATCH 0769/1072] Un-flake 00804_test_deflate_qpl_codec_compression --- .../0_stateless/00804_test_deflate_qpl_codec_compression.sql | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/queries/0_stateless/00804_test_deflate_qpl_codec_compression.sql b/tests/queries/0_stateless/00804_test_deflate_qpl_codec_compression.sql index ff3c1812c86..8a256567e80 100644 --- a/tests/queries/0_stateless/00804_test_deflate_qpl_codec_compression.sql +++ b/tests/queries/0_stateless/00804_test_deflate_qpl_codec_compression.sql @@ -6,6 +6,10 @@ SET enable_deflate_qpl_codec = 1; +-- Suppress test failures because stderr contains warning "Initialization of hardware-assisted DeflateQpl failed, falling +-- back to software DeflateQpl coded." +SET send_logs_level = 'fatal'; + DROP TABLE IF EXISTS compression_codec; CREATE TABLE compression_codec( From c378c3fcbbb678e96f5bc11958295f4dd1b4b6ba Mon Sep 17 00:00:00 2001 From: Julian Maicher Date: Mon, 12 Jun 2023 10:29:46 +0200 Subject: [PATCH 0770/1072] Fix type of LDAP server params hash in cache entry In 1ed7ad57d91db198ca94085a2c56372fb813543a, we switched from (`size_t`, usually 64bit) to SipHash (128bit) and forgot to change the type of the cache entry. This broke the caching of successful LDAP authentication requests (verification cooldown). Fixes #50864 --- src/Access/ExternalAuthenticators.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Access/ExternalAuthenticators.h b/src/Access/ExternalAuthenticators.h index bf928c18d5b..7b47c9351fd 100644 --- a/src/Access/ExternalAuthenticators.h +++ b/src/Access/ExternalAuthenticators.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -42,7 +43,7 @@ public: private: struct LDAPCacheEntry { - std::size_t last_successful_params_hash = 0; + UInt128 last_successful_params_hash = 0; std::chrono::steady_clock::time_point last_successful_authentication_timestamp; LDAPClient::SearchResultsList last_successful_role_search_results; }; From 6b2c33b1e478f175bec55e41a7ca8054807bb4fa Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 12 Jun 2023 09:16:22 +0000 Subject: [PATCH 0771/1072] Document x86 / ARM prerequisites for Docker image --- docker/server/README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docker/server/README.md b/docker/server/README.md index e6c9ee51fa7..46d30f252b4 100644 --- a/docker/server/README.md +++ b/docker/server/README.md @@ -16,6 +16,10 @@ For more information and documentation see https://clickhouse.com/. - The tag `head` is built from the latest commit to the default branch. - Each tag has optional `-alpine` suffix to reflect that it's built on top of `alpine`. +Compatibility +- The amd64 image requires support for [SSE3 instructions](https://en.wikipedia.org/wiki/SSE3). Virtually all x86 CPUs after 2005 support SSE3. +- The arm64 image requires support for the [ARMv8.2-A architecture](https://en.wikipedia.org/wiki/AArch64#ARMv8.2-A). Most ARM CPUs after 2017 support ARMv8.2-A. A notable exception is Raspberry Pi 4 from 2019 whose CPU only supports ARMv8.0-A. + ## How to use this image ### start server instance From a7408170c8bce6be3d4849fc3614834d8f646298 Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Mon, 12 Jun 2023 11:21:43 +0200 Subject: [PATCH 0772/1072] Use H3 --- docker/server/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docker/server/README.md b/docker/server/README.md index 46d30f252b4..67646a262f5 100644 --- a/docker/server/README.md +++ b/docker/server/README.md @@ -16,7 +16,8 @@ For more information and documentation see https://clickhouse.com/. - The tag `head` is built from the latest commit to the default branch. - Each tag has optional `-alpine` suffix to reflect that it's built on top of `alpine`. -Compatibility +### Compatibility + - The amd64 image requires support for [SSE3 instructions](https://en.wikipedia.org/wiki/SSE3). Virtually all x86 CPUs after 2005 support SSE3. - The arm64 image requires support for the [ARMv8.2-A architecture](https://en.wikipedia.org/wiki/AArch64#ARMv8.2-A). Most ARM CPUs after 2017 support ARMv8.2-A. A notable exception is Raspberry Pi 4 from 2019 whose CPU only supports ARMv8.0-A. From 676ba2fbde78ec9ada09a45c0453e0cd96a3ab01 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Mon, 12 Jun 2023 12:30:38 +0300 Subject: [PATCH 0773/1072] Update MergeTreeData.cpp --- src/Storages/MergeTree/MergeTreeData.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 8a69c8ff75c..ee06056985a 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -99,6 +99,7 @@ #include #include +#include template <> struct fmt::formatter : fmt::formatter From c85344f83b8c1568e67ef45fdcb55b0ec0c07a8b Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Mon, 12 Jun 2023 10:02:44 +0000 Subject: [PATCH 0774/1072] Try to fix test (becouse timezone randomization) --- .../0_stateless/01746_convert_type_with_default.reference | 2 +- tests/queries/0_stateless/01746_convert_type_with_default.sql | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/01746_convert_type_with_default.reference b/tests/queries/0_stateless/01746_convert_type_with_default.reference index 541580d67f5..e5aa42e6116 100644 --- a/tests/queries/0_stateless/01746_convert_type_with_default.reference +++ b/tests/queries/0_stateless/01746_convert_type_with_default.reference @@ -40,7 +40,7 @@ 1970-01-20 1970-01-20 2149-06-06 -1970-01-01 +1970-01-02 2023-05-30 2023-05-30 2023-05-30 14:38:20 diff --git a/tests/queries/0_stateless/01746_convert_type_with_default.sql b/tests/queries/0_stateless/01746_convert_type_with_default.sql index 2620780cfb9..e6e420ae4c0 100644 --- a/tests/queries/0_stateless/01746_convert_type_with_default.sql +++ b/tests/queries/0_stateless/01746_convert_type_with_default.sql @@ -54,7 +54,7 @@ select toDateOrDefault(cast(19 as Int256)); select toDateOrDefault(cast(19 as UInt256)); select toDateOrDefault(65535); -select toDateOrDefault(65536); +select toDateOrDefault(122400); select toDateOrDefault(19507, '2000-01-01'::Date); select toDateOrDefault(-1, '2023-05-30'::Date); From 5cec4c3161b84e32341ef723dc8cea2b38343b69 Mon Sep 17 00:00:00 2001 From: avogar Date: Mon, 12 Jun 2023 11:34:40 +0000 Subject: [PATCH 0775/1072] Fallback to parsing big integer from String instead of exception in Parquet format --- src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp | 9 +++------ .../02786_parquet_big_integer_compatibility.reference | 1 + .../02786_parquet_big_integer_compatibility.sh | 9 +++++++++ 3 files changed, 13 insertions(+), 6 deletions(-) create mode 100644 tests/queries/0_stateless/02786_parquet_big_integer_compatibility.reference create mode 100755 tests/queries/0_stateless/02786_parquet_big_integer_compatibility.sh diff --git a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp index 0b4700c9d4c..5a7306111a5 100644 --- a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp +++ b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp @@ -202,13 +202,10 @@ static ColumnWithTypeAndName readColumnWithBigNumberFromBinaryData(std::shared_p for (size_t i = 0; i != chunk_length; ++i) { + /// If at least one value size is not equal to the size if big integer, fallback to reading String column and further cast to result type. if (!chunk.IsNull(i) && chunk.value_length(i) != sizeof(ValueType)) - throw Exception( - ErrorCodes::BAD_ARGUMENTS, - "Cannot insert data into {} column from binary value, expected data with size {}, got {}", - column_type->getName(), - sizeof(ValueType), - chunk.value_length(i)); + return readColumnWithStringData(arrow_column, column_name); + total_size += chunk_length; } } diff --git a/tests/queries/0_stateless/02786_parquet_big_integer_compatibility.reference b/tests/queries/0_stateless/02786_parquet_big_integer_compatibility.reference new file mode 100644 index 00000000000..7764974255b --- /dev/null +++ b/tests/queries/0_stateless/02786_parquet_big_integer_compatibility.reference @@ -0,0 +1 @@ +424242424242424242424242424242424242424242424242424242 diff --git a/tests/queries/0_stateless/02786_parquet_big_integer_compatibility.sh b/tests/queries/0_stateless/02786_parquet_big_integer_compatibility.sh new file mode 100755 index 00000000000..8865b2e7aab --- /dev/null +++ b/tests/queries/0_stateless/02786_parquet_big_integer_compatibility.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash +# Tags: no-fasttest + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +$CLICKHOUSE_LOCAL -q "select toString(424242424242424242424242424242424242424242424242424242::UInt256) as x format Parquet" | $CLICKHOUSE_LOCAL --input-format=Parquet --structure='x UInt256' -q "select * from table" + From 24d70a2afd70a10a709fffe942b4e759d406f93b Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Mon, 12 Jun 2023 13:37:59 +0200 Subject: [PATCH 0776/1072] Fix --- src/Formats/CapnProtoSerializer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Formats/CapnProtoSerializer.cpp b/src/Formats/CapnProtoSerializer.cpp index f51f8c4b933..b306cca4f94 100644 --- a/src/Formats/CapnProtoSerializer.cpp +++ b/src/Formats/CapnProtoSerializer.cpp @@ -403,7 +403,7 @@ namespace if (it == capnp_to_ch_values.end()) throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected value {} in CapnProto enum", capnp_enum_value); - assert_cast &>(column).insertValue(capnp_to_ch_values[capnp_enum_value]); + assert_cast &>(column).insertValue(it->second); } } From 42393b51ee1747e838c71196a29eb305fca6257c Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Mon, 12 Jun 2023 13:45:00 +0200 Subject: [PATCH 0777/1072] Fix style --- tests/clickhouse-test | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/clickhouse-test b/tests/clickhouse-test index 56cf2f0ce0f..9242ca8a0b0 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -349,10 +349,11 @@ def kill_gdb_if_any(): for i in range(5): code = subprocess.call("kill -TERM $(pidof gdb)", shell=True, timeout=30) if code != 0: - time.sleep(i) + sleep(i) else: break + # collect server stacktraces using gdb def get_stacktraces_from_gdb(server_pid): try: From 17a6512cdc80c0a14f4570c12df520deff05550c Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Mon, 12 Jun 2023 13:53:20 +0200 Subject: [PATCH 0778/1072] Delete bad test --- tests/queries/0_stateless/02782_bitmap_overflow.sql | 4 ---- 1 file changed, 4 deletions(-) delete mode 100644 tests/queries/0_stateless/02782_bitmap_overflow.sql diff --git a/tests/queries/0_stateless/02782_bitmap_overflow.sql b/tests/queries/0_stateless/02782_bitmap_overflow.sql deleted file mode 100644 index 71ddce5c3b9..00000000000 --- a/tests/queries/0_stateless/02782_bitmap_overflow.sql +++ /dev/null @@ -1,4 +0,0 @@ --- Tags: no-msan, no-asan - -select unhex('0181808080908380808000')::AggregateFunction(groupBitmap, UInt64); -- {serverError TOO_LARGE_ARRAY_SIZE} - From d100a2031cd51f51e7320d62f683e7bf8520083c Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Mon, 12 Jun 2023 13:53:44 +0200 Subject: [PATCH 0779/1072] Delete bad test --- tests/queries/0_stateless/02782_bitmap_overflow.reference | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 tests/queries/0_stateless/02782_bitmap_overflow.reference diff --git a/tests/queries/0_stateless/02782_bitmap_overflow.reference b/tests/queries/0_stateless/02782_bitmap_overflow.reference deleted file mode 100644 index e69de29bb2d..00000000000 From 1c8371e1db7c1bfbc9a98d2cf33b1450e5c3547a Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Mon, 12 Jun 2023 12:13:10 +0000 Subject: [PATCH 0780/1072] Update version_date.tsv and changelogs after v22.8.18.31-lts --- docs/changelogs/v22.8.18.31-lts.md | 32 ++++++++++++++++++++++++++++ utils/list-versions/version_date.tsv | 1 + 2 files changed, 33 insertions(+) create mode 100644 docs/changelogs/v22.8.18.31-lts.md diff --git a/docs/changelogs/v22.8.18.31-lts.md b/docs/changelogs/v22.8.18.31-lts.md new file mode 100644 index 00000000000..709bb926f8a --- /dev/null +++ b/docs/changelogs/v22.8.18.31-lts.md @@ -0,0 +1,32 @@ +--- +sidebar_position: 1 +sidebar_label: 2023 +--- + +# 2023 Changelog + +### ClickHouse release v22.8.18.31-lts (4de7a95a544) FIXME as compared to v22.8.17.17-lts (df7f2ef0b41) + +#### Performance Improvement +* Backported in [#49214](https://github.com/ClickHouse/ClickHouse/issues/49214): Fixed excessive reading in queries with `FINAL`. [#47801](https://github.com/ClickHouse/ClickHouse/pull/47801) ([Nikita Taranov](https://github.com/nickitat)). + +#### Build/Testing/Packaging Improvement +* Backported in [#49079](https://github.com/ClickHouse/ClickHouse/issues/49079): Update time zones. The following were updated: Africa/Cairo, Africa/Casablanca, Africa/El_Aaiun, America/Bogota, America/Cambridge_Bay, America/Ciudad_Juarez, America/Godthab, America/Inuvik, America/Iqaluit, America/Nuuk, America/Ojinaga, America/Pangnirtung, America/Rankin_Inlet, America/Resolute, America/Whitehorse, America/Yellowknife, Asia/Gaza, Asia/Hebron, Asia/Kuala_Lumpur, Asia/Singapore, Canada/Yukon, Egypt, Europe/Kirov, Europe/Volgograd, Singapore. [#48572](https://github.com/ClickHouse/ClickHouse/pull/48572) ([Alexey Milovidov](https://github.com/alexey-milovidov)). + +#### Bug Fix (user-visible misbehavior in an official stable release) + +* Fix bad cast from LowCardinality column when using short circuit function execution [#43311](https://github.com/ClickHouse/ClickHouse/pull/43311) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix msan issue in randomStringUTF8() [#49750](https://github.com/ClickHouse/ClickHouse/pull/49750) ([Robert Schulze](https://github.com/rschu1ze)). +* JIT compilation not equals NaN fix [#50056](https://github.com/ClickHouse/ClickHouse/pull/50056) ([Maksim Kita](https://github.com/kitaisreal)). +* Fix crash with `multiIf` and constant condition and nullable arguments [#50123](https://github.com/ClickHouse/ClickHouse/pull/50123) ([Anton Popov](https://github.com/CurtizJ)). +* Fixed type conversion from Date/Date32 to DateTime64 when querying with DateTime64 index [#50280](https://github.com/ClickHouse/ClickHouse/pull/50280) ([Lucas Chang](https://github.com/lucas-tubi)). +* Fix Keeper deadlock on exception when preprocessing requests. [#50387](https://github.com/ClickHouse/ClickHouse/pull/50387) ([frinkr](https://github.com/frinkr)). +* Fix Log family table return wrong rows count after truncate [#50585](https://github.com/ClickHouse/ClickHouse/pull/50585) ([flynn](https://github.com/ucasfl)). +* Do not read all the columns from right GLOBAL JOIN table. [#50721](https://github.com/ClickHouse/ClickHouse/pull/50721) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). + +#### NOT FOR CHANGELOG / INSIGNIFICANT + +* Improve test reports [#49151](https://github.com/ClickHouse/ClickHouse/pull/49151) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Update github.com/distribution/distribution [#50114](https://github.com/ClickHouse/ClickHouse/pull/50114) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Catch issues with dockerd during the build [#50700](https://github.com/ClickHouse/ClickHouse/pull/50700) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). + diff --git a/utils/list-versions/version_date.tsv b/utils/list-versions/version_date.tsv index 4647bcb4af1..dce6aadbff4 100644 --- a/utils/list-versions/version_date.tsv +++ b/utils/list-versions/version_date.tsv @@ -44,6 +44,7 @@ v22.9.4.32-stable 2022-10-26 v22.9.3.18-stable 2022-09-30 v22.9.2.7-stable 2022-09-23 v22.9.1.2603-stable 2022-09-22 +v22.8.18.31-lts 2023-06-12 v22.8.17.17-lts 2023-04-22 v22.8.16.32-lts 2023-04-04 v22.8.15.23-lts 2023-03-10 From d177cfceca3af796f7e5cab3adb421212d9856f0 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Mon, 12 Jun 2023 12:17:11 +0000 Subject: [PATCH 0781/1072] Update version_date.tsv and changelogs after v23.3.3.52-lts --- docs/changelogs/v23.3.3.52-lts.md | 45 ++++++++++++++++++++++++++++ utils/list-versions/version_date.tsv | 2 ++ 2 files changed, 47 insertions(+) create mode 100644 docs/changelogs/v23.3.3.52-lts.md diff --git a/docs/changelogs/v23.3.3.52-lts.md b/docs/changelogs/v23.3.3.52-lts.md new file mode 100644 index 00000000000..f845e14eb78 --- /dev/null +++ b/docs/changelogs/v23.3.3.52-lts.md @@ -0,0 +1,45 @@ +--- +sidebar_position: 1 +sidebar_label: 2023 +--- + +# 2023 Changelog + +### ClickHouse release v23.3.3.52-lts (cb963c474db) FIXME as compared to v23.3.2.37-lts (1b144bcd101) + +#### Improvement +* Backported in [#49954](https://github.com/ClickHouse/ClickHouse/issues/49954): Add support for (an unusual) case where the arguments in the `IN` operator are single-element tuples. [#49844](https://github.com/ClickHouse/ClickHouse/pull/49844) ([MikhailBurdukov](https://github.com/MikhailBurdukov)). + +#### Build/Testing/Packaging Improvement +* Backported in [#49210](https://github.com/ClickHouse/ClickHouse/issues/49210): Fix glibc compatibility check: replace `preadv` from musl. [#49144](https://github.com/ClickHouse/ClickHouse/pull/49144) ([alesapin](https://github.com/alesapin)). + +#### Bug Fix (user-visible misbehavior in an official stable release) + +* Fix aggregate empty string error [#48999](https://github.com/ClickHouse/ClickHouse/pull/48999) ([LiuNeng](https://github.com/liuneng1994)). +* Fix key not found error for queries with multiple StorageJoin [#49137](https://github.com/ClickHouse/ClickHouse/pull/49137) ([vdimir](https://github.com/vdimir)). +* Fix race on Outdated parts loading [#49223](https://github.com/ClickHouse/ClickHouse/pull/49223) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Fix bug in DISTINCT [#49628](https://github.com/ClickHouse/ClickHouse/pull/49628) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fix msan issue in randomStringUTF8() [#49750](https://github.com/ClickHouse/ClickHouse/pull/49750) ([Robert Schulze](https://github.com/rschu1ze)). +* fix `is_prefix` in OptimizeRegularExpression [#49919](https://github.com/ClickHouse/ClickHouse/pull/49919) ([Han Fei](https://github.com/hanfei1991)). +* Fix IPv6 encoding in protobuf [#49933](https://github.com/ClickHouse/ClickHouse/pull/49933) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). +* Avoid deadlock when starting table in attach thread of `ReplicatedMergeTree` [#50026](https://github.com/ClickHouse/ClickHouse/pull/50026) ([Antonio Andelic](https://github.com/antonio2368)). +* JIT compilation not equals NaN fix [#50056](https://github.com/ClickHouse/ClickHouse/pull/50056) ([Maksim Kita](https://github.com/kitaisreal)). +* Fix crash with `multiIf` and constant condition and nullable arguments [#50123](https://github.com/ClickHouse/ClickHouse/pull/50123) ([Anton Popov](https://github.com/CurtizJ)). +* Fix reconnecting of HTTPS session when target host IP was changed [#50240](https://github.com/ClickHouse/ClickHouse/pull/50240) ([Aleksei Filatov](https://github.com/aalexfvk)). +* Fixed type conversion from Date/Date32 to DateTime64 when querying with DateTime64 index [#50280](https://github.com/ClickHouse/ClickHouse/pull/50280) ([Lucas Chang](https://github.com/lucas-tubi)). +* Fix Keeper deadlock on exception when preprocessing requests. [#50387](https://github.com/ClickHouse/ClickHouse/pull/50387) ([frinkr](https://github.com/frinkr)). +* Fix incorrect constant folding [#50536](https://github.com/ClickHouse/ClickHouse/pull/50536) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fix Log family table return wrong rows count after truncate [#50585](https://github.com/ClickHouse/ClickHouse/pull/50585) ([flynn](https://github.com/ucasfl)). +* Fix bug in `uniqExact` parallel merging [#50590](https://github.com/ClickHouse/ClickHouse/pull/50590) ([Nikita Taranov](https://github.com/nickitat)). +* Do not read all the columns from right GLOBAL JOIN table. [#50721](https://github.com/ClickHouse/ClickHouse/pull/50721) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). + +#### NOT FOR CHANGELOG / INSIGNIFICANT + +* Implement status comment [#48468](https://github.com/ClickHouse/ClickHouse/pull/48468) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Update curl to 8.0.1 (for CVEs) [#48765](https://github.com/ClickHouse/ClickHouse/pull/48765) ([Boris Kuschel](https://github.com/bkuschel)). +* Improve test reports [#49151](https://github.com/ClickHouse/ClickHouse/pull/49151) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Fallback auth gh api [#49314](https://github.com/ClickHouse/ClickHouse/pull/49314) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Improve CI: status commit, auth for get_gh_api [#49388](https://github.com/ClickHouse/ClickHouse/pull/49388) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Update github.com/distribution/distribution [#50114](https://github.com/ClickHouse/ClickHouse/pull/50114) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Catch issues with dockerd during the build [#50700](https://github.com/ClickHouse/ClickHouse/pull/50700) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). + diff --git a/utils/list-versions/version_date.tsv b/utils/list-versions/version_date.tsv index 4647bcb4af1..411dcd81957 100644 --- a/utils/list-versions/version_date.tsv +++ b/utils/list-versions/version_date.tsv @@ -2,6 +2,7 @@ v23.5.2.7-stable 2023-06-10 v23.5.1.3174-stable 2023-06-09 v23.4.2.11-stable 2023-05-02 v23.4.1.1943-stable 2023-04-27 +v23.3.3.52-lts 2023-06-12 v23.3.2.37-lts 2023-04-22 v23.3.1.2823-lts 2023-03-31 v23.2.7.32-stable 2023-06-09 @@ -44,6 +45,7 @@ v22.9.4.32-stable 2022-10-26 v22.9.3.18-stable 2022-09-30 v22.9.2.7-stable 2022-09-23 v22.9.1.2603-stable 2022-09-22 +v22.8.18.31-lts 2023-06-12 v22.8.17.17-lts 2023-04-22 v22.8.16.32-lts 2023-04-04 v22.8.15.23-lts 2023-03-10 From 11fbc01de5bc5840f6d91c1e9b0d10bf25387bd9 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Mon, 12 Jun 2023 12:37:47 +0000 Subject: [PATCH 0782/1072] Update version_date.tsv and changelogs after v23.4.3.48-stable --- docs/changelogs/v23.4.3.48-stable.md | 42 ++++++++++++++++++++++++++++ utils/list-versions/version_date.tsv | 3 ++ 2 files changed, 45 insertions(+) create mode 100644 docs/changelogs/v23.4.3.48-stable.md diff --git a/docs/changelogs/v23.4.3.48-stable.md b/docs/changelogs/v23.4.3.48-stable.md new file mode 100644 index 00000000000..8bafd22bfbd --- /dev/null +++ b/docs/changelogs/v23.4.3.48-stable.md @@ -0,0 +1,42 @@ +--- +sidebar_position: 1 +sidebar_label: 2023 +--- + +# 2023 Changelog + +### ClickHouse release v23.4.3.48-stable (d9199f8d3cc) FIXME as compared to v23.4.2.11-stable (b6442320f9d) + +#### Backward Incompatible Change +* Backported in [#49981](https://github.com/ClickHouse/ClickHouse/issues/49981): Revert "`groupArray` returns cannot be nullable" (due to binary compatibility breakage for `groupArray`/`groupArrayLast`/`groupArraySample` over `Nullable` types, which likely will lead to `TOO_LARGE_ARRAY_SIZE` or `CANNOT_READ_ALL_DATA`). [#49971](https://github.com/ClickHouse/ClickHouse/pull/49971) ([Azat Khuzhin](https://github.com/azat)). + +#### Bug Fix (user-visible misbehavior in an official stable release) + +* Fix key not found error for queries with multiple StorageJoin [#49137](https://github.com/ClickHouse/ClickHouse/pull/49137) ([vdimir](https://github.com/vdimir)). +* Fix fuzz bug when subquery set is not built when reading from remote() [#49425](https://github.com/ClickHouse/ClickHouse/pull/49425) ([Alexander Gololobov](https://github.com/davenger)). +* Fix postgres database setting [#49481](https://github.com/ClickHouse/ClickHouse/pull/49481) ([Mal Curtis](https://github.com/snikch)). +* Fix AsynchronousReadIndirectBufferFromRemoteFS breaking on short seeks [#49525](https://github.com/ClickHouse/ClickHouse/pull/49525) ([Michael Kolupaev](https://github.com/al13n321)). +* Fix bug in DISTINCT [#49628](https://github.com/ClickHouse/ClickHouse/pull/49628) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fix assert in SpanHolder::finish() with fibers [#49673](https://github.com/ClickHouse/ClickHouse/pull/49673) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix msan issue in randomStringUTF8() [#49750](https://github.com/ClickHouse/ClickHouse/pull/49750) ([Robert Schulze](https://github.com/rschu1ze)). +* fix `is_prefix` in OptimizeRegularExpression [#49919](https://github.com/ClickHouse/ClickHouse/pull/49919) ([Han Fei](https://github.com/hanfei1991)). +* Fix IPv6 encoding in protobuf [#49933](https://github.com/ClickHouse/ClickHouse/pull/49933) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). +* Avoid deadlock when starting table in attach thread of `ReplicatedMergeTree` [#50026](https://github.com/ClickHouse/ClickHouse/pull/50026) ([Antonio Andelic](https://github.com/antonio2368)). +* Fix assert in SpanHolder::finish() with fibers attempt 2 [#50034](https://github.com/ClickHouse/ClickHouse/pull/50034) ([Kruglov Pavel](https://github.com/Avogar)). +* JIT compilation not equals NaN fix [#50056](https://github.com/ClickHouse/ClickHouse/pull/50056) ([Maksim Kita](https://github.com/kitaisreal)). +* Fix crashing in case of Replicated database without arguments [#50058](https://github.com/ClickHouse/ClickHouse/pull/50058) ([Azat Khuzhin](https://github.com/azat)). +* Fix crash with `multiIf` and constant condition and nullable arguments [#50123](https://github.com/ClickHouse/ClickHouse/pull/50123) ([Anton Popov](https://github.com/CurtizJ)). +* Fix iceberg metadata parsing [#50232](https://github.com/ClickHouse/ClickHouse/pull/50232) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix bugs in Poco sockets in non-blocking mode, use true non-blocking sockets [#50252](https://github.com/ClickHouse/ClickHouse/pull/50252) ([Kruglov Pavel](https://github.com/Avogar)). +* Fixed type conversion from Date/Date32 to DateTime64 when querying with DateTime64 index [#50280](https://github.com/ClickHouse/ClickHouse/pull/50280) ([Lucas Chang](https://github.com/lucas-tubi)). +* Fix Keeper deadlock on exception when preprocessing requests. [#50387](https://github.com/ClickHouse/ClickHouse/pull/50387) ([frinkr](https://github.com/frinkr)). +* Fix Log family table return wrong rows count after truncate [#50585](https://github.com/ClickHouse/ClickHouse/pull/50585) ([flynn](https://github.com/ucasfl)). +* Fix bug in `uniqExact` parallel merging [#50590](https://github.com/ClickHouse/ClickHouse/pull/50590) ([Nikita Taranov](https://github.com/nickitat)). +* Do not read all the columns from right GLOBAL JOIN table. [#50721](https://github.com/ClickHouse/ClickHouse/pull/50721) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). + +#### NOT FOR CHANGELOG / INSIGNIFICANT + +* Improve CI: status commit, auth for get_gh_api [#49388](https://github.com/ClickHouse/ClickHouse/pull/49388) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Update github.com/distribution/distribution [#50114](https://github.com/ClickHouse/ClickHouse/pull/50114) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Catch issues with dockerd during the build [#50700](https://github.com/ClickHouse/ClickHouse/pull/50700) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). + diff --git a/utils/list-versions/version_date.tsv b/utils/list-versions/version_date.tsv index 4647bcb4af1..6afce99612f 100644 --- a/utils/list-versions/version_date.tsv +++ b/utils/list-versions/version_date.tsv @@ -1,7 +1,9 @@ v23.5.2.7-stable 2023-06-10 v23.5.1.3174-stable 2023-06-09 +v23.4.3.48-stable 2023-06-12 v23.4.2.11-stable 2023-05-02 v23.4.1.1943-stable 2023-04-27 +v23.3.3.52-lts 2023-06-12 v23.3.2.37-lts 2023-04-22 v23.3.1.2823-lts 2023-03-31 v23.2.7.32-stable 2023-06-09 @@ -44,6 +46,7 @@ v22.9.4.32-stable 2022-10-26 v22.9.3.18-stable 2022-09-30 v22.9.2.7-stable 2022-09-23 v22.9.1.2603-stable 2022-09-22 +v22.8.18.31-lts 2023-06-12 v22.8.17.17-lts 2023-04-22 v22.8.16.32-lts 2023-04-04 v22.8.15.23-lts 2023-03-10 From 5db3b393d825e5597f204b9ff2ca67abf89e4045 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Mon, 12 Jun 2023 16:22:33 +0300 Subject: [PATCH 0783/1072] Update MergeTreeData.cpp --- src/Storages/MergeTree/MergeTreeData.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index ee06056985a..5833955726f 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -1253,7 +1253,7 @@ MergeTreeData::LoadPartResult MergeTreeData::loadDataPart( mark_broken(); return res; } - catch (const Poco::NetException &) + catch (const Poco::Net::NetException &) { throw; } From b5b8c7086b43fbf3de9293196bfb7097e3888b58 Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Mon, 12 Jun 2023 13:43:53 +0000 Subject: [PATCH 0784/1072] Update broken tests list --- tests/broken_tests.txt | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/tests/broken_tests.txt b/tests/broken_tests.txt index b1fa18c44dd..d49b4f391e5 100644 --- a/tests/broken_tests.txt +++ b/tests/broken_tests.txt @@ -11,7 +11,6 @@ 00927_asof_joins 00940_order_by_read_in_order_query_plan 00945_bloom_filter_index -00952_input_function 00979_set_index_not 00981_in_subquery_with_tuple 01049_join_low_card_bug_long @@ -21,14 +20,12 @@ 01072_optimize_skip_unused_shards_const_expr_eval 01083_expressions_in_engine_arguments 01086_odbc_roundtrip -01152_cross_replication 01155_rename_move_materialized_view 01173_transaction_control_queries 01211_optimize_skip_unused_shards_type_mismatch 01213_optimize_skip_unused_shards_DISTINCT 01214_test_storage_merge_aliases_with_where 01231_distributed_aggregation_memory_efficient_mix_levels -01232_extremes 01244_optimize_distributed_group_by_sharding_key 01247_optimize_distributed_group_by_sharding_key_dist_on_dist 01268_mv_scalars @@ -50,7 +47,6 @@ 01585_use_index_for_global_in 01585_use_index_for_global_in_with_null 01586_columns_pruning -01615_random_one_shard_insertion 01624_soft_constraints 01651_bugs_from_15889 01655_plan_optimizations @@ -79,7 +75,6 @@ 01952_optimize_distributed_group_by_sharding_key 02000_join_on_const 02001_shard_num_shard_count -02024_join_on_or_long 02131_used_row_policies_in_query_log 02139_MV_with_scalar_subquery 02174_cte_scalar_cache_mv @@ -88,14 +83,11 @@ 02302_s3_file_pruning 02317_distinct_in_order_optimization_explain 02341_global_join_cte -02343_aggregation_pipeline 02345_implicit_transaction -02346_additional_filters_distr 02352_grouby_shadows_arg 02354_annoy 02366_union_decimal_conversion 02375_rocksdb_with_filters -02377_optimize_sorting_by_input_stream_properties_explain 02382_join_and_filtering_set 02402_merge_engine_with_view 02404_memory_bound_merging @@ -112,7 +104,6 @@ 02575_merge_prewhere_different_default_kind 02713_array_low_cardinality_string 02707_skip_index_with_in -02324_map_combinator_bug 02241_join_rocksdb_bs 02003_WithMergeableStateAfterAggregationAndLimit_LIMIT_BY_LIMIT_OFFSET 01115_join_with_dictionary @@ -120,7 +111,6 @@ 00917_multiple_joins_denny_crane 00725_join_on_bug_1 00636_partition_key_parts_pruning -00261_storage_aliases_and_array_join 01825_type_json_multiple_files 01281_group_by_limit_memory_tracking 02723_zookeeper_name From 01c7d2fe719f9b9ed59fce58d5e9dec44167e42f Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Mon, 12 Jun 2023 16:53:26 +0300 Subject: [PATCH 0785/1072] Prostpone check of outdated parts (#50676) * prostpone check of outdated parts * Update ReplicatedMergeTreePartCheckThread.cpp --- .../ReplicatedMergeTreePartCheckThread.cpp | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.cpp b/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.cpp index 0882ff5a0bc..7bb8d9d758e 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.cpp @@ -344,6 +344,22 @@ CheckResult ReplicatedMergeTreePartCheckThread::checkPart(const String & part_na LOG_TRACE(log, "Part {} in zookeeper: {}, locally: {}", part_name, exists_in_zookeeper, part != nullptr); + if (exists_in_zookeeper && !part) + { + auto outdated = storage.getPartIfExists(part_name, {MergeTreeDataPartState::Outdated, MergeTreeDataPartState::Deleting}); + if (outdated) + { + /// We cannot rely on exists_in_zookeeper, because the cleanup thread is probably going to remove it from ZooKeeper + /// Also, it will avoid "Cannot commit empty part: Part ... (state Outdated) already exists, but it will be deleted soon" + LOG_WARNING(log, "Part {} is Outdated, will wait for cleanup thread to handle it and check again later", part_name); + time_t lifetime = time(nullptr) - outdated->remove_time; + time_t max_lifetime = storage.getSettings()->old_parts_lifetime.totalSeconds(); + time_t delay = lifetime >= max_lifetime ? 0 : max_lifetime - lifetime; + enqueuePart(part_name, delay + 30); + return {part_name, true, "Part is Outdated, will recheck later"}; + } + } + /// We do not have this or a covering part. if (!part) { From d45f07743c3f27276740b3bac7200f7cad90292e Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Mon, 12 Jun 2023 13:54:07 +0000 Subject: [PATCH 0786/1072] fix getting number of mutations --- src/Storages/StorageMergeTree.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index a2a46229660..233b37c74a9 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -1358,7 +1358,7 @@ size_t StorageMergeTree::getNumberOfUnfinishedMutations() const size_t count = 0; for (const auto & [version, _] : current_mutations_by_version | std::views::reverse) { - auto status = getIncompleteMutationsStatusUnlocked(version, lock); + auto status = getIncompleteMutationsStatusUnlocked(version, lock, nullptr, true); if (!status) continue; From 26c9bda14410e6f589c843a664635db9ab02b15e Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Mon, 12 Jun 2023 13:51:46 +0000 Subject: [PATCH 0787/1072] Add a comment --- src/Interpreters/InterpreterSelectQueryAnalyzer.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Interpreters/InterpreterSelectQueryAnalyzer.cpp b/src/Interpreters/InterpreterSelectQueryAnalyzer.cpp index 4f2f05dc7eb..8db1d27c073 100644 --- a/src/Interpreters/InterpreterSelectQueryAnalyzer.cpp +++ b/src/Interpreters/InterpreterSelectQueryAnalyzer.cpp @@ -135,6 +135,8 @@ QueryTreeNodePtr buildQueryTreeAndRunPasses(const ASTPtr & query, QueryTreePassManager query_tree_pass_manager(context); addQueryTreePasses(query_tree_pass_manager); + /// We should not apply any query tree level optimizations on shards + /// because it can lead to a changed header. if (select_query_options.ignore_ast_optimizations || context->getClientInfo().query_kind == ClientInfo::QueryKind::SECONDARY_QUERY) query_tree_pass_manager.run(query_tree, 1 /*up_to_pass_index*/); From 41ece306cf02519e97697d1a65bd3003d8cbe898 Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Mon, 12 Jun 2023 13:53:46 +0000 Subject: [PATCH 0788/1072] Update broken_tests.txt --- tests/broken_tests.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/broken_tests.txt b/tests/broken_tests.txt index b1fa18c44dd..3b888223b78 100644 --- a/tests/broken_tests.txt +++ b/tests/broken_tests.txt @@ -106,7 +106,6 @@ 02458_use_structure_from_insertion_table 02479_race_condition_between_insert_and_droppin_mv 02493_inconsistent_hex_and_binary_number -02494_optimize_group_by_function_keys_and_alias_columns 02521_aggregation_by_partitions 02554_fix_grouping_sets_predicate_push_down 02575_merge_prewhere_different_default_kind From 07eb7b7d664778342e0b44049041c160ac868d94 Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Mon, 12 Jun 2023 11:03:50 -0300 Subject: [PATCH 0789/1072] Update settings.md --- docs/en/operations/settings/settings.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 2c9679c940d..8104478deff 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -1957,6 +1957,10 @@ Default value: empty string (disabled) For the replicated tables by default the only 100 of the most recent inserts for each partition are deduplicated (see [replicated_deduplication_window](merge-tree-settings.md/#replicated-deduplication-window), [replicated_deduplication_window_seconds](merge-tree-settings.md/#replicated-deduplication-window-seconds)). For not replicated tables see [non_replicated_deduplication_window](merge-tree-settings.md/#non-replicated-deduplication-window). +:::note +`insert_deduplication_token` works on a partition level (the same as `insert_deduplication` checksum). Multiple partitions can have the same `insert_deduplication_token`. +::: + Example: ```sql From 252a10c670977c93b8808d8b98a8679714d6e9a3 Mon Sep 17 00:00:00 2001 From: tpanetti Date: Mon, 12 Jun 2023 08:19:06 -0700 Subject: [PATCH 0790/1072] Add "no-parallel" tag to MySQL Compatible Types test to fix test issue --- .../0_stateless/02775_show_columns_mysql_compatibility.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02775_show_columns_mysql_compatibility.sh b/tests/queries/0_stateless/02775_show_columns_mysql_compatibility.sh index 51c9da2a842..e324926e2e7 100755 --- a/tests/queries/0_stateless/02775_show_columns_mysql_compatibility.sh +++ b/tests/queries/0_stateless/02775_show_columns_mysql_compatibility.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Tags: no-fasttest +# Tags: no-fasttest, no-parallel CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh From 5aa05667677669a17d6356fd884c7da35478d280 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Mon, 12 Jun 2023 17:24:34 +0200 Subject: [PATCH 0791/1072] Fix checking the lock file too often while writing a backup. --- src/Backups/BackupImpl.cpp | 6 ++---- src/Backups/BackupImpl.h | 1 + 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/Backups/BackupImpl.cpp b/src/Backups/BackupImpl.cpp index 306236534b6..82793f44739 100644 --- a/src/Backups/BackupImpl.cpp +++ b/src/Backups/BackupImpl.cpp @@ -144,6 +144,7 @@ void BackupImpl::open(const ContextPtr & context) if (!uuid) uuid = UUIDHelpers::generateV4(); lock_file_name = use_archive ? (archive_params.archive_name + ".lock") : ".lock"; + lock_file_before_first_file_checked = false; writing_finalized = false; /// Check that we can write a backup there and create the lock file to own this destination. @@ -833,13 +834,10 @@ void BackupImpl::writeFile(const BackupFileInfo & info, BackupEntryPtr entry) if (writing_finalized) throw Exception(ErrorCodes::LOGICAL_ERROR, "Backup is already finalized"); - bool should_check_lock_file = false; { std::lock_guard lock{mutex}; ++num_files; total_size += info.size; - if (!num_entries) - should_check_lock_file = true; } auto src_disk = entry->getDisk(); @@ -859,7 +857,7 @@ void BackupImpl::writeFile(const BackupFileInfo & info, BackupEntryPtr entry) return; } - if (!should_check_lock_file) + if (!lock_file_before_first_file_checked.exchange(true)) checkLockFile(true); /// NOTE: `mutex` must be unlocked during copying otherwise writing will be in one thread maximum and hence slow. diff --git a/src/Backups/BackupImpl.h b/src/Backups/BackupImpl.h index 7e95d156162..3ab11228892 100644 --- a/src/Backups/BackupImpl.h +++ b/src/Backups/BackupImpl.h @@ -141,6 +141,7 @@ private: std::shared_ptr archive_reader; std::shared_ptr archive_writer; String lock_file_name; + std::atomic lock_file_before_first_file_checked = false; bool writing_finalized = false; bool deduplicate_files = true; From 65d83e45cb177cc3abfec088e31da44fca357c95 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 12 Jun 2023 16:21:28 +0000 Subject: [PATCH 0792/1072] Fix crash in snowflakeToDateTime(), follow-up to #50834 --- src/Functions/FunctionSnowflake.h | 18 +++++++++++++----- .../01942_snowflakeToDateTime.reference | 1 + .../0_stateless/01942_snowflakeToDateTime.sql | 4 +++- 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/src/Functions/FunctionSnowflake.h b/src/Functions/FunctionSnowflake.h index ce3a48269b4..0a47534c47d 100644 --- a/src/Functions/FunctionSnowflake.h +++ b/src/Functions/FunctionSnowflake.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -110,12 +111,19 @@ public: auto res_column = ColumnUInt32::create(input_rows_count); auto & result_data = res_column->getData(); - const auto & source_data = typeid_cast(col).getData(); - - for (size_t i = 0; i < input_rows_count; ++i) + if (const auto * src_non_const = typeid_cast(&col)) { - result_data[i] = static_cast( - ((source_data[i] >> time_shift) + snowflake_epoch) / 1000); + const auto & source_data = src_non_const->getData(); + for (size_t i = 0; i < input_rows_count; ++i) + result_data[i] = static_cast( + ((source_data[i] >> time_shift) + snowflake_epoch) / 1000); + } + else if (const auto * src_const = typeid_cast(&col)) + { + Int64 src_val = src_const->getValue(); + for (size_t i = 0; i < input_rows_count; ++i) + result_data[i] = static_cast( + ((src_val >> time_shift) + snowflake_epoch) / 1000); } return res_column; } diff --git a/tests/queries/0_stateless/01942_snowflakeToDateTime.reference b/tests/queries/0_stateless/01942_snowflakeToDateTime.reference index bed18023f6a..fa00a22bc63 100644 --- a/tests/queries/0_stateless/01942_snowflakeToDateTime.reference +++ b/tests/queries/0_stateless/01942_snowflakeToDateTime.reference @@ -1,3 +1,4 @@ const column UTC 1426860704886947840 2021-08-15 10:57:56 DateTime(\'UTC\') 2021-08-15 10:57:56.492 DateTime64(3, \'UTC\') Asia/Shanghai 1426860704886947840 2021-08-15 18:57:56 DateTime(\'Asia/Shanghai\') 2021-08-15 18:57:56.492 DateTime64(3, \'Asia/Shanghai\') +Asia/Singapore 2010-11-04 01:42:54 diff --git a/tests/queries/0_stateless/01942_snowflakeToDateTime.sql b/tests/queries/0_stateless/01942_snowflakeToDateTime.sql index f6f171afabf..3efccdddb2d 100644 --- a/tests/queries/0_stateless/01942_snowflakeToDateTime.sql +++ b/tests/queries/0_stateless/01942_snowflakeToDateTime.sql @@ -29,4 +29,6 @@ SELECT snowflakeToDateTime(i64, tz) as dt, toTypeName(dt), snowflakeToDateTime64(i64, tz) as dt64, - toTypeName(dt64); \ No newline at end of file + toTypeName(dt64); + +SELECT materialize('Asia/Singapore') a, snowflakeToDateTime(649::Int64, a) settings allow_nonconst_timezone_arguments = 1 From 326a3a3e8d719aebdc9ef9ee79f8b5fc8645183e Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Mon, 12 Jun 2023 16:46:10 +0000 Subject: [PATCH 0793/1072] Use query tree to rewrite the query --- src/Storages/StorageDistributed.cpp | 335 +--------------- src/Storages/StorageReplicatedMergeTree.cpp | 22 +- src/Storages/buildQueryTreeForShard.cpp | 372 ++++++++++++++++++ src/Storages/buildQueryTreeForShard.h | 15 + ...02771_parallel_replicas_analyzer.reference | 2 +- 5 files changed, 406 insertions(+), 340 deletions(-) create mode 100644 src/Storages/buildQueryTreeForShard.cpp create mode 100644 src/Storages/buildQueryTreeForShard.h diff --git a/src/Storages/StorageDistributed.cpp b/src/Storages/StorageDistributed.cpp index b91ad0b963a..1ec45ce3d57 100644 --- a/src/Storages/StorageDistributed.cpp +++ b/src/Storages/StorageDistributed.cpp @@ -81,6 +81,7 @@ #include #include +#include #include #include @@ -650,264 +651,6 @@ StorageSnapshotPtr StorageDistributed::getStorageSnapshotForQuery( namespace { -/// Visitor that collect column source to columns mapping from query and all subqueries -class CollectColumnSourceToColumnsVisitor : public InDepthQueryTreeVisitor -{ -public: - struct Columns - { - NameSet column_names; - NamesAndTypes columns; - - void addColumn(NameAndTypePair column) - { - if (column_names.contains(column.name)) - return; - - column_names.insert(column.name); - columns.push_back(std::move(column)); - } - }; - - const std::unordered_map & getColumnSourceToColumns() const - { - return column_source_to_columns; - } - - void visitImpl(QueryTreeNodePtr & node) - { - auto * column_node = node->as(); - if (!column_node) - return; - - auto column_source = column_node->getColumnSourceOrNull(); - if (!column_source) - return; - - auto it = column_source_to_columns.find(column_source); - if (it == column_source_to_columns.end()) - { - auto [insert_it, _] = column_source_to_columns.emplace(column_source, Columns()); - it = insert_it; - } - - it->second.addColumn(column_node->getColumn()); - } - -private: - std::unordered_map column_source_to_columns; -}; - -/** Visitor that rewrites IN and JOINs in query and all subqueries according to distributed_product_mode and - * prefer_global_in_and_join settings. - * - * Additionally collects GLOBAL JOIN and GLOBAL IN query nodes. - * - * If distributed_product_mode = deny, then visitor throws exception if there are multiple distributed tables. - * If distributed_product_mode = local, then visitor collects replacement map for tables that must be replaced - * with local tables. - * If distributed_product_mode = global or prefer_global_in_and_join setting is true, then visitor rewrites JOINs and IN functions that - * contain distributed tables to GLOBAL JOINs and GLOBAL IN functions. - * If distributed_product_mode = allow, then visitor does not rewrite query if there are multiple distributed tables. - */ -class DistributedProductModeRewriteInJoinVisitor : public InDepthQueryTreeVisitorWithContext -{ -public: - using Base = InDepthQueryTreeVisitorWithContext; - using Base::Base; - - explicit DistributedProductModeRewriteInJoinVisitor(const ContextPtr & context_) - : Base(context_) - {} - - struct InFunctionOrJoin - { - QueryTreeNodePtr query_node; - size_t subquery_depth = 0; - }; - - const std::unordered_map & getReplacementMap() const - { - return replacement_map; - } - - const std::vector & getGlobalInOrJoinNodes() const - { - return global_in_or_join_nodes; - } - - static bool needChildVisit(QueryTreeNodePtr & parent, QueryTreeNodePtr & child) - { - auto * function_node = parent->as(); - if (function_node && isNameOfGlobalInFunction(function_node->getFunctionName())) - return false; - - auto * join_node = parent->as(); - if (join_node && join_node->getLocality() == JoinLocality::Global && join_node->getRightTableExpression() == child) - return false; - - return true; - } - - void visitImpl(QueryTreeNodePtr & node) - { - auto * function_node = node->as(); - auto * join_node = node->as(); - - if ((function_node && isNameOfGlobalInFunction(function_node->getFunctionName())) || - (join_node && join_node->getLocality() == JoinLocality::Global)) - { - InFunctionOrJoin in_function_or_join_entry; - in_function_or_join_entry.query_node = node; - in_function_or_join_entry.subquery_depth = getSubqueryDepth(); - global_in_or_join_nodes.push_back(std::move(in_function_or_join_entry)); - return; - } - - if ((function_node && isNameOfLocalInFunction(function_node->getFunctionName())) || - (join_node && join_node->getLocality() != JoinLocality::Global)) - { - InFunctionOrJoin in_function_or_join_entry; - in_function_or_join_entry.query_node = node; - in_function_or_join_entry.subquery_depth = getSubqueryDepth(); - in_function_or_join_stack.push_back(in_function_or_join_entry); - return; - } - - if (node->getNodeType() == QueryTreeNodeType::TABLE) - tryRewriteTableNodeIfNeeded(node); - } - - void leaveImpl(QueryTreeNodePtr & node) - { - if (!in_function_or_join_stack.empty() && node.get() == in_function_or_join_stack.back().query_node.get()) - in_function_or_join_stack.pop_back(); - } - -private: - void tryRewriteTableNodeIfNeeded(const QueryTreeNodePtr & table_node) - { - const auto & table_node_typed = table_node->as(); - const auto * distributed_storage = typeid_cast(table_node_typed.getStorage().get()); - if (!distributed_storage) - return; - - bool distributed_valid_for_rewrite = distributed_storage->getShardCount() >= 2; - if (!distributed_valid_for_rewrite) - return; - - auto distributed_product_mode = getSettings().distributed_product_mode; - - if (distributed_product_mode == DistributedProductMode::LOCAL) - { - StorageID remote_storage_id = StorageID{distributed_storage->getRemoteDatabaseName(), - distributed_storage->getRemoteTableName()}; - auto resolved_remote_storage_id = getContext()->resolveStorageID(remote_storage_id); - const auto & distributed_storage_columns = table_node_typed.getStorageSnapshot()->metadata->getColumns(); - auto storage = std::make_shared(resolved_remote_storage_id, distributed_storage_columns); - auto replacement_table_expression = std::make_shared(std::move(storage), getContext()); - replacement_map.emplace(table_node.get(), std::move(replacement_table_expression)); - } - else if ((distributed_product_mode == DistributedProductMode::GLOBAL || getSettings().prefer_global_in_and_join) && - !in_function_or_join_stack.empty()) - { - auto * in_or_join_node_to_modify = in_function_or_join_stack.back().query_node.get(); - - if (auto * in_function_to_modify = in_or_join_node_to_modify->as()) - { - auto global_in_function_name = getGlobalInFunctionNameForLocalInFunctionName(in_function_to_modify->getFunctionName()); - auto global_in_function_resolver = FunctionFactory::instance().get(global_in_function_name, getContext()); - in_function_to_modify->resolveAsFunction(global_in_function_resolver->build(in_function_to_modify->getArgumentColumns())); - } - else if (auto * join_node_to_modify = in_or_join_node_to_modify->as()) - { - join_node_to_modify->setLocality(JoinLocality::Global); - } - - global_in_or_join_nodes.push_back(in_function_or_join_stack.back()); - } - else if (distributed_product_mode == DistributedProductMode::ALLOW) - { - return; - } - else if (distributed_product_mode == DistributedProductMode::DENY) - { - throw Exception(ErrorCodes::DISTRIBUTED_IN_JOIN_SUBQUERY_DENIED, - "Double-distributed IN/JOIN subqueries is denied (distributed_product_mode = 'deny'). " - "You may rewrite query to use local tables " - "in subqueries, or use GLOBAL keyword, or set distributed_product_mode to suitable value."); - } - } - - std::vector in_function_or_join_stack; - std::unordered_map replacement_map; - std::vector global_in_or_join_nodes; -}; - -/** Execute subquery node and put result in mutable context temporary table. - * Returns table node that is initialized with temporary table storage. - */ -TableNodePtr executeSubqueryNode(const QueryTreeNodePtr & subquery_node, - ContextMutablePtr & mutable_context, - size_t subquery_depth) -{ - auto subquery_hash = subquery_node->getTreeHash(); - String temporary_table_name = fmt::format("_data_{}_{}", subquery_hash.first, subquery_hash.second); - - const auto & external_tables = mutable_context->getExternalTables(); - auto external_table_it = external_tables.find(temporary_table_name); - if (external_table_it != external_tables.end()) - { - auto temporary_table_expression_node = std::make_shared(external_table_it->second, mutable_context); - temporary_table_expression_node->setTemporaryTableName(temporary_table_name); - return temporary_table_expression_node; - } - - auto subquery_options = SelectQueryOptions(QueryProcessingStage::Complete, subquery_depth, true /*is_subquery*/); - auto context_copy = Context::createCopy(mutable_context); - updateContextForSubqueryExecution(context_copy); - - InterpreterSelectQueryAnalyzer interpreter(subquery_node, context_copy, subquery_options); - auto & query_plan = interpreter.getQueryPlan(); - - auto sample_block_with_unique_names = query_plan.getCurrentDataStream().header; - makeUniqueColumnNamesInBlock(sample_block_with_unique_names); - - if (!blocksHaveEqualStructure(sample_block_with_unique_names, query_plan.getCurrentDataStream().header)) - { - auto actions_dag = ActionsDAG::makeConvertingActions( - query_plan.getCurrentDataStream().header.getColumnsWithTypeAndName(), - sample_block_with_unique_names.getColumnsWithTypeAndName(), - ActionsDAG::MatchColumnsMode::Position); - auto converting_step = std::make_unique(query_plan.getCurrentDataStream(), std::move(actions_dag)); - query_plan.addStep(std::move(converting_step)); - } - - Block sample = interpreter.getSampleBlock(); - NamesAndTypesList columns = sample.getNamesAndTypesList(); - - auto external_storage_holder = TemporaryTableHolder( - mutable_context, - ColumnsDescription{columns}, - ConstraintsDescription{}, - nullptr /*query*/, - true /*create_for_global_subquery*/); - - StoragePtr external_storage = external_storage_holder.getTable(); - auto temporary_table_expression_node = std::make_shared(external_storage, mutable_context); - temporary_table_expression_node->setTemporaryTableName(temporary_table_name); - - auto table_out = external_storage->write({}, external_storage->getInMemoryMetadataPtr(), mutable_context, /*async_insert=*/false); - auto io = interpreter.execute(); - io.pipeline.complete(std::move(table_out)); - CompletedPipelineExecutor executor(io.pipeline); - executor.execute(); - - mutable_context->addExternalTable(temporary_table_name, std::move(external_storage_holder)); - - return temporary_table_expression_node; -} - QueryTreeNodePtr buildQueryTreeDistributed(SelectQueryInfo & query_info, const StorageSnapshotPtr & distributed_storage_snapshot, const StorageID & remote_storage_id, @@ -963,81 +706,7 @@ QueryTreeNodePtr buildQueryTreeDistributed(SelectQueryInfo & query_info, auto query_tree_to_modify = query_info.query_tree->cloneAndReplace(query_info.table_expression, std::move(replacement_table_expression)); - CollectColumnSourceToColumnsVisitor collect_column_source_to_columns_visitor; - collect_column_source_to_columns_visitor.visit(query_tree_to_modify); - - const auto & column_source_to_columns = collect_column_source_to_columns_visitor.getColumnSourceToColumns(); - - DistributedProductModeRewriteInJoinVisitor visitor(query_info.planner_context->getQueryContext()); - visitor.visit(query_tree_to_modify); - - auto replacement_map = visitor.getReplacementMap(); - const auto & global_in_or_join_nodes = visitor.getGlobalInOrJoinNodes(); - - for (const auto & global_in_or_join_node : global_in_or_join_nodes) - { - if (auto * join_node = global_in_or_join_node.query_node->as()) - { - auto join_right_table_expression = join_node->getRightTableExpression(); - auto join_right_table_expression_node_type = join_right_table_expression->getNodeType(); - - QueryTreeNodePtr subquery_node; - - if (join_right_table_expression_node_type == QueryTreeNodeType::QUERY || - join_right_table_expression_node_type == QueryTreeNodeType::UNION) - { - subquery_node = join_right_table_expression; - } - else if (join_right_table_expression_node_type == QueryTreeNodeType::TABLE || - join_right_table_expression_node_type == QueryTreeNodeType::TABLE_FUNCTION) - { - const auto & columns = column_source_to_columns.at(join_right_table_expression).columns; - subquery_node = buildSubqueryToReadColumnsFromTableExpression(columns, - join_right_table_expression, - planner_context->getQueryContext()); - } - else - { - throw Exception(ErrorCodes::LOGICAL_ERROR, - "Expected JOIN right table expression to be table, table function, query or union node. Actual {}", - join_right_table_expression->formatASTForErrorMessage()); - } - - auto temporary_table_expression_node = executeSubqueryNode(subquery_node, - planner_context->getMutableQueryContext(), - global_in_or_join_node.subquery_depth); - temporary_table_expression_node->setAlias(join_right_table_expression->getAlias()); - - replacement_map.emplace(join_right_table_expression.get(), std::move(temporary_table_expression_node)); - continue; - } - else if (auto * in_function_node = global_in_or_join_node.query_node->as()) - { - auto & in_function_subquery_node = in_function_node->getArguments().getNodes().at(1); - auto in_function_node_type = in_function_subquery_node->getNodeType(); - if (in_function_node_type != QueryTreeNodeType::QUERY && in_function_node_type != QueryTreeNodeType::UNION) - continue; - - auto temporary_table_expression_node = executeSubqueryNode(in_function_subquery_node, - planner_context->getMutableQueryContext(), - global_in_or_join_node.subquery_depth); - - in_function_subquery_node = std::move(temporary_table_expression_node); - } - else - { - throw Exception(ErrorCodes::LOGICAL_ERROR, - "Expected global IN or JOIN query node. Actual {}", - global_in_or_join_node.query_node->formatASTForErrorMessage()); - } - } - - if (!replacement_map.empty()) - query_tree_to_modify = query_tree_to_modify->cloneAndReplace(replacement_map); - - removeGroupingFunctionSpecializations(query_tree_to_modify); - - return query_tree_to_modify; + return buildQueryTreeForShard(query_info, query_tree_to_modify); } } diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 61d1442df92..fafb3b124f2 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -21,6 +21,7 @@ #include +#include #include #include #include @@ -74,6 +75,8 @@ #include #include +#include + #include #include #include @@ -4734,20 +4737,27 @@ void StorageReplicatedMergeTree::read( { auto table_id = getStorageID(); - const auto & modified_query_ast = ClusterProxy::rewriteSelectQuery( - local_context, query_info.query, - table_id.database_name, table_id.table_name, /*remote_table_function_ptr*/nullptr); - - auto cluster = local_context->getCluster(local_context->getSettingsRef().cluster_for_parallel_replicas); + ASTPtr modified_query_ast; Block header; if (local_context->getSettingsRef().allow_experimental_analyzer) + { + auto modified_query_tree = buildQueryTreeForShard(query_info, query_info.query_tree); + header = InterpreterSelectQueryAnalyzer::getSampleBlock( - modified_query_ast, local_context, SelectQueryOptions(processed_stage).analyze()); + modified_query_tree, local_context, SelectQueryOptions(processed_stage).analyze()); + modified_query_ast = queryNodeToSelectQuery(modified_query_tree); + } else + { header = InterpreterSelectQuery(modified_query_ast, local_context, SelectQueryOptions(processed_stage).analyze()).getSampleBlock(); + modified_query_ast = ClusterProxy::rewriteSelectQuery(local_context, query_info.query, + table_id.database_name, table_id.table_name, /*remote_table_function_ptr*/nullptr); + } + + auto cluster = local_context->getCluster(local_context->getSettingsRef().cluster_for_parallel_replicas); ClusterProxy::SelectStreamFactory select_stream_factory = ClusterProxy::SelectStreamFactory( diff --git a/src/Storages/buildQueryTreeForShard.cpp b/src/Storages/buildQueryTreeForShard.cpp new file mode 100644 index 00000000000..a42d67d9aa7 --- /dev/null +++ b/src/Storages/buildQueryTreeForShard.cpp @@ -0,0 +1,372 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int DISTRIBUTED_IN_JOIN_SUBQUERY_DENIED; +} + +namespace +{ + +/// Visitor that collect column source to columns mapping from query and all subqueries +class CollectColumnSourceToColumnsVisitor : public InDepthQueryTreeVisitor +{ +public: + struct Columns + { + NameSet column_names; + NamesAndTypes columns; + + void addColumn(NameAndTypePair column) + { + if (column_names.contains(column.name)) + return; + + column_names.insert(column.name); + columns.push_back(std::move(column)); + } + }; + + const std::unordered_map & getColumnSourceToColumns() const + { + return column_source_to_columns; + } + + void visitImpl(QueryTreeNodePtr & node) + { + auto * column_node = node->as(); + if (!column_node) + return; + + auto column_source = column_node->getColumnSourceOrNull(); + if (!column_source) + return; + + auto it = column_source_to_columns.find(column_source); + if (it == column_source_to_columns.end()) + { + auto [insert_it, _] = column_source_to_columns.emplace(column_source, Columns()); + it = insert_it; + } + + it->second.addColumn(column_node->getColumn()); + } + +private: + std::unordered_map column_source_to_columns; +}; + +/** Visitor that rewrites IN and JOINs in query and all subqueries according to distributed_product_mode and + * prefer_global_in_and_join settings. + * + * Additionally collects GLOBAL JOIN and GLOBAL IN query nodes. + * + * If distributed_product_mode = deny, then visitor throws exception if there are multiple distributed tables. + * If distributed_product_mode = local, then visitor collects replacement map for tables that must be replaced + * with local tables. + * If distributed_product_mode = global or prefer_global_in_and_join setting is true, then visitor rewrites JOINs and IN functions that + * contain distributed tables to GLOBAL JOINs and GLOBAL IN functions. + * If distributed_product_mode = allow, then visitor does not rewrite query if there are multiple distributed tables. + */ +class DistributedProductModeRewriteInJoinVisitor : public InDepthQueryTreeVisitorWithContext +{ +public: + using Base = InDepthQueryTreeVisitorWithContext; + using Base::Base; + + explicit DistributedProductModeRewriteInJoinVisitor(const ContextPtr & context_) + : Base(context_) + {} + + struct InFunctionOrJoin + { + QueryTreeNodePtr query_node; + size_t subquery_depth = 0; + }; + + const std::unordered_map & getReplacementMap() const + { + return replacement_map; + } + + const std::vector & getGlobalInOrJoinNodes() const + { + return global_in_or_join_nodes; + } + + static bool needChildVisit(QueryTreeNodePtr & parent, QueryTreeNodePtr & child) + { + auto * function_node = parent->as(); + if (function_node && isNameOfGlobalInFunction(function_node->getFunctionName())) + return false; + + auto * join_node = parent->as(); + if (join_node && join_node->getLocality() == JoinLocality::Global && join_node->getRightTableExpression() == child) + return false; + + return true; + } + + void visitImpl(QueryTreeNodePtr & node) + { + auto * function_node = node->as(); + auto * join_node = node->as(); + + if ((function_node && isNameOfGlobalInFunction(function_node->getFunctionName())) || + (join_node && join_node->getLocality() == JoinLocality::Global)) + { + InFunctionOrJoin in_function_or_join_entry; + in_function_or_join_entry.query_node = node; + in_function_or_join_entry.subquery_depth = getSubqueryDepth(); + global_in_or_join_nodes.push_back(std::move(in_function_or_join_entry)); + return; + } + + if ((function_node && isNameOfLocalInFunction(function_node->getFunctionName())) || + (join_node && join_node->getLocality() != JoinLocality::Global)) + { + InFunctionOrJoin in_function_or_join_entry; + in_function_or_join_entry.query_node = node; + in_function_or_join_entry.subquery_depth = getSubqueryDepth(); + in_function_or_join_stack.push_back(in_function_or_join_entry); + return; + } + + if (node->getNodeType() == QueryTreeNodeType::TABLE) + tryRewriteTableNodeIfNeeded(node); + } + + void leaveImpl(QueryTreeNodePtr & node) + { + if (!in_function_or_join_stack.empty() && node.get() == in_function_or_join_stack.back().query_node.get()) + in_function_or_join_stack.pop_back(); + } + +private: + void tryRewriteTableNodeIfNeeded(const QueryTreeNodePtr & table_node) + { + const auto & table_node_typed = table_node->as(); + const auto * distributed_storage = typeid_cast(table_node_typed.getStorage().get()); + if (!distributed_storage) + return; + + bool distributed_valid_for_rewrite = distributed_storage->getShardCount() >= 2; + if (!distributed_valid_for_rewrite) + return; + + auto distributed_product_mode = getSettings().distributed_product_mode; + + if (distributed_product_mode == DistributedProductMode::LOCAL) + { + StorageID remote_storage_id = StorageID{distributed_storage->getRemoteDatabaseName(), + distributed_storage->getRemoteTableName()}; + auto resolved_remote_storage_id = getContext()->resolveStorageID(remote_storage_id); + const auto & distributed_storage_columns = table_node_typed.getStorageSnapshot()->metadata->getColumns(); + auto storage = std::make_shared(resolved_remote_storage_id, distributed_storage_columns); + auto replacement_table_expression = std::make_shared(std::move(storage), getContext()); + replacement_map.emplace(table_node.get(), std::move(replacement_table_expression)); + } + else if ((distributed_product_mode == DistributedProductMode::GLOBAL || getSettings().prefer_global_in_and_join) && + !in_function_or_join_stack.empty()) + { + auto * in_or_join_node_to_modify = in_function_or_join_stack.back().query_node.get(); + + if (auto * in_function_to_modify = in_or_join_node_to_modify->as()) + { + auto global_in_function_name = getGlobalInFunctionNameForLocalInFunctionName(in_function_to_modify->getFunctionName()); + auto global_in_function_resolver = FunctionFactory::instance().get(global_in_function_name, getContext()); + in_function_to_modify->resolveAsFunction(global_in_function_resolver->build(in_function_to_modify->getArgumentColumns())); + } + else if (auto * join_node_to_modify = in_or_join_node_to_modify->as()) + { + join_node_to_modify->setLocality(JoinLocality::Global); + } + + global_in_or_join_nodes.push_back(in_function_or_join_stack.back()); + } + else if (distributed_product_mode == DistributedProductMode::ALLOW) + { + return; + } + else if (distributed_product_mode == DistributedProductMode::DENY) + { + throw Exception(ErrorCodes::DISTRIBUTED_IN_JOIN_SUBQUERY_DENIED, + "Double-distributed IN/JOIN subqueries is denied (distributed_product_mode = 'deny'). " + "You may rewrite query to use local tables " + "in subqueries, or use GLOBAL keyword, or set distributed_product_mode to suitable value."); + } + } + + std::vector in_function_or_join_stack; + std::unordered_map replacement_map; + std::vector global_in_or_join_nodes; +}; + +/** Execute subquery node and put result in mutable context temporary table. + * Returns table node that is initialized with temporary table storage. + */ +TableNodePtr executeSubqueryNode(const QueryTreeNodePtr & subquery_node, + ContextMutablePtr & mutable_context, + size_t subquery_depth) +{ + auto subquery_hash = subquery_node->getTreeHash(); + String temporary_table_name = fmt::format("_data_{}_{}", subquery_hash.first, subquery_hash.second); + + const auto & external_tables = mutable_context->getExternalTables(); + auto external_table_it = external_tables.find(temporary_table_name); + if (external_table_it != external_tables.end()) + { + auto temporary_table_expression_node = std::make_shared(external_table_it->second, mutable_context); + temporary_table_expression_node->setTemporaryTableName(temporary_table_name); + return temporary_table_expression_node; + } + + auto subquery_options = SelectQueryOptions(QueryProcessingStage::Complete, subquery_depth, true /*is_subquery*/); + auto context_copy = Context::createCopy(mutable_context); + updateContextForSubqueryExecution(context_copy); + + InterpreterSelectQueryAnalyzer interpreter(subquery_node, context_copy, subquery_options); + auto & query_plan = interpreter.getQueryPlan(); + + auto sample_block_with_unique_names = query_plan.getCurrentDataStream().header; + makeUniqueColumnNamesInBlock(sample_block_with_unique_names); + + if (!blocksHaveEqualStructure(sample_block_with_unique_names, query_plan.getCurrentDataStream().header)) + { + auto actions_dag = ActionsDAG::makeConvertingActions( + query_plan.getCurrentDataStream().header.getColumnsWithTypeAndName(), + sample_block_with_unique_names.getColumnsWithTypeAndName(), + ActionsDAG::MatchColumnsMode::Position); + auto converting_step = std::make_unique(query_plan.getCurrentDataStream(), std::move(actions_dag)); + query_plan.addStep(std::move(converting_step)); + } + + Block sample = interpreter.getSampleBlock(); + NamesAndTypesList columns = sample.getNamesAndTypesList(); + + auto external_storage_holder = TemporaryTableHolder( + mutable_context, + ColumnsDescription{columns}, + ConstraintsDescription{}, + nullptr /*query*/, + true /*create_for_global_subquery*/); + + StoragePtr external_storage = external_storage_holder.getTable(); + auto temporary_table_expression_node = std::make_shared(external_storage, mutable_context); + temporary_table_expression_node->setTemporaryTableName(temporary_table_name); + + auto table_out = external_storage->write({}, external_storage->getInMemoryMetadataPtr(), mutable_context, /*async_insert=*/false); + auto io = interpreter.execute(); + io.pipeline.complete(std::move(table_out)); + CompletedPipelineExecutor executor(io.pipeline); + executor.execute(); + + mutable_context->addExternalTable(temporary_table_name, std::move(external_storage_holder)); + + return temporary_table_expression_node; +} + +} + +QueryTreeNodePtr buildQueryTreeForShard(SelectQueryInfo & query_info, QueryTreeNodePtr query_tree_to_modify) +{ + auto & planner_context = query_info.planner_context; + const auto & query_context = planner_context->getQueryContext(); + + CollectColumnSourceToColumnsVisitor collect_column_source_to_columns_visitor; + collect_column_source_to_columns_visitor.visit(query_tree_to_modify); + + const auto & column_source_to_columns = collect_column_source_to_columns_visitor.getColumnSourceToColumns(); + + DistributedProductModeRewriteInJoinVisitor visitor(query_info.planner_context->getQueryContext()); + visitor.visit(query_tree_to_modify); + + auto replacement_map = visitor.getReplacementMap(); + const auto & global_in_or_join_nodes = visitor.getGlobalInOrJoinNodes(); + + for (const auto & global_in_or_join_node : global_in_or_join_nodes) + { + if (auto * join_node = global_in_or_join_node.query_node->as()) + { + auto join_right_table_expression = join_node->getRightTableExpression(); + auto join_right_table_expression_node_type = join_right_table_expression->getNodeType(); + + QueryTreeNodePtr subquery_node; + + if (join_right_table_expression_node_type == QueryTreeNodeType::QUERY || + join_right_table_expression_node_type == QueryTreeNodeType::UNION) + { + subquery_node = join_right_table_expression; + } + else if (join_right_table_expression_node_type == QueryTreeNodeType::TABLE || + join_right_table_expression_node_type == QueryTreeNodeType::TABLE_FUNCTION) + { + const auto & columns = column_source_to_columns.at(join_right_table_expression).columns; + subquery_node = buildSubqueryToReadColumnsFromTableExpression(columns, + join_right_table_expression, + planner_context->getQueryContext()); + } + else + { + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Expected JOIN right table expression to be table, table function, query or union node. Actual {}", + join_right_table_expression->formatASTForErrorMessage()); + } + + auto temporary_table_expression_node = executeSubqueryNode(subquery_node, + planner_context->getMutableQueryContext(), + global_in_or_join_node.subquery_depth); + temporary_table_expression_node->setAlias(join_right_table_expression->getAlias()); + + replacement_map.emplace(join_right_table_expression.get(), std::move(temporary_table_expression_node)); + continue; + } + else if (auto * in_function_node = global_in_or_join_node.query_node->as()) + { + auto & in_function_subquery_node = in_function_node->getArguments().getNodes().at(1); + auto in_function_node_type = in_function_subquery_node->getNodeType(); + if (in_function_node_type != QueryTreeNodeType::QUERY && in_function_node_type != QueryTreeNodeType::UNION) + continue; + + auto temporary_table_expression_node = executeSubqueryNode(in_function_subquery_node, + planner_context->getMutableQueryContext(), + global_in_or_join_node.subquery_depth); + + in_function_subquery_node = std::move(temporary_table_expression_node); + } + else + { + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Expected global IN or JOIN query node. Actual {}", + global_in_or_join_node.query_node->formatASTForErrorMessage()); + } + } + + if (!replacement_map.empty()) + query_tree_to_modify = query_tree_to_modify->cloneAndReplace(replacement_map); + + removeGroupingFunctionSpecializations(query_tree_to_modify); + + return query_tree_to_modify; +} + +} diff --git a/src/Storages/buildQueryTreeForShard.h b/src/Storages/buildQueryTreeForShard.h new file mode 100644 index 00000000000..05d63faeb9f --- /dev/null +++ b/src/Storages/buildQueryTreeForShard.h @@ -0,0 +1,15 @@ +#pragma once + +#include + +namespace DB +{ + +struct SelectQueryInfo; + +class IQueryTreeNode; +using QueryTreeNodePtr = std::shared_ptr; + +QueryTreeNodePtr buildQueryTreeForShard(SelectQueryInfo & query_info, QueryTreeNodePtr query_tree_to_modify); + +} diff --git a/tests/queries/0_stateless/02771_parallel_replicas_analyzer.reference b/tests/queries/0_stateless/02771_parallel_replicas_analyzer.reference index 4e93c530f7b..f688db940d9 100644 --- a/tests/queries/0_stateless/02771_parallel_replicas_analyzer.reference +++ b/tests/queries/0_stateless/02771_parallel_replicas_analyzer.reference @@ -9,4 +9,4 @@ 7885388429666205427 8124171311239967992 1 1 -- Simple query with analyzer and pure parallel replicas\nSELECT number\nFROM join_inner_table__fuzz_146_replicated\n SETTINGS\n allow_experimental_analyzer = 1,\n max_parallel_replicas = 2,\n cluster_for_parallel_replicas = \'test_cluster_one_shard_three_replicas_localhost\',\n allow_experimental_parallel_reading_from_replicas = 1,\n use_hedged_requests = 0; -0 2 SELECT `default`.`join_inner_table__fuzz_146_replicated`.`number` AS `number` FROM `default`.`join_inner_table__fuzz_146_replicated` +0 2 SELECT `join_inner_table__fuzz_146_replicated`.`number` AS `number` FROM `default`.`join_inner_table__fuzz_146_replicated` SETTINGS allow_experimental_analyzer = 1, max_parallel_replicas = 2, cluster_for_parallel_replicas = \'test_cluster_one_shard_three_replicas_localhost\', allow_experimental_parallel_reading_from_replicas = 1, use_hedged_requests = 0 From d05f89f8f5ec3793256cae1557e2af60650290cf Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Mon, 12 Jun 2023 17:33:15 +0000 Subject: [PATCH 0794/1072] Fix style --- src/Storages/StorageDistributed.cpp | 1 - src/Storages/buildQueryTreeForShard.cpp | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/StorageDistributed.cpp b/src/Storages/StorageDistributed.cpp index 1ec45ce3d57..0472ce6f832 100644 --- a/src/Storages/StorageDistributed.cpp +++ b/src/Storages/StorageDistributed.cpp @@ -154,7 +154,6 @@ namespace ErrorCodes extern const int DISTRIBUTED_TOO_MANY_PENDING_BYTES; extern const int ARGUMENT_OUT_OF_BOUND; extern const int TOO_LARGE_DISTRIBUTED_DEPTH; - extern const int DISTRIBUTED_IN_JOIN_SUBQUERY_DENIED; } namespace ActionLocks diff --git a/src/Storages/buildQueryTreeForShard.cpp b/src/Storages/buildQueryTreeForShard.cpp index a42d67d9aa7..fa4730cbe84 100644 --- a/src/Storages/buildQueryTreeForShard.cpp +++ b/src/Storages/buildQueryTreeForShard.cpp @@ -21,6 +21,7 @@ namespace DB namespace ErrorCodes { + extern const int LOGICAL_ERROR; extern const int DISTRIBUTED_IN_JOIN_SUBQUERY_DENIED; } From a4285d56b22aafe453309ce728c7380666626576 Mon Sep 17 00:00:00 2001 From: ltrk2 <107155950+ltrk2@users.noreply.github.com> Date: Wed, 31 May 2023 12:37:46 -0700 Subject: [PATCH 0795/1072] Fix compilation error on big-endian platforms --- src/Functions/FunctionsCodingIP.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Functions/FunctionsCodingIP.cpp b/src/Functions/FunctionsCodingIP.cpp index 2671418fc7b..897b24d90e0 100644 --- a/src/Functions/FunctionsCodingIP.cpp +++ b/src/Functions/FunctionsCodingIP.cpp @@ -580,7 +580,7 @@ private: #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ unalignedStoreLittleEndian(buf + 8, 0x00000000FFFF0000ull | (static_cast(ntohl(in)) << 32)); #else - unalignedStoreLittleEndian(buf + 8, 0x00000000FFFF0000ull | (static_cast(__builtin_bswap32(in))) << 32)); + unalignedStoreLittleEndian(buf + 8, 0x00000000FFFF0000ull | (static_cast(std::byteswap(in))) << 32); #endif } }; From edb4a644b1ef9fdcba7f53c60ed37438d610ae9a Mon Sep 17 00:00:00 2001 From: ltrk2 <107155950+ltrk2@users.noreply.github.com> Date: Thu, 8 Jun 2023 10:21:24 -0400 Subject: [PATCH 0796/1072] Update FunctionsCodingIP.cpp --- src/Functions/FunctionsCodingIP.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Functions/FunctionsCodingIP.cpp b/src/Functions/FunctionsCodingIP.cpp index 897b24d90e0..7bdbac6531d 100644 --- a/src/Functions/FunctionsCodingIP.cpp +++ b/src/Functions/FunctionsCodingIP.cpp @@ -580,7 +580,7 @@ private: #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ unalignedStoreLittleEndian(buf + 8, 0x00000000FFFF0000ull | (static_cast(ntohl(in)) << 32)); #else - unalignedStoreLittleEndian(buf + 8, 0x00000000FFFF0000ull | (static_cast(std::byteswap(in))) << 32); + unalignedStoreLittleEndian(buf + 8, 0x00000000FFFF0000ull | (static_cast(std::byteswap(in)) << 32)); #endif } }; From e6e8576864421bd2db043d269dfe65d1bf4f85aa Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Mon, 12 Jun 2023 17:04:33 -0300 Subject: [PATCH 0797/1072] Update mergetree.md --- docs/en/engines/table-engines/mergetree-family/mergetree.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/engines/table-engines/mergetree-family/mergetree.md b/docs/en/engines/table-engines/mergetree-family/mergetree.md index 61276110138..dbde1a90f67 100644 --- a/docs/en/engines/table-engines/mergetree-family/mergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/mergetree.md @@ -1138,7 +1138,7 @@ These parameters define the cache layer: Cache parameters: - `path` — The path where metadata for the cache is stored. -- `max_size` — The size (amount of memory) that the cache can grow to. +- `max_size` — The size (amount of disk space) that the cache can grow to. :::tip There are several other cache parameters that you can use to tune your storage, see [using local cache](/docs/en/operations/storing-data.md/#using-local-cache) for the details. From 4f39ee51ae867b219735290125f8dc91d461abf6 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 12 Jun 2023 20:06:57 +0000 Subject: [PATCH 0798/1072] Update Annoy docs --- .../mergetree-family/annindexes.md | 20 +++++++++---------- src/Parsers/ASTIndexDeclaration.h | 3 +++ src/Parsers/ParserCreateIndexQuery.cpp | 4 ++-- src/Parsers/ParserCreateQuery.cpp | 4 ++-- 4 files changed, 17 insertions(+), 14 deletions(-) diff --git a/docs/en/engines/table-engines/mergetree-family/annindexes.md b/docs/en/engines/table-engines/mergetree-family/annindexes.md index 2b0b77a0735..16e244077a7 100644 --- a/docs/en/engines/table-engines/mergetree-family/annindexes.md +++ b/docs/en/engines/table-engines/mergetree-family/annindexes.md @@ -100,7 +100,7 @@ ANN indexes support two types of queries: :::tip To avoid writing out large vectors, you can use [query -parameters](/docs/en//interfaces/cli.md#queries-with-parameters-cli-queries-with-parameters), e.g. +parameters](/docs/en/interfaces/cli.md#queries-with-parameters-cli-queries-with-parameters), e.g. ```bash clickhouse-client --param_vec='hello' --query="SELECT * FROM table WHERE L2Distance(vectors, {vec: Array(Float32)}) < 1.0" @@ -128,14 +128,14 @@ granularity of granules, sub-indexes extrapolate matching rows to granule granul skip data at the granularity of index blocks. The `GRANULARITY` parameter determines how many ANN sub-indexes are created. Bigger `GRANULARITY` values mean fewer but larger ANN -sub-indexes, up to the point where a column (or a column part) has only a single sub-index. In that case, the sub-index has a "global" view of -all column rows and can directly return all granules of the column (part) with relevant rows (there are at at most `LIMIT `-many -such granules). In a second step, ClickHouse will load these granules and identify the actually best rows by performing a brute-force distance -calculation over all rows of the granules. With a small `GRANULARITY` value, each of the sub-indexes returns up to `LIMIT N`-many granules. -As a result, more granules need to be loaded and post-filtered. Note that the search accuracy is with both cases equally good, only the -processing performance differs. It is generally recommended to use a large `GRANULARITY` for ANN indexes and fall back to a smaller -`GRANULARITY` values only in case of problems like excessive memory consumption of the ANN structures. If no `GRANULARITY` was specified for -ANN indexes, the default value is 100 million. +sub-indexes, up to the point where a column (or a column's data part) has only a single sub-index. In that case, the sub-index has a +"global" view of all column rows and can directly return all granules of the column (part) with relevant rows (there are at most `LIMIT +`-many such granules). In a second step, ClickHouse will load these granules and identify the actually best rows by performing a +brute-force distance calculation over all rows of the granules. With a small `GRANULARITY` value, each of the sub-indexes returns up to +`LIMIT N`-many granules. As a result, more granules need to be loaded and post-filtered. Note that the search accuracy is with both cases +equally good, only the processing performance differs. It is generally recommended to use a large `GRANULARITY` for ANN indexes and fall +back to a smaller `GRANULARITY` values only in case of problems like excessive memory consumption of the ANN structures. If no `GRANULARITY` +was specified for ANN indexes, the default value is 100 million. # Available ANN Indexes @@ -204,7 +204,7 @@ values mean more accurate results at the cost of longer query runtime: ``` sql SELECT * -FROM table_name [WHERE ...] +FROM table_name ORDER BY L2Distance(vectors, Point) LIMIT N SETTINGS annoy_index_search_k_nodes=100 diff --git a/src/Parsers/ASTIndexDeclaration.h b/src/Parsers/ASTIndexDeclaration.h index bd52a611f3f..6ed241f75ab 100644 --- a/src/Parsers/ASTIndexDeclaration.h +++ b/src/Parsers/ASTIndexDeclaration.h @@ -12,6 +12,9 @@ class ASTFunction; class ASTIndexDeclaration : public IAST { public: + static const auto DEFAULT_INDEX_GRANULARITY = 1uz; + static const auto DEFAULT_ANNOY_INDEX_GRANULARITY = 100'000'000uz; + String name; IAST * expr; ASTFunction * type; diff --git a/src/Parsers/ParserCreateIndexQuery.cpp b/src/Parsers/ParserCreateIndexQuery.cpp index 57afd3fb99e..f231573b920 100644 --- a/src/Parsers/ParserCreateIndexQuery.cpp +++ b/src/Parsers/ParserCreateIndexQuery.cpp @@ -52,9 +52,9 @@ bool ParserCreateIndexDeclaration::parseImpl(Pos & pos, ASTPtr & node, Expected else { if (index->type->name == "annoy") - index->granularity = 100'000'000; + index->granularity = ASTIndexDeclaration::DEFAULT_ANNOY_INDEX_GRANULARITY; else - index->granularity = 1; + index->granularity = ASTIndexDeclaration::DEFAULT_INDEX_GRANULARITY; } node = index; diff --git a/src/Parsers/ParserCreateQuery.cpp b/src/Parsers/ParserCreateQuery.cpp index c6273f369b1..adf3513ba40 100644 --- a/src/Parsers/ParserCreateQuery.cpp +++ b/src/Parsers/ParserCreateQuery.cpp @@ -147,9 +147,9 @@ bool ParserIndexDeclaration::parseImpl(Pos & pos, ASTPtr & node, Expected & expe else { if (index->type->name == "annoy") - index->granularity = 100'000'000; + index->granularity = ASTIndexDeclaration::DEFAULT_ANNOY_INDEX_GRANULARITY; else - index->granularity = 1; + index->granularity = ASTIndexDeclaration::DEFAULT_INDEX_GRANULARITY; } node = index; From 002c15823c26c3e0c577a4dd8ec8f319b3120a78 Mon Sep 17 00:00:00 2001 From: ltrk2 <107155950+ltrk2@users.noreply.github.com> Date: Mon, 12 Jun 2023 16:44:46 -0700 Subject: [PATCH 0799/1072] Perform in-place endianness transform because of padding --- src/AggregateFunctions/ReservoirSamplerDeterministic.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/AggregateFunctions/ReservoirSamplerDeterministic.h b/src/AggregateFunctions/ReservoirSamplerDeterministic.h index b1a39a5dcc5..25d3b182654 100644 --- a/src/AggregateFunctions/ReservoirSamplerDeterministic.h +++ b/src/AggregateFunctions/ReservoirSamplerDeterministic.h @@ -190,12 +190,12 @@ public: /// Here we ensure that padding is zero without changing the protocol. /// TODO: After implementation of "versioning aggregate function state", /// change the serialization format. - Element elem; memset(&elem, 0, sizeof(elem)); elem = samples[i]; - writeBinaryLittleEndian(elem, buf); + DB::transformEndianness(elem); + DB::writeString(reinterpret_cast(&elem), sizeof(elem), buf); } } From 959fde4491e33586916efcf689ba1a4b361e6865 Mon Sep 17 00:00:00 2001 From: JackyWoo Date: Tue, 13 Jun 2023 09:33:38 +0800 Subject: [PATCH 0800/1072] add notifications in docs --- docs/en/engines/table-engines/integrations/redis.md | 9 ++++++++- src/Storages/StorageRedis.cpp | 7 +++++-- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/docs/en/engines/table-engines/integrations/redis.md b/docs/en/engines/table-engines/integrations/redis.md index 6cfc60c836c..68235a89d33 100644 --- a/docs/en/engines/table-engines/integrations/redis.md +++ b/docs/en/engines/table-engines/integrations/redis.md @@ -6,7 +6,7 @@ sidebar_label: Redis # Redis -This engine allows integrating ClickHouse with [Redis](https://redis.io/). +This engine allows integrating ClickHouse with [Redis](https://redis.io/). For Redis takes kv model, we strongly recommend you only query it in a point way, such as `where k=xx` or `where k in (xx, xx)`. ## Creating a Table {#creating-a-table} @@ -110,3 +110,10 @@ Flush Redis db asynchronously. Also `Truncate` support SYNC mode. ```sql TRUNCATE TABLE redis_table SYNC; ``` + + +## Limitations {#limitations} + +Redis engine also support scanning query, such as `where k > xx`, but it has some limitations: +1. Scanning query may produce some duplicated keys in a very rare case when it is rehashing, details see [Redis Scan](https://github.com/redis/redis/blob/e4d183afd33e0b2e6e8d1c79a832f678a04a7886/src/dict.c#L1186-L1269) +2. During the scanning keys could be created and deleted, so the resulting dataset can not represent a valid point in time. diff --git a/src/Storages/StorageRedis.cpp b/src/Storages/StorageRedis.cpp index 71c84443d8e..ddb1b62c7b0 100644 --- a/src/Storages/StorageRedis.cpp +++ b/src/Storages/StorageRedis.cpp @@ -3,12 +3,12 @@ #include #include #include +#include #include #include #include #include #include -#include #include #include @@ -19,6 +19,7 @@ #include #include +#include #include #include @@ -87,9 +88,11 @@ public: return storage.getBySerializedKeys(raw_keys, nullptr); } - /// TODO scan may get duplicated keys + /// TODO scan may get duplicated keys when Redis is rehashing, it is a very rare case. Chunk generateFullScan() { + checkStackSize(); + /// redis scan ending if (iterator == 0) return {}; From 2395b25f9e8828ea1adbf6d303832a1ea7ee97a8 Mon Sep 17 00:00:00 2001 From: Alexey Gerasimchuck Date: Tue, 13 Jun 2023 01:55:34 +0000 Subject: [PATCH 0801/1072] Changes after review --- docs/en/interfaces/cli.md | 18 +++++++++++------- docs/ru/interfaces/cli.md | 14 +++++++++----- src/Client/ConnectionString.cpp | 6 +++--- .../02784_connection_string.reference | 1 + .../0_stateless/02784_connection_string.sh | 3 +++ 5 files changed, 27 insertions(+), 15 deletions(-) diff --git a/docs/en/interfaces/cli.md b/docs/en/interfaces/cli.md index fc24bdcad68..6736d05e65f 100644 --- a/docs/en/interfaces/cli.md +++ b/docs/en/interfaces/cli.md @@ -202,15 +202,16 @@ Instead of --host, --port, --user and --password options, ClickHouse client also clickhouse-client alternatively supports connecting to clickhouse server using a connection string similar to [MongoDB](https://www.mongodb.com/docs/manual/reference/connection-string/), [PostgreSQL](https://www.postgresql.org/docs/current/libpq-connect.html#LIBPQ-CONNSTRING), [MySQL](https://dev.mysql.com/doc/refman/8.0/en/connecting-using-uri-or-key-value-pairs.html#connecting-using-uri). It has the following syntax: ```text -clickhouse:[//[user_info@][hosts_and_ports]][/database][?query_parameters] +clickhouse:[//[user[:password]@][hosts_and_ports]][/database][?query_parameters] ``` Where -- `user_spec` - (optional) is a user and an optional password, -- `hostspec` - (optional) is a list of hosts and optional ports `host[:port] [, host:[port]], ...`, +- `user` - (optional) is a user name, +- `password` - (optional) is a user password. If `:` is specified and the password is blank, the client will prompt for the user's password. +- `hosts_and_ports` - (optional) is a list of hosts and optional ports `host[:port] [, host:[port]], ...`, - `database` - (optional) is the database name, -- `paramspec` - (optional) is a list of key-value pairs `param1=value1[,¶m2=value2], ...`. For some parameters, no value is required. Parameter names and values are case-sensitive. +- `query_parameters` - (optional) is a list of key-value pairs `param1=value1[,¶m2=value2], ...`. For some parameters, no value is required. Parameter names and values are case-sensitive. @@ -239,7 +240,7 @@ URI allows multiple hosts to be connected to. Connection strings can contain mul ### Percent encoding {#connection_string_uri_percent_encoding} -Non-US ASCII characters in the user name, password, hosts, database or query parameters must be [percent-encoded](https://en.wikipedia.org/wiki/URL_encoding). +Non-US ASCII, spaces and special characters, and special characters in the `user`, `password`, `hosts`, `database` and `query parameters` must be [percent-encoded](https://en.wikipedia.org/wiki/URL_encoding). ### Examples {#connection_string_examples} @@ -306,10 +307,13 @@ Connect to default host using default port, default user, and default database. clickhouse-client clickhouse: ``` -Connect to the default host using the default port, using user user_name and no password. +Connect to the default host using the default port, using user `my_user` and no password. ``` bash -clickhouse-client clickhouse://user_name@ +clickhouse-client clickhouse://my_user@ + +# Using a blank password between : and @ means to asking user to enter the password before starting the connection. +clickhouse-client clickhouse://my_user:@ ``` Connect to localhost using email as the user name. `@` symbol is percent encoded to `%40`. diff --git a/docs/ru/interfaces/cli.md b/docs/ru/interfaces/cli.md index ee29b122afb..794ac60ec83 100644 --- a/docs/ru/interfaces/cli.md +++ b/docs/ru/interfaces/cli.md @@ -147,12 +147,13 @@ $ clickhouse-client --param_tbl="numbers" --param_db="system" --param_col="numbe clickhouse-client также поддерживает подключение к серверу clickhouse с помощью строки подключения, аналогичной [MongoDB](https://www.mongodb.com/docs/manual/reference/connection-string/), [PostgreSQL](https://www.postgresql.org/docs/current/libpq-connect.html#LIBPQ-CONNSTRING), [MySQL](https://dev.mysql.com/doc/refman/8.0/en/connecting-using-uri-or-key-value-pairs.html#connecting-using-uri). Она имеет следующий синтаксис: ```text -clickhouse:[//[user_info@][hosts_and_ports]][/database][?query_parameters] +clickhouse:[//[user[:password]@][hosts_and_ports]][/database][?query_parameters] ``` Где -- `user_spec` - (необязательно) - это пользователь и необязательный пароль, +- `user` - (необязательно) - это имя пользователя, +- `password` - (необязательно) - Пароль пользователя. Если символ `:` укаказан, и пароль пуст, то клиент запросит ввести пользователя пароль. - `hostspec` - (необязательно) - список хостов и необязательных портов. `host[:port] [, host:[port]], ...`, - `database` - (необязательно) - это имя базы данных, - `paramspec` - (опционально) список пар ключ-значение `param1=value1[,¶m2=value2], ...`. Для некоторых параметров значение не требуется. Имена и значения параметров чувствительны к регистру. @@ -182,7 +183,7 @@ URI позволяет подключаться к нескольким хост ### Кодирование URI {#connection_string_uri_percent_encoding} -Не US ASCII символы в имени пользователя, пароле, хостах, базе данных или параметрах запроса должны быть [закодированы](https://ru.wikipedia.org/wiki/URL#%D0%9A%D0%BE%D0%B4%D0%B8%D1%80%D0%BE%D0%B2%D0%B0%D0%BD%D0%B8%D0%B5_URL). +Не US ASCII и специальные символы в имени пользователя, пароле, хостах, базе данных и параметрах запроса должны быть [закодированы](https://ru.wikipedia.org/wiki/URL#%D0%9A%D0%BE%D0%B4%D0%B8%D1%80%D0%BE%D0%B2%D0%B0%D0%BD%D0%B8%D0%B5_URL). ### Примеры {#connection_string_examples} @@ -248,10 +249,13 @@ clickhouse-client clickhouse://localhost/my_database -s clickhouse-client clickhouse: ``` -Подключиться к хосту по умолчанию через порт по умолчанию, используя имя пользователя user_name без пароля. +Подключиться к хосту по умолчанию через порт по умолчанию, используя имя пользователя `my_user` без пароля. ``` bash -clickhouse-client clickhouse://user_name@ +clickhouse-client clickhouse://my_user@ + +# Использование пустого пароля между : и @ означает, что пользователь должен ввести пароль перед началом соединения. +clickhouse-client clickhouse://my_user:@ ``` Подключиться к localhost, используя электронную почту, как имя пользователя. Символ `@` закодирован как `%40`. diff --git a/src/Client/ConnectionString.cpp b/src/Client/ConnectionString.cpp index e1f39369b2a..8f0a0980f51 100644 --- a/src/Client/ConnectionString.cpp +++ b/src/Client/ConnectionString.cpp @@ -142,11 +142,11 @@ bool tryParseConnectionString( try { /** Poco::URI doesn't support several hosts in URI. - * Split string clickhouse:[user_info]host1:port1, ... , hostN:portN[database]?[query_parameters] + * Split string clickhouse:[user[:password]@]host1:port1, ... , hostN:portN[database]?[query_parameters] * into multiple string for each host: - * clickhouse:[user_info]host1:port1[database]?[query_parameters] + * clickhouse:[user[:password]@]host1:port1[database]?[query_parameters] * ... - * clickhouse:[user_info]hostN:portN[database]?[query_parameters] + * clickhouse:[user[:password]@]hostN:portN[database]?[query_parameters] */ Poco::URI uri; const auto * last_host_begin = connection_string.begin() + offset; diff --git a/tests/queries/0_stateless/02784_connection_string.reference b/tests/queries/0_stateless/02784_connection_string.reference index 6a36abae8e0..9d58d485a14 100644 --- a/tests/queries/0_stateless/02784_connection_string.reference +++ b/tests/queries/0_stateless/02784_connection_string.reference @@ -121,5 +121,6 @@ BAD_ARGUMENTS BAD_ARGUMENTS BAD_ARGUMENTS BAD_ARGUMENTS +BAD_ARGUMENTS Authentication failed Authentication failed diff --git a/tests/queries/0_stateless/02784_connection_string.sh b/tests/queries/0_stateless/02784_connection_string.sh index fce93fdad74..042f5b2108d 100755 --- a/tests/queries/0_stateless/02784_connection_string.sh +++ b/tests/queries/0_stateless/02784_connection_string.sh @@ -116,6 +116,9 @@ runClient "clickhouse:///?" --port "$CLICKHOUSE_PORT_TCP" --host "$CLICKHOUSE_HO runClient "clickhouse://:/?" --port "$CLICKHOUSE_PORT_TCP" --host "$CLICKHOUSE_HOST" 2>&1 | grep -o 'BAD_ARGUMENTS' runClient "clickhouse:" --database "$CLICKHOUSE_DATABASE" --port "$CLICKHOUSE_PORT_TCP" --host "$CLICKHOUSE_HOST" 2>&1 | grep -o 'BAD_ARGUMENTS' +# Using clickhouse-client and connection is prohibited +runClient "clickhouse:" --connection "connection" 2>&1 | grep -o 'BAD_ARGUMENTS' + # Space is used in connection string (This is prohibited). runClient " clickhouse:" 2>&1 | grep -o 'BAD_ARGUMENTS' runClient "clickhouse: " 2>&1 | grep -o 'BAD_ARGUMENTS' From 6839a1318c8a656c20e7a1ed8e256fd51408820e Mon Sep 17 00:00:00 2001 From: Alexey Gerasimchuck Date: Tue, 13 Jun 2023 04:03:30 +0000 Subject: [PATCH 0802/1072] minor changes in docs --- docs/en/interfaces/cli.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/interfaces/cli.md b/docs/en/interfaces/cli.md index 6736d05e65f..b5134ea30c0 100644 --- a/docs/en/interfaces/cli.md +++ b/docs/en/interfaces/cli.md @@ -240,7 +240,7 @@ URI allows multiple hosts to be connected to. Connection strings can contain mul ### Percent encoding {#connection_string_uri_percent_encoding} -Non-US ASCII, spaces and special characters, and special characters in the `user`, `password`, `hosts`, `database` and `query parameters` must be [percent-encoded](https://en.wikipedia.org/wiki/URL_encoding). +Non-US ASCII, spaces and special characters in the `user`, `password`, `hosts`, `database` and `query parameters` must be [percent-encoded](https://en.wikipedia.org/wiki/URL_encoding). ### Examples {#connection_string_examples} From a3ff5df205ae8353395023ee4ef0bf83bee31458 Mon Sep 17 00:00:00 2001 From: Victor Krasnov Date: Tue, 13 Jun 2023 09:16:06 +0300 Subject: [PATCH 0803/1072] Remove reduntant header SELECT from the test --- .../0_stateless/02783_parseDateTimeBestEffort_syslog.reference | 1 - .../0_stateless/02783_parseDateTimeBestEffort_syslog.sql | 2 -- 2 files changed, 3 deletions(-) diff --git a/tests/queries/0_stateless/02783_parseDateTimeBestEffort_syslog.reference b/tests/queries/0_stateless/02783_parseDateTimeBestEffort_syslog.reference index 3ec93143e0e..ee75d68bff4 100644 --- a/tests/queries/0_stateless/02783_parseDateTimeBestEffort_syslog.reference +++ b/tests/queries/0_stateless/02783_parseDateTimeBestEffort_syslog.reference @@ -1,4 +1,3 @@ -parseDateTimeBestEffort around_June_7 res res_sam res_auc res_null res_null_sam res_null_auc res_zero res_zero_sam res_zero_auc res_us res_us_sam res_us_auc res_us_null res_us_null_sam res_us_null_auc res_us_zero res_us_zero_sam res_us_zero_auc res64 res64_sam res64_auc res64_null res64_null_sam res64_null_auc res64_zero res64_zero_sam res64_zero_auc res64_us res64_us_sam res64_us_auc res64_us_null res64_us_null_sam res64_us_null_auc res64_us_zero res64_us_zero_sam res64_us_zero_auc Jun 6 00:00:00 2023-06-06 00:00:00 2023-06-06 00:00:00 2023-06-06 00:00:00 2023-06-06 00:00:00 2023-06-06 00:00:00 2023-06-06 00:00:00 2023-06-06 00:00:00 2023-06-06 00:00:00 2023-06-06 00:00:00 2023-06-06 00:00:00 2023-06-06 00:00:00 2023-06-06 00:00:00 2023-06-06 00:00:00 2023-06-06 00:00:00 2023-06-06 00:00:00 2023-06-06 00:00:00 2023-06-06 00:00:00 2023-06-06 00:00:00 2023-06-06 00:00:00.000 2023-06-06 00:00:00.000 2023-06-06 00:00:00.000 2023-06-06 00:00:00.000 2023-06-06 00:00:00.000 2023-06-06 00:00:00.000 2023-06-06 00:00:00.000 2023-06-06 00:00:00.000 2023-06-06 00:00:00.000 2023-06-06 00:00:00.000 2023-06-06 00:00:00.000 2023-06-06 00:00:00.000 2023-06-06 00:00:00.000 2023-06-06 00:00:00.000 2023-06-06 00:00:00.000 2023-06-06 00:00:00.000 2023-06-06 00:00:00.000 2023-06-06 00:00:00.000 diff --git a/tests/queries/0_stateless/02783_parseDateTimeBestEffort_syslog.sql b/tests/queries/0_stateless/02783_parseDateTimeBestEffort_syslog.sql index 52975cb5bbf..742ae03ddab 100644 --- a/tests/queries/0_stateless/02783_parseDateTimeBestEffort_syslog.sql +++ b/tests/queries/0_stateless/02783_parseDateTimeBestEffort_syslog.sql @@ -1,5 +1,3 @@ -SELECT 'parseDateTimeBestEffort'; - WITH 86400 AS secs_in_day, now() AS ts_now, From 879db5098a80d8c8c391296b672332a3367f6ac9 Mon Sep 17 00:00:00 2001 From: Val Doroshchuk Date: Mon, 12 Jun 2023 13:47:38 +0200 Subject: [PATCH 0804/1072] MaterializedMySQL: Add test_named_collections --- .../configs/users.xml | 1 + .../materialize_with_ddl.py | 31 +++++++++++++++++++ .../test_materialized_mysql_database/test.py | 6 ++++ 3 files changed, 38 insertions(+) diff --git a/tests/integration/test_materialized_mysql_database/configs/users.xml b/tests/integration/test_materialized_mysql_database/configs/users.xml index 3669fbb46ba..7a7529c94bb 100644 --- a/tests/integration/test_materialized_mysql_database/configs/users.xml +++ b/tests/integration/test_materialized_mysql_database/configs/users.xml @@ -14,6 +14,7 @@ ::/0 default + 1 diff --git a/tests/integration/test_materialized_mysql_database/materialize_with_ddl.py b/tests/integration/test_materialized_mysql_database/materialize_with_ddl.py index 2bbbe9a3f13..7fdb73ea1f3 100644 --- a/tests/integration/test_materialized_mysql_database/materialize_with_ddl.py +++ b/tests/integration/test_materialized_mysql_database/materialize_with_ddl.py @@ -2265,3 +2265,34 @@ def dropddl(clickhouse_node, mysql_node, mysql_host): ) mysql_node.query(f"DROP DATABASE {db}") clickhouse_node.query(f"DROP DATABASE {db}") + + +def named_collections(clickhouse_node, mysql_node, service_name): + db = "named_collections" + mysql_node.query(f"DROP DATABASE IF EXISTS {db}") + clickhouse_node.query(f"DROP DATABASE IF EXISTS {db}") + mysql_node.query(f"CREATE DATABASE {db}") + mysql_node.query( + f"CREATE TABLE {db}.t1 (id INT PRIMARY KEY, name VARCHAR(64), val INT)" + ) + mysql_node.query( + f"INSERT INTO {db}.t1 (id, name, val) VALUES (1, 'a', 1), (2, 'b', 2)" + ) + + clickhouse_node.query( + f"""CREATE NAMED COLLECTION {db} AS + user = 'root', + password = 'clickhouse', + host = '{service_name}', + port = 3306, + database = '{db}' + """ + ) + clickhouse_node.query(f"CREATE DATABASE {db} ENGINE = MaterializedMySQL({db})") + check_query( + clickhouse_node, + f"/* expect: (1, 'a', 1), (2, 'b', 2) */ SELECT * FROM {db}.t1", + "1\ta\t1\n2\tb\t2\n", + ) + clickhouse_node.query(f"DROP DATABASE IF EXISTS {db}") + mysql_node.query(f"DROP DATABASE IF EXISTS {db}") diff --git a/tests/integration/test_materialized_mysql_database/test.py b/tests/integration/test_materialized_mysql_database/test.py index a22d73061ae..5272fb2ff8c 100644 --- a/tests/integration/test_materialized_mysql_database/test.py +++ b/tests/integration/test_materialized_mysql_database/test.py @@ -523,3 +523,9 @@ def test_materialized_database_mysql_drop_ddl( ): materialize_with_ddl.dropddl(clickhouse_node, started_mysql_8_0, "mysql80") materialize_with_ddl.dropddl(clickhouse_node, started_mysql_5_7, "mysql57") + + +def test_named_collections(started_cluster, started_mysql_8_0, clickhouse_node): + materialize_with_ddl.named_collections( + clickhouse_node, started_mysql_8_0, "mysql80" + ) From 9c939b2f3db3b47116d739a3b81ab7c353e6e0bf Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Tue, 13 Jun 2023 10:54:54 +0200 Subject: [PATCH 0805/1072] Fix heading and sidebar for azureBlobStorage table function --- docs/en/sql-reference/table-functions/azureBlobStorage.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/en/sql-reference/table-functions/azureBlobStorage.md b/docs/en/sql-reference/table-functions/azureBlobStorage.md index 369bf7a964d..7a362710b9c 100644 --- a/docs/en/sql-reference/table-functions/azureBlobStorage.md +++ b/docs/en/sql-reference/table-functions/azureBlobStorage.md @@ -1,10 +1,10 @@ --- slug: /en/sql-reference/table-functions/azure_blob_storage -sidebar_label: azure_blob_storage +sidebar_label: azureBlobStorage keywords: [azure blob storage] --- -# azure\_blob\_storage Table Function +# azureBlobStorage Table Function Provides a table-like interface to select/insert files in [Azure Blob Storage](https://azure.microsoft.com/en-us/products/storage/blobs). This table function is similar to the [s3 function](../../sql-reference/table-functions/s3.md). From 79bc8847333f6e8e3653e63b1fed6a063bfb6302 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Tue, 13 Jun 2023 08:54:25 +0000 Subject: [PATCH 0806/1072] Stabilize tests --- src/Functions/FunctionSnowflake.h | 58 ++++++++++--------- .../01942_snowflakeToDateTime.reference | 3 +- .../0_stateless/01942_snowflakeToDateTime.sql | 10 +++- 3 files changed, 41 insertions(+), 30 deletions(-) diff --git a/src/Functions/FunctionSnowflake.h b/src/Functions/FunctionSnowflake.h index 0a47534c47d..ace2fc54f09 100644 --- a/src/Functions/FunctionSnowflake.h +++ b/src/Functions/FunctionSnowflake.h @@ -54,22 +54,19 @@ public: ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override { const auto & src = arguments[0]; - const auto & col = *src.column; + const auto & src_column = *src.column; auto res_column = ColumnInt64::create(input_rows_count); - auto & result_data = res_column->getData(); + auto & res_data = res_column->getData(); - const auto & source_data = typeid_cast(col).getData(); + const auto & src_data = typeid_cast &>(src_column).getData(); for (size_t i = 0; i < input_rows_count; ++i) - { - result_data[i] = (Int64(source_data[i]) * 1000 - snowflake_epoch) << time_shift; - } + res_data[i] = (UInt32(src_data[i]) * 1000 - snowflake_epoch) << time_shift; return res_column; } }; - class FunctionSnowflakeToDateTime : public IFunction { private: @@ -106,23 +103,23 @@ public: ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override { const auto & src = arguments[0]; - const auto & col = *src.column; + const auto & src_column = *src.column; auto res_column = ColumnUInt32::create(input_rows_count); - auto & result_data = res_column->getData(); + auto & res_data = res_column->getData(); - if (const auto * src_non_const = typeid_cast(&col)) + if (const auto * src_column_non_const = typeid_cast(&src_column)) { - const auto & source_data = src_non_const->getData(); + const auto & src_data = src_column_non_const->getData(); for (size_t i = 0; i < input_rows_count; ++i) - result_data[i] = static_cast( - ((source_data[i] >> time_shift) + snowflake_epoch) / 1000); + res_data[i] = static_cast( + ((src_data[i] >> time_shift) + snowflake_epoch) / 1000); } - else if (const auto * src_const = typeid_cast(&col)) + else if (const auto * src_column_const = typeid_cast(&src_column)) { - Int64 src_val = src_const->getValue(); + Int64 src_val = src_column_const->getValue(); for (size_t i = 0; i < input_rows_count; ++i) - result_data[i] = static_cast( + res_data[i] = static_cast( ((src_val >> time_shift) + snowflake_epoch) / 1000); } return res_column; @@ -155,16 +152,14 @@ public: ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override { const auto & src = arguments[0]; - const auto & col = *src.column; + const auto & src_column = *src.column; auto res_column = ColumnInt64::create(input_rows_count); - auto & result_data = res_column->getData(); + auto & res_data = res_column->getData(); - const auto & source_data = typeid_cast &>(col).getData(); + const auto & src_data = typeid_cast &>(src_column).getData(); for (size_t i = 0; i < input_rows_count; ++i) - { - result_data[i] = (source_data[i] - snowflake_epoch) << time_shift; - } + res_data[i] = (src_data[i] - snowflake_epoch) << time_shift; return res_column; } @@ -207,17 +202,24 @@ public: ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override { const auto & src = arguments[0]; - const auto & col = *src.column; + const auto & src_column = *src.column; auto res_column = ColumnDecimal::create(input_rows_count, 3); - auto & result_data = res_column->getData(); + auto & res_data = res_column->getData(); - const auto & source_data = typeid_cast(col).getData(); - - for (size_t i = 0; i < input_rows_count; ++i) + if (const auto * src_column_non_const = typeid_cast(&src_column)) { - result_data[i] = (source_data[i] >> time_shift) + snowflake_epoch; + const auto & src_data = src_column_non_const->getData(); + for (size_t i = 0; i < input_rows_count; ++i) + res_data[i] = (src_data[i] >> time_shift) + snowflake_epoch; } + else if (const auto * src_column_const = typeid_cast(&src_column)) + { + Int64 src_val = src_column_const->getValue(); + for (size_t i = 0; i < input_rows_count; ++i) + res_data[i] = (src_val >> time_shift) + snowflake_epoch; + } + return res_column; } }; diff --git a/tests/queries/0_stateless/01942_snowflakeToDateTime.reference b/tests/queries/0_stateless/01942_snowflakeToDateTime.reference index fa00a22bc63..e1d141fe450 100644 --- a/tests/queries/0_stateless/01942_snowflakeToDateTime.reference +++ b/tests/queries/0_stateless/01942_snowflakeToDateTime.reference @@ -1,4 +1,5 @@ const column UTC 1426860704886947840 2021-08-15 10:57:56 DateTime(\'UTC\') 2021-08-15 10:57:56.492 DateTime64(3, \'UTC\') Asia/Shanghai 1426860704886947840 2021-08-15 18:57:56 DateTime(\'Asia/Shanghai\') 2021-08-15 18:57:56.492 DateTime64(3, \'Asia/Shanghai\') -Asia/Singapore 2010-11-04 01:42:54 +Asia/Singapore 42 +Asia/Singapore 42 diff --git a/tests/queries/0_stateless/01942_snowflakeToDateTime.sql b/tests/queries/0_stateless/01942_snowflakeToDateTime.sql index 3efccdddb2d..2ad03f2a4f5 100644 --- a/tests/queries/0_stateless/01942_snowflakeToDateTime.sql +++ b/tests/queries/0_stateless/01942_snowflakeToDateTime.sql @@ -31,4 +31,12 @@ SELECT snowflakeToDateTime64(i64, tz) as dt64, toTypeName(dt64); -SELECT materialize('Asia/Singapore') a, snowflakeToDateTime(649::Int64, a) settings allow_nonconst_timezone_arguments = 1 + +DROP TABLE IF EXISTS tab; +CREATE TABLE tab(tz String, val Int64) engine=Log; +INSERT INTO tab VALUES ('Asia/Singapore', 42); + +SELECT * FROM tab WHERE snowflakeToDateTime(42::Int64, tz) != now() SETTINGS allow_nonconst_timezone_arguments = 1; +SELECT * FROM tab WHERE snowflakeToDateTime64(42::Int64, tz) != now() SETTINGS allow_nonconst_timezone_arguments = 1; + +DROP TABLE tab; From 2d0dc2c8f5c329a4da12ccb1db601d5edf2044cd Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Tue, 13 Jun 2023 08:59:34 +0000 Subject: [PATCH 0807/1072] Minor: Switch column order --- .../0_stateless/01942_snowflakeToDateTime.reference | 4 ++-- tests/queries/0_stateless/01942_snowflakeToDateTime.sql | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/queries/0_stateless/01942_snowflakeToDateTime.reference b/tests/queries/0_stateless/01942_snowflakeToDateTime.reference index e1d141fe450..83fae3ef809 100644 --- a/tests/queries/0_stateless/01942_snowflakeToDateTime.reference +++ b/tests/queries/0_stateless/01942_snowflakeToDateTime.reference @@ -1,5 +1,5 @@ const column UTC 1426860704886947840 2021-08-15 10:57:56 DateTime(\'UTC\') 2021-08-15 10:57:56.492 DateTime64(3, \'UTC\') Asia/Shanghai 1426860704886947840 2021-08-15 18:57:56 DateTime(\'Asia/Shanghai\') 2021-08-15 18:57:56.492 DateTime64(3, \'Asia/Shanghai\') -Asia/Singapore 42 -Asia/Singapore 42 +1 +1 diff --git a/tests/queries/0_stateless/01942_snowflakeToDateTime.sql b/tests/queries/0_stateless/01942_snowflakeToDateTime.sql index 2ad03f2a4f5..0092eca848c 100644 --- a/tests/queries/0_stateless/01942_snowflakeToDateTime.sql +++ b/tests/queries/0_stateless/01942_snowflakeToDateTime.sql @@ -33,10 +33,10 @@ SELECT DROP TABLE IF EXISTS tab; -CREATE TABLE tab(tz String, val Int64) engine=Log; -INSERT INTO tab VALUES ('Asia/Singapore', 42); +CREATE TABLE tab(val Int64, tz String) engine=Log; +INSERT INTO tab VALUES (42, 'Asia/Singapore'); -SELECT * FROM tab WHERE snowflakeToDateTime(42::Int64, tz) != now() SETTINGS allow_nonconst_timezone_arguments = 1; -SELECT * FROM tab WHERE snowflakeToDateTime64(42::Int64, tz) != now() SETTINGS allow_nonconst_timezone_arguments = 1; +SELECT 1 FROM tab WHERE snowflakeToDateTime(42::Int64, tz) != now() SETTINGS allow_nonconst_timezone_arguments = 1; +SELECT 1 FROM tab WHERE snowflakeToDateTime64(42::Int64, tz) != now() SETTINGS allow_nonconst_timezone_arguments = 1; DROP TABLE tab; From 72f28321295187ddf40d02a887be3106f6ec4ac3 Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Tue, 13 Jun 2023 02:07:05 -0700 Subject: [PATCH 0808/1072] Slightly more information in error message about cached disk (#50897) --- src/Disks/ObjectStorages/Cached/registerDiskCache.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/Disks/ObjectStorages/Cached/registerDiskCache.cpp b/src/Disks/ObjectStorages/Cached/registerDiskCache.cpp index 779ec6120f8..2b40fa9c21b 100644 --- a/src/Disks/ObjectStorages/Cached/registerDiskCache.cpp +++ b/src/Disks/ObjectStorages/Cached/registerDiskCache.cpp @@ -48,7 +48,9 @@ void registerDiskCache(DiskFactory & factory, bool /* global_skip_access_check * auto cache = FileCacheFactory::instance().getOrCreate(name, file_cache_settings); auto disk = disk_it->second; if (!dynamic_cast(disk.get())) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cached disk is allowed only on top of object storage"); + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Cannot wrap disk `{}` with cache layer `{}`: cached disk is allowed only on top of object storage", + disk_name, name); auto disk_object_storage = disk->createDiskObjectStorage(); From 0ab3dc92618f8a7d1accd8f2e1cc21f851dead80 Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Tue, 13 Jun 2023 11:25:13 +0200 Subject: [PATCH 0809/1072] A bit safer UserDefinedSQLFunctionVisitor (#50913) * Update UserDefinedSQLFunctionVisitor.cpp * Update UserDefinedSQLFunctionVisitor.cpp --------- Co-authored-by: Nikita Mikhaylov --- src/Functions/UserDefined/UserDefinedSQLFunctionVisitor.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/Functions/UserDefined/UserDefinedSQLFunctionVisitor.cpp b/src/Functions/UserDefined/UserDefinedSQLFunctionVisitor.cpp index 57cc45cc75d..597e4efe35e 100644 --- a/src/Functions/UserDefined/UserDefinedSQLFunctionVisitor.cpp +++ b/src/Functions/UserDefined/UserDefinedSQLFunctionVisitor.cpp @@ -25,6 +25,12 @@ namespace ErrorCodes void UserDefinedSQLFunctionVisitor::visit(ASTPtr & ast) { + if (!ast) + { + chassert(false); + return; + } + const auto visit_child_with_shared_ptr = [&](ASTPtr & child) { if (!child) From e5de6cde244f530c0f7d3ec1acad462025430e58 Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Tue, 13 Jun 2023 11:37:24 +0000 Subject: [PATCH 0810/1072] Update after #50097 --- tests/queries/0_stateless/01655_plan_optimizations.reference | 4 ++-- tests/queries/0_stateless/01655_plan_optimizations.sh | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/queries/0_stateless/01655_plan_optimizations.reference b/tests/queries/0_stateless/01655_plan_optimizations.reference index 34ea2bc20a3..be42a656c66 100644 --- a/tests/queries/0_stateless/01655_plan_optimizations.reference +++ b/tests/queries/0_stateless/01655_plan_optimizations.reference @@ -172,7 +172,7 @@ Filter column: notEquals(number, 1) Join > (analyzer) one condition of filter is pushed down before LEFT JOIN Join -Filter column: notEquals(l.number_0, 1_UInt8) +Filter column: notEquals(number_0, 1_UInt8) 0 0 3 3 > one condition of filter is pushed down before INNER JOIN @@ -181,7 +181,7 @@ Filter column: notEquals(number, 1) Join > (analyzer) one condition of filter is pushed down before INNER JOIN Join -Filter column: notEquals(l.number_0, 1_UInt8) +Filter column: notEquals(number_0, 1_UInt8) 3 3 > filter is pushed down before UNION Union diff --git a/tests/queries/0_stateless/01655_plan_optimizations.sh b/tests/queries/0_stateless/01655_plan_optimizations.sh index d68c2c8b414..a765a6ea4fa 100755 --- a/tests/queries/0_stateless/01655_plan_optimizations.sh +++ b/tests/queries/0_stateless/01655_plan_optimizations.sh @@ -236,7 +236,7 @@ $CLICKHOUSE_CLIENT --allow_experimental_analyzer=1 -q " select number as a, r.b from numbers(4) as l any left join ( select number + 2 as b from numbers(3) ) as r on a = r.b where a != 1 and b != 2 settings enable_optimize_predicate_expression = 0" | - grep -o "Join\|Filter column: notEquals(l.number_0, 1_UInt8)" + grep -o "Join\|Filter column: notEquals(number_0, 1_UInt8)" $CLICKHOUSE_CLIENT -q " select number as a, r.b from numbers(4) as l any left join ( select number + 2 as b from numbers(3) @@ -255,7 +255,7 @@ $CLICKHOUSE_CLIENT --allow_experimental_analyzer=1 -q " select number as a, r.b from numbers(4) as l any inner join ( select number + 2 as b from numbers(3) ) as r on a = r.b where a != 1 and b != 2 settings enable_optimize_predicate_expression = 0" | - grep -o "Join\|Filter column: notEquals(l.number_0, 1_UInt8)" + grep -o "Join\|Filter column: notEquals(number_0, 1_UInt8)" $CLICKHOUSE_CLIENT -q " select number as a, r.b from numbers(4) as l any inner join ( select number + 2 as b from numbers(3) From a8b68a877aae945dee4e37b28e320a967a38f9f2 Mon Sep 17 00:00:00 2001 From: Victor Krasnov Date: Tue, 13 Jun 2023 14:37:33 +0300 Subject: [PATCH 0811/1072] Rename the 'time shift' variable in the test to make it more clear --- .../02783_parseDateTimeBestEffort_syslog.sql | 76 +++++++++---------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/tests/queries/0_stateless/02783_parseDateTimeBestEffort_syslog.sql b/tests/queries/0_stateless/02783_parseDateTimeBestEffort_syslog.sql index 742ae03ddab..f3ca78e8310 100644 --- a/tests/queries/0_stateless/02783_parseDateTimeBestEffort_syslog.sql +++ b/tests/queries/0_stateless/02783_parseDateTimeBestEffort_syslog.sql @@ -2,45 +2,45 @@ WITH 86400 AS secs_in_day, now() AS ts_now, '2023-06-07' AS ref_point, - dateDiff('second', toDateTime(ref_point), ts_now) AS impedimenta, + dateDiff('second', toDateTime(ref_point), ts_now) AS time_shift, formatDateTime(ts_around, '%b %e %T') AS dt_curr SELECT - formatDateTime(ts_around - impedimenta, '%b %e %H:%i:%s') AS around_June_7, - parseDateTimeBestEffort(dt_curr) - impedimenta AS res, - parseDateTimeBestEffort(dt_curr, 'US/Samoa') - impedimenta AS res_sam, - parseDateTimeBestEffort(dt_curr, 'Pacific/Auckland') - impedimenta AS res_auc, - parseDateTimeBestEffortOrNull(dt_curr) - impedimenta AS res_null, - parseDateTimeBestEffortOrNull(dt_curr, 'US/Samoa') - impedimenta AS res_null_sam, - parseDateTimeBestEffortOrNull(dt_curr, 'Pacific/Auckland') - impedimenta AS res_null_auc, - parseDateTimeBestEffortOrZero(dt_curr) - impedimenta AS res_zero, - parseDateTimeBestEffortOrZero(dt_curr, 'US/Samoa') - impedimenta AS res_zero_sam, - parseDateTimeBestEffortOrZero(dt_curr, 'Pacific/Auckland') - impedimenta AS res_zero_auc, - parseDateTimeBestEffortUS(dt_curr) - impedimenta AS res_us, - parseDateTimeBestEffortUS(dt_curr, 'US/Samoa') - impedimenta AS res_us_sam, - parseDateTimeBestEffortUS(dt_curr, 'Pacific/Auckland') - impedimenta AS res_us_auc, - parseDateTimeBestEffortUSOrNull(dt_curr) - impedimenta AS res_us_null, - parseDateTimeBestEffortUSOrNull(dt_curr, 'US/Samoa') - impedimenta AS res_us_null_sam, - parseDateTimeBestEffortUSOrNull(dt_curr, 'Pacific/Auckland') - impedimenta AS res_us_null_auc, - parseDateTimeBestEffortUSOrZero(dt_curr) - impedimenta AS res_us_zero, - parseDateTimeBestEffortUSOrZero(dt_curr, 'US/Samoa') - impedimenta AS res_us_zero_sam, - parseDateTimeBestEffortUSOrZero(dt_curr, 'Pacific/Auckland') - impedimenta AS res_us_zero_auc, - parseDateTime64BestEffort(dt_curr) - impedimenta AS res64, - parseDateTime64BestEffort(dt_curr, 3, 'US/Samoa') - impedimenta AS res64_sam, - parseDateTime64BestEffort(dt_curr, 3, 'Pacific/Auckland') - impedimenta AS res64_auc, - parseDateTime64BestEffortOrNull(dt_curr) - impedimenta AS res64_null, - parseDateTime64BestEffortOrNull(dt_curr, 3, 'US/Samoa') - impedimenta AS res64_null_sam, - parseDateTime64BestEffortOrNull(dt_curr, 3, 'Pacific/Auckland') - impedimenta AS res64_null_auc, - parseDateTime64BestEffortOrZero(dt_curr) - impedimenta AS res64_zero, - parseDateTime64BestEffortOrZero(dt_curr, 3, 'US/Samoa') - impedimenta AS res64_zero_sam, - parseDateTime64BestEffortOrZero(dt_curr, 3, 'Pacific/Auckland') - impedimenta AS res64_zero_auc, - parseDateTime64BestEffortUS(dt_curr) - impedimenta AS res64_us, - parseDateTime64BestEffortUS(dt_curr, 3, 'US/Samoa') - impedimenta AS res64_us_sam, - parseDateTime64BestEffortUS(dt_curr, 3, 'Pacific/Auckland') - impedimenta AS res64_us_auc, - parseDateTime64BestEffortUSOrNull(dt_curr) - impedimenta AS res64_us_null, - parseDateTime64BestEffortUSOrNull(dt_curr, 3, 'US/Samoa') - impedimenta AS res64_us_null_sam, - parseDateTime64BestEffortUSOrNull(dt_curr, 3, 'Pacific/Auckland') - impedimenta AS res64_us_null_auc, - parseDateTime64BestEffortUSOrZero(dt_curr) - impedimenta AS res64_us_zero, - parseDateTime64BestEffortUSOrZero(dt_curr, 3, 'US/Samoa') - impedimenta AS res64_us_zero_sam, - parseDateTime64BestEffortUSOrZero(dt_curr, 3, 'Pacific/Auckland') - impedimenta AS res64_us_zero_auc + formatDateTime(ts_around - time_shift, '%b %e %H:%i:%s') AS around_June_7, + parseDateTimeBestEffort(dt_curr) - time_shift AS res, + parseDateTimeBestEffort(dt_curr, 'US/Samoa') - time_shift AS res_sam, + parseDateTimeBestEffort(dt_curr, 'Pacific/Auckland') - time_shift AS res_auc, + parseDateTimeBestEffortOrNull(dt_curr) - time_shift AS res_null, + parseDateTimeBestEffortOrNull(dt_curr, 'US/Samoa') - time_shift AS res_null_sam, + parseDateTimeBestEffortOrNull(dt_curr, 'Pacific/Auckland') - time_shift AS res_null_auc, + parseDateTimeBestEffortOrZero(dt_curr) - time_shift AS res_zero, + parseDateTimeBestEffortOrZero(dt_curr, 'US/Samoa') - time_shift AS res_zero_sam, + parseDateTimeBestEffortOrZero(dt_curr, 'Pacific/Auckland') - time_shift AS res_zero_auc, + parseDateTimeBestEffortUS(dt_curr) - time_shift AS res_us, + parseDateTimeBestEffortUS(dt_curr, 'US/Samoa') - time_shift AS res_us_sam, + parseDateTimeBestEffortUS(dt_curr, 'Pacific/Auckland') - time_shift AS res_us_auc, + parseDateTimeBestEffortUSOrNull(dt_curr) - time_shift AS res_us_null, + parseDateTimeBestEffortUSOrNull(dt_curr, 'US/Samoa') - time_shift AS res_us_null_sam, + parseDateTimeBestEffortUSOrNull(dt_curr, 'Pacific/Auckland') - time_shift AS res_us_null_auc, + parseDateTimeBestEffortUSOrZero(dt_curr) - time_shift AS res_us_zero, + parseDateTimeBestEffortUSOrZero(dt_curr, 'US/Samoa') - time_shift AS res_us_zero_sam, + parseDateTimeBestEffortUSOrZero(dt_curr, 'Pacific/Auckland') - time_shift AS res_us_zero_auc, + parseDateTime64BestEffort(dt_curr) - time_shift AS res64, + parseDateTime64BestEffort(dt_curr, 3, 'US/Samoa') - time_shift AS res64_sam, + parseDateTime64BestEffort(dt_curr, 3, 'Pacific/Auckland') - time_shift AS res64_auc, + parseDateTime64BestEffortOrNull(dt_curr) - time_shift AS res64_null, + parseDateTime64BestEffortOrNull(dt_curr, 3, 'US/Samoa') - time_shift AS res64_null_sam, + parseDateTime64BestEffortOrNull(dt_curr, 3, 'Pacific/Auckland') - time_shift AS res64_null_auc, + parseDateTime64BestEffortOrZero(dt_curr) - time_shift AS res64_zero, + parseDateTime64BestEffortOrZero(dt_curr, 3, 'US/Samoa') - time_shift AS res64_zero_sam, + parseDateTime64BestEffortOrZero(dt_curr, 3, 'Pacific/Auckland') - time_shift AS res64_zero_auc, + parseDateTime64BestEffortUS(dt_curr) - time_shift AS res64_us, + parseDateTime64BestEffortUS(dt_curr, 3, 'US/Samoa') - time_shift AS res64_us_sam, + parseDateTime64BestEffortUS(dt_curr, 3, 'Pacific/Auckland') - time_shift AS res64_us_auc, + parseDateTime64BestEffortUSOrNull(dt_curr) - time_shift AS res64_us_null, + parseDateTime64BestEffortUSOrNull(dt_curr, 3, 'US/Samoa') - time_shift AS res64_us_null_sam, + parseDateTime64BestEffortUSOrNull(dt_curr, 3, 'Pacific/Auckland') - time_shift AS res64_us_null_auc, + parseDateTime64BestEffortUSOrZero(dt_curr) - time_shift AS res64_us_zero, + parseDateTime64BestEffortUSOrZero(dt_curr, 3, 'US/Samoa') - time_shift AS res64_us_zero_sam, + parseDateTime64BestEffortUSOrZero(dt_curr, 3, 'Pacific/Auckland') - time_shift AS res64_us_zero_auc FROM (SELECT arrayJoin([ts_now - secs_in_day, ts_now + secs_in_day]) AS ts_around) FORMAT PrettySpaceNoEscapes; From 38151f9c767d12b18e26ccc236244bc929c326bb Mon Sep 17 00:00:00 2001 From: Victor Krasnov Date: Tue, 13 Jun 2023 14:59:38 +0300 Subject: [PATCH 0812/1072] Rename the test to the snake_case after the Team Lead's review --- ...g.reference => 02783_parsedatetimebesteffort_syslog.reference} | 0 ...Effort_syslog.sql => 02783_parsedatetimebesteffort_syslog.sql} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename tests/queries/0_stateless/{02783_parseDateTimeBestEffort_syslog.reference => 02783_parsedatetimebesteffort_syslog.reference} (100%) rename tests/queries/0_stateless/{02783_parseDateTimeBestEffort_syslog.sql => 02783_parsedatetimebesteffort_syslog.sql} (100%) diff --git a/tests/queries/0_stateless/02783_parseDateTimeBestEffort_syslog.reference b/tests/queries/0_stateless/02783_parsedatetimebesteffort_syslog.reference similarity index 100% rename from tests/queries/0_stateless/02783_parseDateTimeBestEffort_syslog.reference rename to tests/queries/0_stateless/02783_parsedatetimebesteffort_syslog.reference diff --git a/tests/queries/0_stateless/02783_parseDateTimeBestEffort_syslog.sql b/tests/queries/0_stateless/02783_parsedatetimebesteffort_syslog.sql similarity index 100% rename from tests/queries/0_stateless/02783_parseDateTimeBestEffort_syslog.sql rename to tests/queries/0_stateless/02783_parsedatetimebesteffort_syslog.sql From 3e3b8ff5f6b78c6ddd202d154ea9101625c561f1 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Tue, 13 Jun 2023 09:14:15 +0000 Subject: [PATCH 0813/1072] More robustness --- src/Functions/FunctionSnowflake.h | 40 ++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/src/Functions/FunctionSnowflake.h b/src/Functions/FunctionSnowflake.h index ace2fc54f09..c7ec6dca27f 100644 --- a/src/Functions/FunctionSnowflake.h +++ b/src/Functions/FunctionSnowflake.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -59,9 +60,20 @@ public: auto res_column = ColumnInt64::create(input_rows_count); auto & res_data = res_column->getData(); - const auto & src_data = typeid_cast &>(src_column).getData(); - for (size_t i = 0; i < input_rows_count; ++i) - res_data[i] = (UInt32(src_data[i]) * 1000 - snowflake_epoch) << time_shift; + if (const auto * src_column_non_const = typeid_cast(&src_column)) + { + const auto & src_data = src_column_non_const->getData(); + for (size_t i = 0; i < input_rows_count; ++i) + res_data[i] = (UInt32(src_data[i]) * 1000 - snowflake_epoch) << time_shift; + } + else if (const auto * src_column_const = typeid_cast(&src_column)) + { + UInt32 src_val = src_column_const->getValue(); + for (size_t i = 0; i < input_rows_count; ++i) + res_data[i] = (src_val * 1000 - snowflake_epoch) << time_shift; + } + else + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal argument for function {}", name); return res_column; } @@ -122,6 +134,9 @@ public: res_data[i] = static_cast( ((src_val >> time_shift) + snowflake_epoch) / 1000); } + else + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal argument for function {}", name); + return res_column; } }; @@ -157,9 +172,20 @@ public: auto res_column = ColumnInt64::create(input_rows_count); auto & res_data = res_column->getData(); - const auto & src_data = typeid_cast &>(src_column).getData(); - for (size_t i = 0; i < input_rows_count; ++i) - res_data[i] = (src_data[i] - snowflake_epoch) << time_shift; + if (const auto * src_column_non_const = typeid_cast(&src_column)) + { + const auto & src_data = src_column_non_const->getData(); + for (size_t i = 0; i < input_rows_count; ++i) + res_data[i] = (UInt32(src_data[i]) * 1000 - snowflake_epoch) << time_shift; + } + else if (const auto * src_column_const = typeid_cast(&src_column)) + { + UInt32 src_val = src_column_const->getValue(); + for (size_t i = 0; i < input_rows_count; ++i) + res_data[i] = (src_val * 1000 - snowflake_epoch) << time_shift; + } + else + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal argument for function {}", name); return res_column; } @@ -219,6 +245,8 @@ public: for (size_t i = 0; i < input_rows_count; ++i) res_data[i] = (src_val >> time_shift) + snowflake_epoch; } + else + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal argument for function {}", name); return res_column; } From eddd932636fdb16802ec0b541a7cb927abcc05ff Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Tue, 13 Jun 2023 12:34:26 +0000 Subject: [PATCH 0814/1072] Do not apply projection if read-in-order was enabled. --- .../Optimizations/projectionsCommon.cpp | 3 ++ .../QueryPlan/ReadFromMergeTree.cpp | 5 ++ src/Processors/QueryPlan/ReadFromMergeTree.h | 1 + ...84_projections_read_in_order_bug.reference | 0 .../02784_projections_read_in_order_bug.sql | 48 +++++++++++++++++++ 5 files changed, 57 insertions(+) create mode 100644 tests/queries/0_stateless/02784_projections_read_in_order_bug.reference create mode 100644 tests/queries/0_stateless/02784_projections_read_in_order_bug.sql diff --git a/src/Processors/QueryPlan/Optimizations/projectionsCommon.cpp b/src/Processors/QueryPlan/Optimizations/projectionsCommon.cpp index 2f73e14b2a0..cb76ffa84ba 100644 --- a/src/Processors/QueryPlan/Optimizations/projectionsCommon.cpp +++ b/src/Processors/QueryPlan/Optimizations/projectionsCommon.cpp @@ -38,6 +38,9 @@ bool canUseProjectionForReadingStep(ReadFromMergeTree * reading) if (reading->isParallelReadingEnabled()) return false; + if (reading->readsInOrder()) + return false; + // Currently projection don't support deduplication when moving parts between shards. if (reading->getContext()->getSettingsRef().allow_experimental_query_deduplication) return false; diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.cpp b/src/Processors/QueryPlan/ReadFromMergeTree.cpp index 2415507a6eb..3c38ecbbd3f 100644 --- a/src/Processors/QueryPlan/ReadFromMergeTree.cpp +++ b/src/Processors/QueryPlan/ReadFromMergeTree.cpp @@ -1427,6 +1427,11 @@ bool ReadFromMergeTree::requestReadingInOrder(size_t prefix_size, int direction, return true; } +bool ReadFromMergeTree::readsInOrder() const +{ + return reader_settings.read_in_order; +} + void ReadFromMergeTree::updatePrewhereInfo(const PrewhereInfoPtr & prewhere_info_value) { query_info.prewhere_info = prewhere_info_value; diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.h b/src/Processors/QueryPlan/ReadFromMergeTree.h index 45beaaaf013..99cbe9d9e50 100644 --- a/src/Processors/QueryPlan/ReadFromMergeTree.h +++ b/src/Processors/QueryPlan/ReadFromMergeTree.h @@ -161,6 +161,7 @@ public: /// Returns `false` if requested reading cannot be performed. bool requestReadingInOrder(size_t prefix_size, int direction, size_t limit); + bool readsInOrder() const; void updatePrewhereInfo(const PrewhereInfoPtr & prewhere_info_value); diff --git a/tests/queries/0_stateless/02784_projections_read_in_order_bug.reference b/tests/queries/0_stateless/02784_projections_read_in_order_bug.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02784_projections_read_in_order_bug.sql b/tests/queries/0_stateless/02784_projections_read_in_order_bug.sql new file mode 100644 index 00000000000..52a3a6127ac --- /dev/null +++ b/tests/queries/0_stateless/02784_projections_read_in_order_bug.sql @@ -0,0 +1,48 @@ +create table events ( + `organisation_id` UUID, + `session_id` UUID, + `id` UUID DEFAULT generateUUIDv4(), + `timestamp` UInt64, + `payload` String, + `customer_id` UUID, + `call_id` String, + PROJECTION events_by_session_and_org + ( + SELECT * + ORDER BY + organisation_id, + session_id, + timestamp + ), + PROJECTION events_by_session + ( + SELECT * + ORDER BY + session_id, + timestamp + ), + PROJECTION events_by_session_and_customer + ( + SELECT * + ORDER BY + customer_id, + session_id, + timestamp + ), + PROJECTION events_by_call_id + ( + SELECT * + ORDER BY + call_id, + timestamp + )) engine = MergeTree order by (organisation_id, session_id, timestamp) settings index_granularity = 3; + + +#insert into events values (reinterpretAsUUID(0), reinterpretAsUUID(1), reinterpretAsUUID(0), toDateTime('2022-02-02', 'UTC'), toString(0), reinterpretAsUUID(0), toString(0)), (reinterpretAsUUID(0), reinterpretAsUUID(1), reinterpretAsUUID(0), now(), toString(0), reinterpretAsUUID(0), toString(0)), (reinterpretAsUUID(1), reinterpretAsUUID(0), reinterpretAsUUID(0), now(), toString(0), reinterpretAsUUID(0), toString(0)), (reinterpretAsUUID(1), reinterpretAsUUID(0), reinterpretAsUUID(0), now(), toString(0), reinterpretAsUUID(0), toString(0)), (reinterpretAsUUID(3), reinterpretAsUUID(2), reinterpretAsUUID(0), now(), toString(0), reinterpretAsUUID(0), toString(0)), (reinterpretAsUUID(3), reinterpretAsUUID(2), reinterpretAsUUID(0), toDateTime('2022-02-02'), toString(0), reinterpretAsUUID(0), toString(0)); +#insert into events values (reinterpretAsUUID(0), reinterpretAsUUID(1), reinterpretAsUUID(0), toDateTime('2022-02-02', 'UTC'), toString(0), reinterpretAsUUID(0), toString(0)), (reinterpretAsUUID(0), reinterpretAsUUID(1), reinterpretAsUUID(0), now(), toString(0), reinterpretAsUUID(0), toString(0)), (reinterpretAsUUID(1), reinterpretAsUUID(0), reinterpretAsUUID(0), now(), toString(0), reinterpretAsUUID(0), toString(0)), (reinterpretAsUUID(1), reinterpretAsUUID(0), reinterpretAsUUID(0), now(), toString(0), reinterpretAsUUID(0), toString(0)), (reinterpretAsUUID(3), reinterpretAsUUID(2), reinterpretAsUUID(0), now(), toString(0), reinterpretAsUUID(0), toString(0)), (reinterpretAsUUID(3), reinterpretAsUUID(2), reinterpretAsUUID(0), toDateTime('2022-02-02'), toString(0), reinterpretAsUUID(0), toString(0)); + +insert into events values (reinterpretAsUUID(0), reinterpretAsUUID(1), reinterpretAsUUID(0), toDateTime('2022-02-02', 'UTC')(), toString(0), reinterpretAsUUID(0), toString(0)), (reinterpretAsUUID(0), reinterpretAsUUID(1), reinterpretAsUUID(0), toDateTime('2022-02-02', 'UTC')(), toString(0), reinterpretAsUUID(0), toString(0)), (reinterpretAsUUID(1), reinterpretAsUUID(0), reinterpretAsUUID(0), toDateTime('2022-02-02', 'UTC')(), toString(0), reinterpretAsUUID(0), toString(0)), (reinterpretAsUUID(1), reinterpretAsUUID(0), reinterpretAsUUID(0), toDateTime('2022-02-02', 'UTC')(), toString(0), reinterpretAsUUID(0), toString(0)), (reinterpretAsUUID(3), reinterpretAsUUID(2), reinterpretAsUUID(0), toDateTime('2022-02-02', 'UTC')(), toString(0), reinterpretAsUUID(0), toString(0)), (reinterpretAsUUID(3), reinterpretAsUUID(2), reinterpretAsUUID(0), toDateTime('2022-02-02', 'UTC')(), toString(0), reinterpretAsUUID(0), toString(0)); +insert into events values (reinterpretAsUUID(0), reinterpretAsUUID(1), reinterpretAsUUID(0), toDateTime('2022-02-02', 'UTC')(), toString(0), reinterpretAsUUID(0), toString(0)), (reinterpretAsUUID(0), reinterpretAsUUID(1), reinterpretAsUUID(0), toDateTime('2022-02-02', 'UTC')(), toString(0), reinterpretAsUUID(0), toString(0)), (reinterpretAsUUID(1), reinterpretAsUUID(0), reinterpretAsUUID(0), toDateTime('2022-02-02', 'UTC')(), toString(0), reinterpretAsUUID(0), toString(0)), (reinterpretAsUUID(1), reinterpretAsUUID(0), reinterpretAsUUID(0), toDateTime('2022-02-02', 'UTC')(), toString(0), reinterpretAsUUID(0), toString(0)), (reinterpretAsUUID(3), reinterpretAsUUID(2), reinterpretAsUUID(0), toDateTime('2022-02-02', 'UTC')(), toString(0), reinterpretAsUUID(0), toString(0)), (reinterpretAsUUID(3), reinterpretAsUUID(2), reinterpretAsUUID(0), toDateTime('2022-02-02', 'UTC')(), toString(0), reinterpretAsUUID(0), toString(0)); + +set read_in_order_two_level_merge_threshold=1; +SELECT id, timestamp, payload FROM events WHERE (organisation_id = reinterpretAsUUID(1)) AND (session_id = reinterpretAsUUID(0)) ORDER BY timestamp, payload, id ASC; From 1a4b7e8ebec920943a39c484576f147c130b00ec Mon Sep 17 00:00:00 2001 From: Val Doroshchuk Date: Tue, 13 Jun 2023 14:36:31 +0200 Subject: [PATCH 0815/1072] MaterializedMySQL: Add missing DROP DATABASE for tests --- .../materialize_with_ddl.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/integration/test_materialized_mysql_database/materialize_with_ddl.py b/tests/integration/test_materialized_mysql_database/materialize_with_ddl.py index 2bbbe9a3f13..7efb9ac54a9 100644 --- a/tests/integration/test_materialized_mysql_database/materialize_with_ddl.py +++ b/tests/integration/test_materialized_mysql_database/materialize_with_ddl.py @@ -1476,6 +1476,9 @@ def utf8mb4_test(clickhouse_node, mysql_node, service_name): "1\t\U0001F984\n2\t\u2601\n", ) + clickhouse_node.query("DROP DATABASE utf8mb4_test") + mysql_node.query("DROP DATABASE utf8mb4_test") + def system_parts_test(clickhouse_node, mysql_node, service_name): mysql_node.query("DROP DATABASE IF EXISTS system_parts_test") @@ -1504,6 +1507,9 @@ def system_parts_test(clickhouse_node, mysql_node, service_name): clickhouse_node.query("OPTIMIZE TABLE system_parts_test.test") check_active_parts(1) + clickhouse_node.query("DROP DATABASE system_parts_test") + mysql_node.query("DROP DATABASE system_parts_test") + def multi_table_update_test(clickhouse_node, mysql_node, service_name): mysql_node.query("DROP DATABASE IF EXISTS multi_table_update") @@ -1529,6 +1535,8 @@ def multi_table_update_test(clickhouse_node, mysql_node, service_name): check_query(clickhouse_node, "SELECT * FROM multi_table_update.a", "1\tbaz\n") check_query(clickhouse_node, "SELECT * FROM multi_table_update.b", "1\tquux\n") + clickhouse_node.query("DROP DATABASE multi_table_update") + mysql_node.query("DROP DATABASE multi_table_update") def system_tables_test(clickhouse_node, mysql_node, service_name): @@ -1549,6 +1557,9 @@ def system_tables_test(clickhouse_node, mysql_node, service_name): "intDiv(id, 4294967)\tid\tid\n", ) + clickhouse_node.query("DROP DATABASE system_tables_test") + mysql_node.query("DROP DATABASE system_tables_test") + def materialize_with_column_comments_test(clickhouse_node, mysql_node, service_name): mysql_node.query("DROP DATABASE IF EXISTS materialize_with_column_comments_test") From f4ed10c0a28b52f140f542c7ab0b21e1edf9a0c0 Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Tue, 13 Jun 2023 14:44:39 +0200 Subject: [PATCH 0816/1072] Update src/Storages/StorageReplicatedMergeTree.cpp --- src/Storages/StorageReplicatedMergeTree.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index fafb3b124f2..84eae32495d 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -4751,10 +4751,10 @@ void StorageReplicatedMergeTree::read( } else { - header - = InterpreterSelectQuery(modified_query_ast, local_context, SelectQueryOptions(processed_stage).analyze()).getSampleBlock(); modified_query_ast = ClusterProxy::rewriteSelectQuery(local_context, query_info.query, table_id.database_name, table_id.table_name, /*remote_table_function_ptr*/nullptr); + header + = InterpreterSelectQuery(modified_query_ast, local_context, SelectQueryOptions(processed_stage).analyze()).getSampleBlock(); } auto cluster = local_context->getCluster(local_context->getSettingsRef().cluster_for_parallel_replicas); From 46c23b3f8d185bf79cb819c2beff0216ca73c4bd Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Tue, 13 Jun 2023 15:46:54 +0200 Subject: [PATCH 0817/1072] Fixed docs check fails --- docs/en/sql-reference/table-functions/azureBlobStorage.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/table-functions/azureBlobStorage.md b/docs/en/sql-reference/table-functions/azureBlobStorage.md index 7a362710b9c..8587d9839b8 100644 --- a/docs/en/sql-reference/table-functions/azureBlobStorage.md +++ b/docs/en/sql-reference/table-functions/azureBlobStorage.md @@ -1,5 +1,5 @@ --- -slug: /en/sql-reference/table-functions/azure_blob_storage +slug: /en/sql-reference/table-functions/azureBlobStorageg sidebar_label: azureBlobStorage keywords: [azure blob storage] --- From ab020f9311eea0f657f57493e14191e75e51f8af Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Tue, 13 Jun 2023 15:48:42 +0200 Subject: [PATCH 0818/1072] Fixed typo --- docs/en/sql-reference/table-functions/azureBlobStorage.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/table-functions/azureBlobStorage.md b/docs/en/sql-reference/table-functions/azureBlobStorage.md index 8587d9839b8..5175aabd5d1 100644 --- a/docs/en/sql-reference/table-functions/azureBlobStorage.md +++ b/docs/en/sql-reference/table-functions/azureBlobStorage.md @@ -1,5 +1,5 @@ --- -slug: /en/sql-reference/table-functions/azureBlobStorageg +slug: /en/sql-reference/table-functions/azureBlobStorage sidebar_label: azureBlobStorage keywords: [azure blob storage] --- From 3a2fa65075ab32a04a377cef632dd1679dea02b0 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Tue, 13 Jun 2023 16:02:54 +0200 Subject: [PATCH 0819/1072] fix 'Illegal column timezone' in stress tests --- docker/test/upgrade/run.sh | 6 ++---- tests/config/install.sh | 1 + tests/config/users.d/nonconst_timezone.xml | 7 +++++++ 3 files changed, 10 insertions(+), 4 deletions(-) create mode 100644 tests/config/users.d/nonconst_timezone.xml diff --git a/docker/test/upgrade/run.sh b/docker/test/upgrade/run.sh index 6f7d3999f1d..951c443c30d 100644 --- a/docker/test/upgrade/run.sh +++ b/docker/test/upgrade/run.sh @@ -59,8 +59,7 @@ install_packages previous_release_package_folder # available for dump via clickhouse-local configure -# it contains some new settings, but we can safely remove it -rm /etc/clickhouse-server/config.d/merge_tree.xml +rm /etc/clickhouse-server/users.d/nonconst_timezone.xml start stop @@ -86,8 +85,7 @@ export USE_S3_STORAGE_FOR_MERGE_TREE=1 export ZOOKEEPER_FAULT_INJECTION=0 configure -# it contains some new settings, but we can safely remove it -rm /etc/clickhouse-server/config.d/merge_tree.xml +rm /etc/clickhouse-server/users.d/nonconst_timezone.xml start diff --git a/tests/config/install.sh b/tests/config/install.sh index efa5a9c086e..b2153db1b2c 100755 --- a/tests/config/install.sh +++ b/tests/config/install.sh @@ -78,6 +78,7 @@ ln -sf $SRC_PATH/users.d/enable_blobs_check.xml $DEST_SERVER_PATH/users.d/ ln -sf $SRC_PATH/users.d/marks.xml $DEST_SERVER_PATH/users.d/ ln -sf $SRC_PATH/users.d/insert_keeper_retries.xml $DEST_SERVER_PATH/users.d/ ln -sf $SRC_PATH/users.d/prefetch_settings.xml $DEST_SERVER_PATH/users.d/ +ln -sf $SRC_PATH/users.d/nonconst_timezone.xml $DEST_SERVER_PATH/users.d/ if [[ -n "$USE_NEW_ANALYZER" ]] && [[ "$USE_NEW_ANALYZER" -eq 1 ]]; then ln -sf $SRC_PATH/users.d/analyzer.xml $DEST_SERVER_PATH/users.d/ diff --git a/tests/config/users.d/nonconst_timezone.xml b/tests/config/users.d/nonconst_timezone.xml new file mode 100644 index 00000000000..c7e9de5ab69 --- /dev/null +++ b/tests/config/users.d/nonconst_timezone.xml @@ -0,0 +1,7 @@ + + + + 1 + + + From 8ea7560d898879e74887e042aca0a6c60031191b Mon Sep 17 00:00:00 2001 From: Val Doroshchuk Date: Tue, 13 Jun 2023 16:28:53 +0200 Subject: [PATCH 0820/1072] MaterializedMySQL: Add additional test case to insert_with_modify_binlog_checksum (#50884) --- .../materialize_with_ddl.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tests/integration/test_materialized_mysql_database/materialize_with_ddl.py b/tests/integration/test_materialized_mysql_database/materialize_with_ddl.py index 2bbbe9a3f13..f5c28832f79 100644 --- a/tests/integration/test_materialized_mysql_database/materialize_with_ddl.py +++ b/tests/integration/test_materialized_mysql_database/materialize_with_ddl.py @@ -1050,6 +1050,8 @@ def select_without_columns(clickhouse_node, mysql_node, service_name): def insert_with_modify_binlog_checksum(clickhouse_node, mysql_node, service_name): + clickhouse_node.query("DROP DATABASE IF EXISTS test_checksum") + mysql_node.query("DROP DATABASE IF EXISTS test_checksum") mysql_node.query("CREATE DATABASE test_checksum") mysql_node.query("CREATE TABLE test_checksum.t (a INT PRIMARY KEY, b varchar(200))") clickhouse_node.query( @@ -1081,6 +1083,21 @@ def insert_with_modify_binlog_checksum(clickhouse_node, mysql_node, service_name "1\t1111\n2\t2222\n3\t3333\n", ) + clickhouse_node.query("DROP DATABASE test_checksum") + mysql_node.query("SET GLOBAL binlog_checksum=NONE") + clickhouse_node.query( + "CREATE DATABASE test_checksum ENGINE = MaterializeMySQL('{}:3306', 'test_checksum', 'root', 'clickhouse')".format( + service_name + ) + ) + check_query(clickhouse_node, "SHOW TABLES FROM test_checksum FORMAT TSV", "t\n") + mysql_node.query("INSERT INTO test_checksum.t VALUES(4, '4444')") + check_query( + clickhouse_node, + "SELECT * FROM test_checksum.t ORDER BY a FORMAT TSV", + "1\t1111\n2\t2222\n3\t3333\n4\t4444\n", + ) + clickhouse_node.query("DROP DATABASE test_checksum") mysql_node.query("DROP DATABASE test_checksum") From c253c70510008eda1fc3aadb72cf5c8a92e875bb Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Tue, 13 Jun 2023 16:33:36 +0200 Subject: [PATCH 0821/1072] Fix for MDXContent --- docs/en/engines/table-engines/mergetree-family/annindexes.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/en/engines/table-engines/mergetree-family/annindexes.md b/docs/en/engines/table-engines/mergetree-family/annindexes.md index 16e244077a7..6b9c3d6157f 100644 --- a/docs/en/engines/table-engines/mergetree-family/annindexes.md +++ b/docs/en/engines/table-engines/mergetree-family/annindexes.md @@ -202,10 +202,10 @@ CHECK length(vectors) = 256`. Setting `annoy_index_search_k_nodes` (default: `NumTrees * LIMIT`) determines how many tree nodes are inspected during SELECTs. Larger values mean more accurate results at the cost of longer query runtime: -``` sql +```sql SELECT * FROM table_name ORDER BY L2Distance(vectors, Point) LIMIT N -SETTINGS annoy_index_search_k_nodes=100 +SETTINGS annoy_index_search_k_nodes=100; ``` From 263be33297a2ada5e5c5281924b56e5ffaa3f80f Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Tue, 13 Jun 2023 16:37:52 +0200 Subject: [PATCH 0822/1072] Fix tests for throttling by allowing more margin of error for trottling event Right now 02703_max_local_write_bandwidth is flaky, and the reason I believe is that the server spent spent sometime somewhere else, which means that the throttler will sleep less. But what is important here is that the overall query duration time matches the expectation, so it is OK to match the LocalWriteThrottlerSleepMicroseconds/LocalReadThrottlerSleepMicroseconds with some error rate. Signed-off-by: Azat Khuzhin --- tests/queries/0_stateless/02703_max_local_read_bandwidth.sh | 2 +- tests/queries/0_stateless/02703_max_local_write_bandwidth.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/02703_max_local_read_bandwidth.sh b/tests/queries/0_stateless/02703_max_local_read_bandwidth.sh index d47e2f363bd..c78cd202f1b 100755 --- a/tests/queries/0_stateless/02703_max_local_read_bandwidth.sh +++ b/tests/queries/0_stateless/02703_max_local_read_bandwidth.sh @@ -32,7 +32,7 @@ for read_method in "${read_methods[@]}"; do query_duration_ms >= 7e3, ProfileEvents['ReadBufferFromFileDescriptorReadBytes'] > 8e6, ProfileEvents['LocalReadThrottlerBytes'] > 8e6, - ProfileEvents['LocalReadThrottlerSleepMicroseconds'] > 7e6*0.9 + ProfileEvents['LocalReadThrottlerSleepMicroseconds'] > 7e6*0.5 FROM system.query_log WHERE current_database = '$CLICKHOUSE_DATABASE' AND query_id = '$query_id' AND type != 'QueryStart' " diff --git a/tests/queries/0_stateless/02703_max_local_write_bandwidth.sh b/tests/queries/0_stateless/02703_max_local_write_bandwidth.sh index 41165d35d37..ccde0903278 100755 --- a/tests/queries/0_stateless/02703_max_local_write_bandwidth.sh +++ b/tests/queries/0_stateless/02703_max_local_write_bandwidth.sh @@ -19,7 +19,7 @@ $CLICKHOUSE_CLIENT -nm -q " query_duration_ms >= 7e3, ProfileEvents['WriteBufferFromFileDescriptorWriteBytes'] > 8e6, ProfileEvents['LocalWriteThrottlerBytes'] > 8e6, - ProfileEvents['LocalWriteThrottlerSleepMicroseconds'] > 7e6*0.9 + ProfileEvents['LocalWriteThrottlerSleepMicroseconds'] > 7e6*0.5 FROM system.query_log WHERE current_database = '$CLICKHOUSE_DATABASE' AND query_id = '$query_id' AND type != 'QueryStart' " From 0f4e3a34e846c3a635456dbc8cafa3c12c91155b Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Tue, 13 Jun 2023 16:42:30 +0200 Subject: [PATCH 0823/1072] Update 02784_projections_read_in_order_bug.sql --- .../0_stateless/02784_projections_read_in_order_bug.sql | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/queries/0_stateless/02784_projections_read_in_order_bug.sql b/tests/queries/0_stateless/02784_projections_read_in_order_bug.sql index 52a3a6127ac..9595fc9ae08 100644 --- a/tests/queries/0_stateless/02784_projections_read_in_order_bug.sql +++ b/tests/queries/0_stateless/02784_projections_read_in_order_bug.sql @@ -37,10 +37,6 @@ create table events ( timestamp )) engine = MergeTree order by (organisation_id, session_id, timestamp) settings index_granularity = 3; - -#insert into events values (reinterpretAsUUID(0), reinterpretAsUUID(1), reinterpretAsUUID(0), toDateTime('2022-02-02', 'UTC'), toString(0), reinterpretAsUUID(0), toString(0)), (reinterpretAsUUID(0), reinterpretAsUUID(1), reinterpretAsUUID(0), now(), toString(0), reinterpretAsUUID(0), toString(0)), (reinterpretAsUUID(1), reinterpretAsUUID(0), reinterpretAsUUID(0), now(), toString(0), reinterpretAsUUID(0), toString(0)), (reinterpretAsUUID(1), reinterpretAsUUID(0), reinterpretAsUUID(0), now(), toString(0), reinterpretAsUUID(0), toString(0)), (reinterpretAsUUID(3), reinterpretAsUUID(2), reinterpretAsUUID(0), now(), toString(0), reinterpretAsUUID(0), toString(0)), (reinterpretAsUUID(3), reinterpretAsUUID(2), reinterpretAsUUID(0), toDateTime('2022-02-02'), toString(0), reinterpretAsUUID(0), toString(0)); -#insert into events values (reinterpretAsUUID(0), reinterpretAsUUID(1), reinterpretAsUUID(0), toDateTime('2022-02-02', 'UTC'), toString(0), reinterpretAsUUID(0), toString(0)), (reinterpretAsUUID(0), reinterpretAsUUID(1), reinterpretAsUUID(0), now(), toString(0), reinterpretAsUUID(0), toString(0)), (reinterpretAsUUID(1), reinterpretAsUUID(0), reinterpretAsUUID(0), now(), toString(0), reinterpretAsUUID(0), toString(0)), (reinterpretAsUUID(1), reinterpretAsUUID(0), reinterpretAsUUID(0), now(), toString(0), reinterpretAsUUID(0), toString(0)), (reinterpretAsUUID(3), reinterpretAsUUID(2), reinterpretAsUUID(0), now(), toString(0), reinterpretAsUUID(0), toString(0)), (reinterpretAsUUID(3), reinterpretAsUUID(2), reinterpretAsUUID(0), toDateTime('2022-02-02'), toString(0), reinterpretAsUUID(0), toString(0)); - insert into events values (reinterpretAsUUID(0), reinterpretAsUUID(1), reinterpretAsUUID(0), toDateTime('2022-02-02', 'UTC')(), toString(0), reinterpretAsUUID(0), toString(0)), (reinterpretAsUUID(0), reinterpretAsUUID(1), reinterpretAsUUID(0), toDateTime('2022-02-02', 'UTC')(), toString(0), reinterpretAsUUID(0), toString(0)), (reinterpretAsUUID(1), reinterpretAsUUID(0), reinterpretAsUUID(0), toDateTime('2022-02-02', 'UTC')(), toString(0), reinterpretAsUUID(0), toString(0)), (reinterpretAsUUID(1), reinterpretAsUUID(0), reinterpretAsUUID(0), toDateTime('2022-02-02', 'UTC')(), toString(0), reinterpretAsUUID(0), toString(0)), (reinterpretAsUUID(3), reinterpretAsUUID(2), reinterpretAsUUID(0), toDateTime('2022-02-02', 'UTC')(), toString(0), reinterpretAsUUID(0), toString(0)), (reinterpretAsUUID(3), reinterpretAsUUID(2), reinterpretAsUUID(0), toDateTime('2022-02-02', 'UTC')(), toString(0), reinterpretAsUUID(0), toString(0)); insert into events values (reinterpretAsUUID(0), reinterpretAsUUID(1), reinterpretAsUUID(0), toDateTime('2022-02-02', 'UTC')(), toString(0), reinterpretAsUUID(0), toString(0)), (reinterpretAsUUID(0), reinterpretAsUUID(1), reinterpretAsUUID(0), toDateTime('2022-02-02', 'UTC')(), toString(0), reinterpretAsUUID(0), toString(0)), (reinterpretAsUUID(1), reinterpretAsUUID(0), reinterpretAsUUID(0), toDateTime('2022-02-02', 'UTC')(), toString(0), reinterpretAsUUID(0), toString(0)), (reinterpretAsUUID(1), reinterpretAsUUID(0), reinterpretAsUUID(0), toDateTime('2022-02-02', 'UTC')(), toString(0), reinterpretAsUUID(0), toString(0)), (reinterpretAsUUID(3), reinterpretAsUUID(2), reinterpretAsUUID(0), toDateTime('2022-02-02', 'UTC')(), toString(0), reinterpretAsUUID(0), toString(0)), (reinterpretAsUUID(3), reinterpretAsUUID(2), reinterpretAsUUID(0), toDateTime('2022-02-02', 'UTC')(), toString(0), reinterpretAsUUID(0), toString(0)); From 52a460df67f38f92e67316115fde6139cb1c7937 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Tue, 13 Jun 2023 16:43:35 +0200 Subject: [PATCH 0824/1072] Tests with parallel replicas are no more "always green" (#50896) --- src/Interpreters/InterpreterSelectQuery.cpp | 6 +-- tests/ci/functional_test_check.py | 42 +++++-------------- .../1_stateful/00013_sorting_of_nested.sql | 3 -- .../1_stateful/00022_merge_prewhere.sql | 2 - .../1_stateful/00042_any_left_join.sql | 2 - .../1_stateful/00043_any_left_join.sql | 2 - .../1_stateful/00044_any_left_join_string.sql | 2 - .../1_stateful/00063_loyalty_joins.sql | 2 - .../00065_loyalty_with_storage_join.sql | 2 - tests/queries/1_stateful/00074_full_join.sql | 2 - .../1_stateful/00075_left_array_join.sql | 2 - ...0079_array_join_not_used_joined_column.sql | 2 - .../1_stateful/00080_array_join_and_union.sql | 2 - .../1_stateful/00084_external_aggregation.sql | 2 - tests/queries/1_stateful/00092_obfuscator.sh | 3 +- .../1_stateful/00096_obfuscator_save_load.sh | 2 - .../00146_aggregate_function_uniq.sql | 2 - .../00149_quantiles_timing_distributed.sql | 2 +- .../00152_insert_different_granularity.sql | 2 +- ...00156_max_execution_speed_sample_merge.sql | 3 -- .../1_stateful/00166_explain_estimate.sql | 2 +- tests/queries/1_stateful/00170_s3_cache.sql | 2 +- ...0171_grouping_aggregated_transform_bug.sql | 2 +- 23 files changed, 19 insertions(+), 74 deletions(-) diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index e84a400a220..1f95b1ebf9f 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -477,7 +477,7 @@ InterpreterSelectQuery::InterpreterSelectQuery( /// Check support for JOIN for parallel replicas with custom key if (joined_tables.tablesCount() > 1 && !settings.parallel_replicas_custom_key.value.empty()) { - LOG_WARNING(log, "JOINs are not supported with parallel_replicas_custom_key. Query will be executed without using them."); + LOG_DEBUG(log, "JOINs are not supported with parallel_replicas_custom_key. Query will be executed without using them."); context->setSetting("parallel_replicas_custom_key", String{""}); } @@ -487,7 +487,7 @@ InterpreterSelectQuery::InterpreterSelectQuery( { if (settings.allow_experimental_parallel_reading_from_replicas == 1) { - LOG_WARNING(log, "FINAL modifier is not supported with parallel replicas. Query will be executed without using them."); + LOG_DEBUG(log, "FINAL modifier is not supported with parallel replicas. Query will be executed without using them."); context->setSetting("allow_experimental_parallel_reading_from_replicas", Field(0)); context->setSetting("parallel_replicas_custom_key", String{""}); } @@ -503,7 +503,7 @@ InterpreterSelectQuery::InterpreterSelectQuery( { if (settings.allow_experimental_parallel_reading_from_replicas == 1) { - LOG_WARNING(log, "To use parallel replicas with plain MergeTree tables please enable setting `parallel_replicas_for_non_replicated_merge_tree`. For now query will be executed without using them."); + LOG_DEBUG(log, "To use parallel replicas with plain MergeTree tables please enable setting `parallel_replicas_for_non_replicated_merge_tree`. For now query will be executed without using them."); context->setSetting("allow_experimental_parallel_reading_from_replicas", Field(0)); } else if (settings.allow_experimental_parallel_reading_from_replicas == 2) diff --git a/tests/ci/functional_test_check.py b/tests/ci/functional_test_check.py index 037bb13f1f8..864c3a81acf 100644 --- a/tests/ci/functional_test_check.py +++ b/tests/ci/functional_test_check.py @@ -378,34 +378,16 @@ def main(): print(f"::notice:: {check_name} Report url: {report_url}") if args.post_commit_status == "commit_status": - if "parallelreplicas" in check_name.lower(): - post_commit_status( - commit, - "success", - report_url, - description, - check_name_with_group, - pr_info, - ) - else: - post_commit_status( - commit, state, report_url, description, check_name_with_group, pr_info - ) + post_commit_status( + commit, state, report_url, description, check_name_with_group, pr_info + ) elif args.post_commit_status == "file": - if "parallelreplicas" in check_name.lower(): - post_commit_status_to_file( - post_commit_path, - description, - "success", - report_url, - ) - else: - post_commit_status_to_file( - post_commit_path, - description, - state, - report_url, - ) + post_commit_status_to_file( + post_commit_path, + description, + state, + report_url, + ) else: raise Exception( f'Unknown post_commit_status option "{args.post_commit_status}"' @@ -423,11 +405,7 @@ def main(): ch_helper.insert_events_into(db="default", table="checks", events=prepared_events) if state != "success": - # Parallel replicas are always green for now - if ( - FORCE_TESTS_LABEL in pr_info.labels - or "parallelreplicas" in check_name.lower() - ): + if FORCE_TESTS_LABEL in pr_info.labels: print(f"'{FORCE_TESTS_LABEL}' enabled, will report success") else: sys.exit(1) diff --git a/tests/queries/1_stateful/00013_sorting_of_nested.sql b/tests/queries/1_stateful/00013_sorting_of_nested.sql index f97120e2b98..7f4a5002a7b 100644 --- a/tests/queries/1_stateful/00013_sorting_of_nested.sql +++ b/tests/queries/1_stateful/00013_sorting_of_nested.sql @@ -1,4 +1 @@ --- Tags: no-parallel-replicas - SELECT ParsedParams.Key1 FROM test.visits FINAL WHERE VisitID != 0 AND notEmpty(ParsedParams.Key1) ORDER BY VisitID LIMIT 10 - diff --git a/tests/queries/1_stateful/00022_merge_prewhere.sql b/tests/queries/1_stateful/00022_merge_prewhere.sql index 400a896d5a8..74a3677b68e 100644 --- a/tests/queries/1_stateful/00022_merge_prewhere.sql +++ b/tests/queries/1_stateful/00022_merge_prewhere.sql @@ -1,5 +1,3 @@ --- Tags: no-parallel-replicas - DROP TABLE IF EXISTS test.merge_hits; CREATE TABLE IF NOT EXISTS test.merge_hits AS test.hits ENGINE = Merge(test, '^hits$'); SELECT count() FROM test.merge_hits WHERE AdvEngineID = 2; diff --git a/tests/queries/1_stateful/00042_any_left_join.sql b/tests/queries/1_stateful/00042_any_left_join.sql index c7c0f0f987a..b87cf88f007 100644 --- a/tests/queries/1_stateful/00042_any_left_join.sql +++ b/tests/queries/1_stateful/00042_any_left_join.sql @@ -1,5 +1,3 @@ --- Tags: no-parallel-replicas - SELECT EventDate, hits, diff --git a/tests/queries/1_stateful/00043_any_left_join.sql b/tests/queries/1_stateful/00043_any_left_join.sql index 6b8cce54051..704d38f727a 100644 --- a/tests/queries/1_stateful/00043_any_left_join.sql +++ b/tests/queries/1_stateful/00043_any_left_join.sql @@ -1,5 +1,3 @@ --- Tags: no-parallel-replicas - SELECT EventDate, count() AS hits, diff --git a/tests/queries/1_stateful/00044_any_left_join_string.sql b/tests/queries/1_stateful/00044_any_left_join_string.sql index ceb7a1c1783..a4f2e9e1b96 100644 --- a/tests/queries/1_stateful/00044_any_left_join_string.sql +++ b/tests/queries/1_stateful/00044_any_left_join_string.sql @@ -1,5 +1,3 @@ --- Tags: no-parallel-replicas - SELECT domain, hits, diff --git a/tests/queries/1_stateful/00063_loyalty_joins.sql b/tests/queries/1_stateful/00063_loyalty_joins.sql index 44f0767a87a..44b575cab85 100644 --- a/tests/queries/1_stateful/00063_loyalty_joins.sql +++ b/tests/queries/1_stateful/00063_loyalty_joins.sql @@ -1,5 +1,3 @@ --- Tags: no-parallel-replicas - SET any_join_distinct_right_table_keys = 1; SET joined_subquery_requires_alias = 0; diff --git a/tests/queries/1_stateful/00065_loyalty_with_storage_join.sql b/tests/queries/1_stateful/00065_loyalty_with_storage_join.sql index 35f0c7b60b9..a0f41f8aa8d 100644 --- a/tests/queries/1_stateful/00065_loyalty_with_storage_join.sql +++ b/tests/queries/1_stateful/00065_loyalty_with_storage_join.sql @@ -1,5 +1,3 @@ --- Tags: no-parallel-replicas - USE test; DROP TABLE IF EXISTS join; diff --git a/tests/queries/1_stateful/00074_full_join.sql b/tests/queries/1_stateful/00074_full_join.sql index c1d9e4be1a4..f049be2a74d 100644 --- a/tests/queries/1_stateful/00074_full_join.sql +++ b/tests/queries/1_stateful/00074_full_join.sql @@ -1,5 +1,3 @@ --- Tags: no-parallel-replicas - set any_join_distinct_right_table_keys = 1; set joined_subquery_requires_alias = 0; diff --git a/tests/queries/1_stateful/00075_left_array_join.sql b/tests/queries/1_stateful/00075_left_array_join.sql index 3540d791157..1fd045a26bf 100644 --- a/tests/queries/1_stateful/00075_left_array_join.sql +++ b/tests/queries/1_stateful/00075_left_array_join.sql @@ -1,4 +1,2 @@ --- Tags: no-parallel-replicas - SELECT UserID, EventTime::DateTime('Asia/Dubai'), pp.Key1, pp.Key2, ParsedParams.Key1 FROM test.hits ARRAY JOIN ParsedParams AS pp WHERE CounterID = 1704509 ORDER BY UserID, EventTime, pp.Key1, pp.Key2 LIMIT 100; SELECT UserID, EventTime::DateTime('Asia/Dubai'), pp.Key1, pp.Key2, ParsedParams.Key1 FROM test.hits LEFT ARRAY JOIN ParsedParams AS pp WHERE CounterID = 1704509 ORDER BY UserID, EventTime, pp.Key1, pp.Key2 LIMIT 100; diff --git a/tests/queries/1_stateful/00079_array_join_not_used_joined_column.sql b/tests/queries/1_stateful/00079_array_join_not_used_joined_column.sql index 9431e1cf596..8e6742bb1e1 100644 --- a/tests/queries/1_stateful/00079_array_join_not_used_joined_column.sql +++ b/tests/queries/1_stateful/00079_array_join_not_used_joined_column.sql @@ -1,5 +1,3 @@ --- Tags: no-parallel-replicas - SELECT PP.Key1 AS `ym:s:paramsLevel1`, sum(arrayAll(`x_1` -> `x_1`= '', ParsedParams.Key2)) AS `ym:s:visits` FROM test.hits ARRAY JOIN ParsedParams AS `PP` WHERE CounterID = 1704509 GROUP BY `ym:s:paramsLevel1` ORDER BY PP.Key1, `ym:s:visits` LIMIT 0, 100; SELECT PP.Key1 AS x1, ParsedParams.Key2 AS x2 FROM test.hits ARRAY JOIN ParsedParams AS PP WHERE CounterID = 1704509 ORDER BY x1, x2 LIMIT 10; SELECT ParsedParams.Key2 AS x FROM test.hits ARRAY JOIN ParsedParams AS PP ORDER BY x DESC LIMIT 10; diff --git a/tests/queries/1_stateful/00080_array_join_and_union.sql b/tests/queries/1_stateful/00080_array_join_and_union.sql index 2f2e5e9324f..d9aa1cc17cc 100644 --- a/tests/queries/1_stateful/00080_array_join_and_union.sql +++ b/tests/queries/1_stateful/00080_array_join_and_union.sql @@ -1,3 +1 @@ --- Tags: no-parallel-replicas - SELECT count() FROM (SELECT Goals.ID FROM test.visits ARRAY JOIN Goals WHERE CounterID = 842440 LIMIT 10 UNION ALL SELECT Goals.ID FROM test.visits ARRAY JOIN Goals WHERE CounterID = 842440 LIMIT 10); diff --git a/tests/queries/1_stateful/00084_external_aggregation.sql b/tests/queries/1_stateful/00084_external_aggregation.sql index 330aa158cf7..b3922eae049 100644 --- a/tests/queries/1_stateful/00084_external_aggregation.sql +++ b/tests/queries/1_stateful/00084_external_aggregation.sql @@ -1,5 +1,3 @@ --- Tags: no-random-settings, no-parallel-replicas - SET max_bytes_before_external_group_by = 200000000; SET max_memory_usage = 1500000000; diff --git a/tests/queries/1_stateful/00092_obfuscator.sh b/tests/queries/1_stateful/00092_obfuscator.sh index f19473f01ac..f9e0098a46c 100755 --- a/tests/queries/1_stateful/00092_obfuscator.sh +++ b/tests/queries/1_stateful/00092_obfuscator.sh @@ -1,6 +1,5 @@ #!/usr/bin/env bash -# Tags: no-parallel-replicas -# clickhouse-local may not work with parallel replicas + CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh diff --git a/tests/queries/1_stateful/00096_obfuscator_save_load.sh b/tests/queries/1_stateful/00096_obfuscator_save_load.sh index 1bb212e1bba..a88dfcdb9b9 100755 --- a/tests/queries/1_stateful/00096_obfuscator_save_load.sh +++ b/tests/queries/1_stateful/00096_obfuscator_save_load.sh @@ -1,6 +1,4 @@ #!/usr/bin/env bash -# Tags: no-parallel-replicas -# clickhouse-local may not work with parallel replicas CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh diff --git a/tests/queries/1_stateful/00146_aggregate_function_uniq.sql b/tests/queries/1_stateful/00146_aggregate_function_uniq.sql index 2cab6e70d22..fd3fde7636d 100644 --- a/tests/queries/1_stateful/00146_aggregate_function_uniq.sql +++ b/tests/queries/1_stateful/00146_aggregate_function_uniq.sql @@ -1,5 +1,3 @@ --- Tags: no-parallel-replicas - SELECT RegionID, uniqHLL12(WatchID) AS X FROM remote('127.0.0.{1,2}', test, hits) GROUP BY RegionID HAVING X > 100000 ORDER BY RegionID ASC; SELECT RegionID, uniqCombined(WatchID) AS X FROM remote('127.0.0.{1,2}', test, hits) GROUP BY RegionID HAVING X > 100000 ORDER BY RegionID ASC; SELECT abs(uniq(WatchID) - uniqExact(WatchID)) FROM test.hits; diff --git a/tests/queries/1_stateful/00149_quantiles_timing_distributed.sql b/tests/queries/1_stateful/00149_quantiles_timing_distributed.sql index 5d2476226ba..6f910646fb7 100644 --- a/tests/queries/1_stateful/00149_quantiles_timing_distributed.sql +++ b/tests/queries/1_stateful/00149_quantiles_timing_distributed.sql @@ -1,4 +1,4 @@ --- Tags: distributed, no-parallel-replicas +-- Tags: distributed SELECT sum(cityHash64(*)) FROM (SELECT CounterID, quantileTiming(0.5)(SendTiming), count() FROM remote('127.0.0.{1,2,3,4,5,6,7,8,9,10}', test.hits) WHERE SendTiming != -1 GROUP BY CounterID); SELECT sum(cityHash64(*)) FROM (SELECT CounterID, quantileTiming(0.5)(SendTiming), count() FROM remote('127.0.0.{1,2,3,4,5,6,7,8,9,10}', test.hits) WHERE SendTiming != -1 GROUP BY CounterID) SETTINGS optimize_aggregation_in_order = 1; diff --git a/tests/queries/1_stateful/00152_insert_different_granularity.sql b/tests/queries/1_stateful/00152_insert_different_granularity.sql index 35483149498..294d71b384b 100644 --- a/tests/queries/1_stateful/00152_insert_different_granularity.sql +++ b/tests/queries/1_stateful/00152_insert_different_granularity.sql @@ -1,4 +1,4 @@ --- Tags: no-tsan, no-replicated-database, no-parallel, no-parallel-replicas +-- Tags: no-tsan, no-replicated-database, no-parallel -- Tag no-replicated-database: Fails due to additional replicas or shards DROP TABLE IF EXISTS fixed_granularity_table; diff --git a/tests/queries/1_stateful/00156_max_execution_speed_sample_merge.sql b/tests/queries/1_stateful/00156_max_execution_speed_sample_merge.sql index 32079111f6c..e325c18200b 100644 --- a/tests/queries/1_stateful/00156_max_execution_speed_sample_merge.sql +++ b/tests/queries/1_stateful/00156_max_execution_speed_sample_merge.sql @@ -1,6 +1,3 @@ --- Tags: no-parallel-replicas --- Merge tables doesn't work with parallel replicas currently - SET max_execution_speed = 4000000, timeout_before_checking_execution_speed = 0; CREATE TEMPORARY TABLE times (t DateTime); diff --git a/tests/queries/1_stateful/00166_explain_estimate.sql b/tests/queries/1_stateful/00166_explain_estimate.sql index abac92ecb2e..c4071271736 100644 --- a/tests/queries/1_stateful/00166_explain_estimate.sql +++ b/tests/queries/1_stateful/00166_explain_estimate.sql @@ -1,4 +1,4 @@ --- Tags: no-replicated-database, no-parallel-replicas +-- Tags: no-replicated-database -- Tag no-replicated-database: Requires investigation EXPLAIN ESTIMATE SELECT count() FROM test.hits WHERE CounterID = 29103473; diff --git a/tests/queries/1_stateful/00170_s3_cache.sql b/tests/queries/1_stateful/00170_s3_cache.sql index 43e85af0bc3..23663a1844d 100644 --- a/tests/queries/1_stateful/00170_s3_cache.sql +++ b/tests/queries/1_stateful/00170_s3_cache.sql @@ -1,4 +1,4 @@ --- Tags: no-parallel, no-random-settings, no-parallel-replicas +-- Tags: no-parallel, no-random-settings -- { echo } diff --git a/tests/queries/1_stateful/00171_grouping_aggregated_transform_bug.sql b/tests/queries/1_stateful/00171_grouping_aggregated_transform_bug.sql index 07788af927e..7068780a1b1 100644 --- a/tests/queries/1_stateful/00171_grouping_aggregated_transform_bug.sql +++ b/tests/queries/1_stateful/00171_grouping_aggregated_transform_bug.sql @@ -1,4 +1,4 @@ --- Tags: distributed, no-parallel-replicas +-- Tags: distributed SELECT sum(cityHash64(*)) FROM (SELECT CounterID, quantileTiming(0.5)(SendTiming), count() FROM remote('127.0.0.{1,2,3,4,5,6,7,8,9,10}', test.hits) WHERE SendTiming != -1 GROUP BY CounterID) SETTINGS max_block_size = 63169; SELECT sum(cityHash64(*)) FROM (SELECT CounterID, quantileTiming(0.5)(SendTiming), count() FROM remote('127.0.0.{1,2,3,4,5,6,7,8,9,10}', test.hits) WHERE SendTiming != -1 GROUP BY CounterID) SETTINGS optimize_aggregation_in_order = 1, max_block_size = 63169; From 2e1f56ae336e198d8f388ce815292ec049a7fdc5 Mon Sep 17 00:00:00 2001 From: avogar Date: Tue, 13 Jun 2023 14:43:50 +0000 Subject: [PATCH 0825/1072] Address comments --- docs/en/engines/table-engines/special/file.md | 2 +- docs/en/operations/settings/settings.md | 6 +- docs/en/sql-reference/table-functions/file.md | 2 +- src/Storages/HDFS/StorageHDFS.cpp | 90 ++++++++-------- src/Storages/StorageFile.cpp | 29 ++--- src/Storages/StorageS3.cpp | 92 +++++++++------- src/Storages/StorageURL.cpp | 101 ++++++++++-------- src/Storages/StorageURL.h | 2 +- 8 files changed, 177 insertions(+), 147 deletions(-) diff --git a/docs/en/engines/table-engines/special/file.md b/docs/en/engines/table-engines/special/file.md index cf325961b6a..27945b30c03 100644 --- a/docs/en/engines/table-engines/special/file.md +++ b/docs/en/engines/table-engines/special/file.md @@ -99,4 +99,4 @@ For partitioning by month, use the `toYYYYMM(date_column)` expression, where `da - [engine_file_truncate_on_insert](/docs/en/operations/settings/settings.md#engine-file-truncate-on-insert) - allows to truncate file before insert into it. Disabled by default. - [engine_file_allow_create_multiple_files](/docs/en/operations/settings/settings.md#engine_file_allow_create_multiple_files) - allows to create a new file on each insert if format has suffix. Disabled by default. - [engine_file_skip_empty_files](/docs/en/operations/settings/settings.md#engine_file_skip_empty_files) - allows to skip empty files while reading. Disabled by default. -- [storage_file_read_method](/docs/en/operations/settings/settings.md#engine-file-emptyif-not-exists) - method of reading data from storage file, one of: read, pread, mmap (only for clickhouse-local). Default value: `pread` for clickhouse-server, `mmap` for clickhouse-local. +- [storage_file_read_method](/docs/en/operations/settings/settings.md#engine-file-emptyif-not-exists) - method of reading data from storage file, one of: `read`, `pread`, `mmap`. The mmap method does not apply to clickhouse-server (it's intended for clickhouse-local). Default value: `pread` for clickhouse-server, `mmap` for clickhouse-local. diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index a138f8f5515..7a28e33bf90 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -3332,7 +3332,7 @@ Enables or disables creating a new file on each insert in file engine tables if Possible values: - 0 — `INSERT` query appends new data to the end of the file. -- 1 — `INSERT` query replaces existing content of the file with the new data. +- 1 — `INSERT` query creates a new file. Default value: `0`. @@ -3370,7 +3370,7 @@ initial: `data.Parquet.gz` -> `data.1.Parquet.gz` -> `data.2.Parquet.gz`, etc. Possible values: - 0 — `INSERT` query appends new data to the end of the file. -- 1 — `INSERT` query replaces existing content of the file with the new data. +- 1 — `INSERT` query creates a new file. Default value: `0`. @@ -3402,7 +3402,7 @@ initial: `data.Parquet.gz` -> `data.1.Parquet.gz` -> `data.2.Parquet.gz`, etc. Possible values: - 0 — `INSERT` query appends new data to the end of the file. -- 1 — `INSERT` query replaces existing content of the file with the new data. +- 1 — `INSERT` query creates a new file. Default value: `0`. diff --git a/docs/en/sql-reference/table-functions/file.md b/docs/en/sql-reference/table-functions/file.md index 749aafb6d00..f25da96fddb 100644 --- a/docs/en/sql-reference/table-functions/file.md +++ b/docs/en/sql-reference/table-functions/file.md @@ -18,7 +18,7 @@ file(path [,format] [,structure] [,compression]) **Parameters** -- `path` — The relative path to the file from [user_files_path](/docs/en/operations/server-configuration-parameters/settings.md/#server_configuration_parameters-user_files_path). Path to file support following globs in read-only mode: `*`, `?`, `{abc,def}` and `{N..M}` where `N`, `M` — numbers, `'abc', 'def'` — strings. +- `path` — The relative path to the file from [user_files_path](/docs/en/operations/server-configuration-parameters/settings.md#server_configuration_parameters-user_files_path). Path to file support following globs in read-only mode: `*`, `?`, `{abc,def}` and `{N..M}` where `N`, `M` — numbers, `'abc', 'def'` — strings. - `format` — The [format](/docs/en/interfaces/formats.md#formats) of the file. - `structure` — Structure of the table. Format: `'column1_name column1_type, column2_name column2_type, ...'`. - `compression` — The existing compression type when used in a `SELECT` query, or the desired compression type when used in an `INSERT` query. The supported compression types are `gz`, `br`, `xz`, `zst`, `lz4`, and `bz2`. diff --git a/src/Storages/HDFS/StorageHDFS.cpp b/src/Storages/HDFS/StorageHDFS.cpp index 9ea1d805db5..dd3e8fecfaa 100644 --- a/src/Storages/HDFS/StorageHDFS.cpp +++ b/src/Storages/HDFS/StorageHDFS.cpp @@ -140,14 +140,6 @@ namespace return LSWithRegexpMatching("/", fs, path_from_uri); } - - size_t getFileSize(const String & path_from_uri, const String & uri_without_path, ContextPtr context) - { - HDFSBuilderWrapper builder = createHDFSBuilder(uri_without_path + "/", context->getGlobalContext()->getConfigRef()); - HDFSFSPtr fs = createHDFSFS(builder.get()); - auto * info = hdfsGetPathInfo(fs.get(), path_from_uri.data()); - return info->mSize; - } } StorageHDFS::StorageHDFS( @@ -218,26 +210,36 @@ ColumnsDescription StorageHDFS::getTableStructureFromData( ReadBufferIterator read_buffer_iterator = [&, my_uri_without_path = uri_without_path, it = paths_with_info.begin(), first = true]( - ColumnsDescription & columns) mutable -> std::unique_ptr + ColumnsDescription &) mutable -> std::unique_ptr { - if (it == paths_with_info.end()) + PathWithInfo path_with_info; + std::unique_ptr buf; + while (true) { - if (first) - throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, - "Cannot extract table structure from {} format file, because all files are empty. " - "You must specify table structure manually", format); - return nullptr; + if (it == paths_with_info.end()) + { + if (first) + throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "Cannot extract table structure from {} format file, because all files are empty. " + "You must specify table structure manually", format); + return nullptr; + } + + path_with_info = *it++; + if (ctx->getSettingsRef().hdfs_skip_empty_files && path_with_info.info && path_with_info.info->size == 0) + continue; + + auto compression = chooseCompressionMethod(path_with_info.path, compression_method); + auto impl = std::make_unique(my_uri_without_path, path_with_info.path, ctx->getGlobalContext()->getConfigRef(), ctx->getReadSettings()); + const Int64 zstd_window_log_max = ctx->getSettingsRef().zstd_window_log_max; + buf = wrapReadBufferWithCompressionMethod(std::move(impl), compression, static_cast(zstd_window_log_max)); + + if (!ctx->getSettingsRef().hdfs_skip_empty_files || !buf->eof()) + { + first = false; + return buf; + } } - - auto path_with_info = *it++; - if (ctx->getSettingsRef().hdfs_skip_empty_files && path_with_info.info && path_with_info.info->size == 0) - return read_buffer_iterator(columns); - - first = false; - auto compression = chooseCompressionMethod(path_with_info.path, compression_method); - auto impl = std::make_unique(my_uri_without_path, path_with_info.path, ctx->getGlobalContext()->getConfigRef(), ctx->getReadSettings()); - const Int64 zstd_window_log_max = ctx->getSettingsRef().zstd_window_log_max; - return wrapReadBufferWithCompressionMethod(std::move(impl), compression, static_cast(zstd_window_log_max)); }; ColumnsDescription columns; @@ -362,26 +364,28 @@ HDFSSource::HDFSSource( bool HDFSSource::initialize() { - auto path_with_info = (*file_iterator)(); - if (path_with_info.path.empty()) - return false; - - current_path = path_with_info.path; - const auto [path_from_uri, uri_without_path] = getPathFromUriAndUriWithoutPath(current_path); - - if (getContext()->getSettingsRef().hdfs_skip_empty_files) + StorageHDFS::PathWithInfo path_with_info; + bool skip_empty_files = getContext()->getSettingsRef().hdfs_skip_empty_files; + while (true) { - auto file_size = path_with_info.info ? path_with_info.info->size : getFileSize(path_from_uri, uri_without_path, getContext()); - /// If file is empty and hdfs_skip_empty_files=1, skip it and go to the next file. - if (file_size == 0) - return initialize(); - } + path_with_info = (*file_iterator)(); + if (path_with_info.path.empty()) + return false; - auto compression = chooseCompressionMethod(path_from_uri, storage->compression_method); - auto impl = std::make_unique( - uri_without_path, path_from_uri, getContext()->getGlobalContext()->getConfigRef(), getContext()->getReadSettings()); - const Int64 zstd_window_log_max = getContext()->getSettingsRef().zstd_window_log_max; - read_buf = wrapReadBufferWithCompressionMethod(std::move(impl), compression, static_cast(zstd_window_log_max)); + if (path_with_info.info && skip_empty_files && path_with_info.info->size == 0) + continue; + + current_path = path_with_info.path; + const auto [path_from_uri, uri_without_path] = getPathFromUriAndUriWithoutPath(current_path); + + auto compression = chooseCompressionMethod(path_from_uri, storage->compression_method); + auto impl = std::make_unique( + uri_without_path, path_from_uri, getContext()->getGlobalContext()->getConfigRef(), getContext()->getReadSettings()); + const Int64 zstd_window_log_max = getContext()->getSettingsRef().zstd_window_log_max; + read_buf = wrapReadBufferWithCompressionMethod(std::move(impl), compression, static_cast(zstd_window_log_max)); + if (!skip_empty_files || !read_buf->eof()) + break; + } auto input_format = getContext()->getInputFormat(storage->format_name, *read_buf, block_for_format, max_block_size); diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index 7fc143a6122..06f9d071706 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -404,21 +404,26 @@ ColumnsDescription StorageFile::getTableStructureFromFile( if (context->getSettingsRef().schema_inference_use_cache_for_file) columns_from_cache = tryGetColumnsFromCache(paths, format, format_settings, context); - ReadBufferIterator read_buffer_iterator = [&, it = paths.begin(), first = true](ColumnsDescription & columns) mutable -> std::unique_ptr + ReadBufferIterator read_buffer_iterator = [&, it = paths.begin(), first = true](ColumnsDescription &) mutable -> std::unique_ptr { - if (it == paths.end()) + String path; + struct stat file_stat; + do { - if (first) - throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, - "Cannot extract table structure from {} format file, because all files are empty. You must specify table structure manually", - format); - return nullptr; - } + if (it == paths.end()) + { + if (first) + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "Cannot extract table structure from {} format file, because all files are empty. You must specify table structure manually", + format); + return nullptr; + } - auto path = *it++; - auto file_stat = getFileStat(path, false, -1, "File"); - if (context->getSettingsRef().engine_file_skip_empty_files && file_stat.st_size == 0) - return read_buffer_iterator(columns); + path = *it++; + file_stat = getFileStat(path, false, -1, "File"); + } + while (context->getSettingsRef().engine_file_skip_empty_files && file_stat.st_size == 0); first = false; return createReadBuffer(path, file_stat, false, -1, compression_method, context); diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index c30973d99e1..5c5895744ac 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -575,19 +575,21 @@ StorageS3Source::StorageS3Source( StorageS3Source::ReaderHolder StorageS3Source::createReader() { - auto [current_key, info] = (*file_iterator)(); - if (current_key.empty()) - return {}; + KeyWithInfo key_with_info; + size_t object_size; + do + { + key_with_info = (*file_iterator)(); + if (key_with_info.key.empty()) + return {}; - size_t object_size = info ? info->size : S3::getObjectSize(*client, bucket, current_key, version_id, request_settings); + object_size = key_with_info.info ? key_with_info.info->size : S3::getObjectSize(*client, bucket, key_with_info.key, version_id, request_settings); + } + while (getContext()->getSettingsRef().s3_skip_empty_files && object_size == 0); - /// If object is empty and s3_skip_empty_files=1, skip it and go to the next key. - if (getContext()->getSettingsRef().s3_skip_empty_files && object_size == 0) - return createReader(); + auto compression_method = chooseCompressionMethod(key_with_info.key, compression_hint); - auto compression_method = chooseCompressionMethod(current_key, compression_hint); - - auto read_buf = createS3ReadBuffer(current_key, object_size); + auto read_buf = createS3ReadBuffer(key_with_info.key, object_size); auto input_format = FormatFactory::instance().getInput( format, *read_buf, sample_block, getContext(), max_block_size, format_settings, std::nullopt, std::nullopt, @@ -606,7 +608,7 @@ StorageS3Source::ReaderHolder StorageS3Source::createReader() auto pipeline = std::make_unique(QueryPipelineBuilder::getPipeline(std::move(builder))); auto current_reader = std::make_unique(*pipeline); - return ReaderHolder{fs::path(bucket) / current_key, std::move(read_buf), std::move(pipeline), std::move(current_reader)}; + return ReaderHolder{fs::path(bucket) / key_with_info.key, std::move(read_buf), std::move(pipeline), std::move(current_reader)}; } std::future StorageS3Source::createReaderAsync() @@ -1451,41 +1453,53 @@ ColumnsDescription StorageS3::getTableStructureFromDataImpl( ReadBufferIterator read_buffer_iterator = [&, first = true](ColumnsDescription & cached_columns) mutable -> std::unique_ptr { - auto [key, info] = (*file_iterator)(); + StorageS3Source::KeyWithInfo key_with_info; + std::unique_ptr buf; - if (key.empty()) + while (true) { - if (first) - throw Exception( - ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, - "Cannot extract table structure from {} format file, because there are no files with provided path " - "in S3 or all files are empty. You must specify table structure manually", configuration.format); + key_with_info = (*file_iterator)(); - return nullptr; - } - - if (ctx->getSettingsRef().s3_skip_empty_files && info->size == 0) - return read_buffer_iterator(cached_columns); - - /// S3 file iterator could get new keys after new iteration, check them in schema cache. - if (ctx->getSettingsRef().schema_inference_use_cache_for_s3 && read_keys.size() > prev_read_keys_size) - { - columns_from_cache = tryGetColumnsFromCache(read_keys.begin() + prev_read_keys_size, read_keys.end(), configuration, format_settings, ctx); - prev_read_keys_size = read_keys.size(); - if (columns_from_cache) + if (key_with_info.key.empty()) { - cached_columns = *columns_from_cache; + if (first) + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "Cannot extract table structure from {} format file, because there are no files with provided path " + "in S3 or all files are empty. You must specify table structure manually", + configuration.format); + return nullptr; } - } - first = false; - int zstd_window_log_max = static_cast(ctx->getSettingsRef().zstd_window_log_max); - return wrapReadBufferWithCompressionMethod( - std::make_unique( - configuration.client, configuration.url.bucket, key, configuration.url.version_id, configuration.request_settings, ctx->getReadSettings()), - chooseCompressionMethod(key, configuration.compression_method), - zstd_window_log_max); + /// S3 file iterator could get new keys after new iteration, check them in schema cache. + if (ctx->getSettingsRef().schema_inference_use_cache_for_s3 && read_keys.size() > prev_read_keys_size) + { + columns_from_cache = tryGetColumnsFromCache(read_keys.begin() + prev_read_keys_size, read_keys.end(), configuration, format_settings, ctx); + prev_read_keys_size = read_keys.size(); + if (columns_from_cache) + { + cached_columns = *columns_from_cache; + return nullptr; + } + } + + if (ctx->getSettingsRef().s3_skip_empty_files && key_with_info.info && key_with_info.info->size == 0) + continue; + + int zstd_window_log_max = static_cast(ctx->getSettingsRef().zstd_window_log_max); + buf = wrapReadBufferWithCompressionMethod( + std::make_unique( + configuration.client, configuration.url.bucket, key_with_info.key, configuration.url.version_id, configuration.request_settings, ctx->getReadSettings()), + chooseCompressionMethod(key_with_info.key, configuration.compression_method), + zstd_window_log_max); + + if (!ctx->getSettingsRef().s3_skip_empty_files || !buf->eof()) + { + first = false; + return buf; + } + } }; ColumnsDescription columns; diff --git a/src/Storages/StorageURL.cpp b/src/Storages/StorageURL.cpp index fc5525b42d2..4e75a4d54cb 100644 --- a/src/Storages/StorageURL.cpp +++ b/src/Storages/StorageURL.cpp @@ -241,30 +241,34 @@ StorageURLSource::StorageURLSource( /// Lazy initialization. We should not perform requests in constructor, because we need to do it in query pipeline. initialize = [=, this]() { - const auto current_uri_options = (*uri_iterator)(); - if (current_uri_options.empty()) - return false; + std::vector current_uri_options; + std::pair> uri_and_buf; + do + { + current_uri_options = (*uri_iterator)(); + if (current_uri_options.empty()) + return false; - auto first_option = uri_options.begin(); - auto [actual_uri, buf] = getFirstAvailableURIAndReadBuffer( - first_option, - current_uri_options.end(), - context, - params, - http_method, - callback, - timeouts, - credentials, - headers, - glob_url, - current_uri_options.size() == 1); + auto first_option = current_uri_options.cbegin(); + uri_and_buf = getFirstAvailableURIAndReadBuffer( + first_option, + current_uri_options.end(), + context, + params, + http_method, + callback, + timeouts, + credentials, + headers, + glob_url, + current_uri_options.size() == 1); - /// If file is empty and engine_url_skip_empty_files=1, skip it and go to the next file. - if (context->getSettingsRef().engine_url_skip_empty_files && getFileSizeFromReadBuffer(*buf) == 0) - return initialize(); + /// If file is empty and engine_url_skip_empty_files=1, skip it and go to the next file. + } + while (context->getSettingsRef().engine_url_skip_empty_files && uri_and_buf.second->eof()); - curr_uri = actual_uri; - read_buf = std::move(buf); + curr_uri = uri_and_buf.first; + read_buf = std::move(uri_and_buf.second); try { @@ -347,7 +351,7 @@ Chunk StorageURLSource::generate() return {}; } -std::tuple> StorageURLSource::getFirstAvailableURIAndReadBuffer( +std::pair> StorageURLSource::getFirstAvailableURIAndReadBuffer( std::vector::const_iterator & option, const std::vector::const_iterator & end, ContextPtr context, @@ -590,38 +594,41 @@ ColumnsDescription IStorageURLBase::getTableStructureFromData( if (context->getSettingsRef().schema_inference_use_cache_for_url) columns_from_cache = tryGetColumnsFromCache(urls_to_check, headers, credentials, format, format_settings, context); - ReadBufferIterator read_buffer_iterator = [&, it = urls_to_check.cbegin(), first = true](ColumnsDescription & columns) mutable -> std::unique_ptr + ReadBufferIterator read_buffer_iterator = [&, it = urls_to_check.cbegin(), first = true](ColumnsDescription &) mutable -> std::unique_ptr { - if (it == urls_to_check.cend()) + std::pair> uri_and_buf; + do { - if (first) - throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, - "Cannot extract table structure from {} format file, because all files are empty. " - "You must specify table structure manually", format); - return nullptr; - } + if (it == urls_to_check.cend()) + { + if (first) + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "Cannot extract table structure from {} format file, because all files are empty. " + "You must specify table structure manually", + format); + return nullptr; + } - auto [_, buf] = StorageURLSource::getFirstAvailableURIAndReadBuffer( - it, - urls_to_check.cend(), - context, - {}, - Poco::Net::HTTPRequest::HTTP_GET, - {}, - getHTTPTimeouts(context), - credentials, - headers, - false, - false); + uri_and_buf = StorageURLSource::getFirstAvailableURIAndReadBuffer( + it, + urls_to_check.cend(), + context, + {}, + Poco::Net::HTTPRequest::HTTP_GET, + {}, + getHTTPTimeouts(context), + credentials, + headers, + false, + false); - ++it; - - if (context->getSettingsRef().engine_url_skip_empty_files && buf_factory->getFileSize() == 0) - return read_buffer_iterator(columns); + ++it; + } while (context->getSettingsRef().engine_url_skip_empty_files && uri_and_buf.second->eof()); first = false; return wrapReadBufferWithCompressionMethod( - std::move(buf), + std::move(uri_and_buf.second), compression_method, static_cast(context->getSettingsRef().zstd_window_log_max)); }; diff --git a/src/Storages/StorageURL.h b/src/Storages/StorageURL.h index 50928ed6962..a5c1174377b 100644 --- a/src/Storages/StorageURL.h +++ b/src/Storages/StorageURL.h @@ -183,7 +183,7 @@ public: static Block getHeader(Block sample_block, const std::vector & requested_virtual_columns); - static std::tuple> getFirstAvailableURIAndReadBuffer( + static std::pair> getFirstAvailableURIAndReadBuffer( std::vector::const_iterator & option, const std::vector::const_iterator & end, ContextPtr context, From 46fbe7fb26d06abdb8f5e21f00d5dd215a084b9b Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Tue, 13 Jun 2023 15:01:51 +0000 Subject: [PATCH 0826/1072] 01746_convert_type_with_default: Temporarily disable flaky test --- .../0_stateless/01746_convert_type_with_default.reference | 1 - tests/queries/0_stateless/01746_convert_type_with_default.sql | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/queries/0_stateless/01746_convert_type_with_default.reference b/tests/queries/0_stateless/01746_convert_type_with_default.reference index e5aa42e6116..0edea4de31e 100644 --- a/tests/queries/0_stateless/01746_convert_type_with_default.reference +++ b/tests/queries/0_stateless/01746_convert_type_with_default.reference @@ -40,7 +40,6 @@ 1970-01-20 1970-01-20 2149-06-06 -1970-01-02 2023-05-30 2023-05-30 2023-05-30 14:38:20 diff --git a/tests/queries/0_stateless/01746_convert_type_with_default.sql b/tests/queries/0_stateless/01746_convert_type_with_default.sql index e6e420ae4c0..c74b185f7fd 100644 --- a/tests/queries/0_stateless/01746_convert_type_with_default.sql +++ b/tests/queries/0_stateless/01746_convert_type_with_default.sql @@ -54,7 +54,7 @@ select toDateOrDefault(cast(19 as Int256)); select toDateOrDefault(cast(19 as UInt256)); select toDateOrDefault(65535); -select toDateOrDefault(122400); +-- select toDateOrDefault(122400); select toDateOrDefault(19507, '2000-01-01'::Date); select toDateOrDefault(-1, '2023-05-30'::Date); @@ -80,4 +80,4 @@ select toDateTimeOrDefault(cast(19 as Int128), 'UTC'); select toDateTimeOrDefault(cast(19 as UInt128), 'UTC'); select toDateTimeOrDefault(cast(19 as Int256), 'UTC'); -select toDateTimeOrDefault(cast(19 as UInt256), 'UTC'); \ No newline at end of file +select toDateTimeOrDefault(cast(19 as UInt256), 'UTC'); From 57cdd3a25d25ca8274e3b68cf75cbaa9bf94daa7 Mon Sep 17 00:00:00 2001 From: rfraposa Date: Tue, 13 Jun 2023 09:13:13 -0600 Subject: [PATCH 0827/1072] Update annindexes.md --- .../table-engines/mergetree-family/annindexes.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/en/engines/table-engines/mergetree-family/annindexes.md b/docs/en/engines/table-engines/mergetree-family/annindexes.md index 16e244077a7..fe971571419 100644 --- a/docs/en/engines/table-engines/mergetree-family/annindexes.md +++ b/docs/en/engines/table-engines/mergetree-family/annindexes.md @@ -54,7 +54,7 @@ CREATE TABLE table ( `id` Int64, `vectors` Array(Float32), - INDEX vectors TYPE () [GRANULARITY ] + INDEX [ann_index_name vectors TYPE [ann_index_type]([ann_index_parameters]) [GRANULARITY [N]] ) ENGINE = MergeTree ORDER BY id; @@ -67,7 +67,7 @@ CREATE TABLE table ( `id` Int64, `vectors` Tuple(Float32[, Float32[, ...]]), - INDEX vectors TYPE () [GRANULARITY ] + INDEX [ann_index_name] vectors TYPE [ann_index_type]([ann_index_parameters]) [GRANULARITY [N]] ) ENGINE = MergeTree ORDER BY id; @@ -114,7 +114,7 @@ without `LIMIT` clause cannot utilize ANN indexes. Also ANN indexes are only use approximate neighbor search. **Differences to Skip Indexes** Similar to regular [skip indexes](https://clickhouse.com/docs/en/optimize/skipping-indexes), ANN indexes are -constructed over granules and each indexed block consists of `GRANULARITY = `-many granules (`` = 1 by default for normal skip +constructed over granules and each indexed block consists of `GRANULARITY = [N]`-many granules (`[N]` = 1 by default for normal skip indexes). For example, if the primary index granularity of the table is 8192 (setting `index_granularity = 8192`) and `GRANULARITY = 2`, then each indexed block will contain 16384 rows. However, data structures and algorithms for approximate neighborhood search (usually provided by external libraries) are inherently row-oriented. They store a compact representation of a set of rows and also return rows for @@ -130,7 +130,7 @@ skip data at the granularity of index blocks. The `GRANULARITY` parameter determines how many ANN sub-indexes are created. Bigger `GRANULARITY` values mean fewer but larger ANN sub-indexes, up to the point where a column (or a column's data part) has only a single sub-index. In that case, the sub-index has a "global" view of all column rows and can directly return all granules of the column (part) with relevant rows (there are at most `LIMIT -`-many such granules). In a second step, ClickHouse will load these granules and identify the actually best rows by performing a +[N]`-many such granules). In a second step, ClickHouse will load these granules and identify the actually best rows by performing a brute-force distance calculation over all rows of the granules. With a small `GRANULARITY` value, each of the sub-indexes returns up to `LIMIT N`-many granules. As a result, more granules need to be loaded and post-filtered. Note that the search accuracy is with both cases equally good, only the processing performance differs. It is generally recommended to use a large `GRANULARITY` for ANN indexes and fall @@ -169,7 +169,7 @@ CREATE TABLE table ( id Int64, vectors Array(Float32), - INDEX vectors TYPE annoy([Distance[, NumTrees]]) [GRANULARITY N] + INDEX [ann_index_name] vectors TYPE annoy([Distance[, NumTrees]]) [GRANULARITY N] ) ENGINE = MergeTree ORDER BY id; @@ -182,7 +182,7 @@ CREATE TABLE table ( id Int64, vectors Tuple(Float32[, Float32[, ...]]), - INDEX vectors TYPE annoy([Distance[, NumTrees]]) [GRANULARITY N] + INDEX [ann_index_name] vectors TYPE annoy([Distance[, NumTrees]]) [GRANULARITY N] ) ENGINE = MergeTree ORDER BY id; From 20ea87e527eb76bb296a46f0deab59abdd4a4325 Mon Sep 17 00:00:00 2001 From: Dan Roscigno Date: Tue, 13 Jun 2023 11:17:33 -0400 Subject: [PATCH 0828/1072] Update annindexes.md Don't break code snippets across lines. --- docs/en/engines/table-engines/mergetree-family/annindexes.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/en/engines/table-engines/mergetree-family/annindexes.md b/docs/en/engines/table-engines/mergetree-family/annindexes.md index 16e244077a7..20e49f1c34c 100644 --- a/docs/en/engines/table-engines/mergetree-family/annindexes.md +++ b/docs/en/engines/table-engines/mergetree-family/annindexes.md @@ -129,8 +129,8 @@ skip data at the granularity of index blocks. The `GRANULARITY` parameter determines how many ANN sub-indexes are created. Bigger `GRANULARITY` values mean fewer but larger ANN sub-indexes, up to the point where a column (or a column's data part) has only a single sub-index. In that case, the sub-index has a -"global" view of all column rows and can directly return all granules of the column (part) with relevant rows (there are at most `LIMIT -`-many such granules). In a second step, ClickHouse will load these granules and identify the actually best rows by performing a +"global" view of all column rows and can directly return all granules of the column (part) with relevant rows (there are at most +`LIMIT `-many such granules). In a second step, ClickHouse will load these granules and identify the actually best rows by performing a brute-force distance calculation over all rows of the granules. With a small `GRANULARITY` value, each of the sub-indexes returns up to `LIMIT N`-many granules. As a result, more granules need to be loaded and post-filtered. Note that the search accuracy is with both cases equally good, only the processing performance differs. It is generally recommended to use a large `GRANULARITY` for ANN indexes and fall From b850f1b9995d981d7ec047df8261fec302d3020a Mon Sep 17 00:00:00 2001 From: DanRoscigno Date: Tue, 13 Jun 2023 11:26:12 -0400 Subject: [PATCH 0829/1072] fix broken line --- docs/en/engines/table-engines/mergetree-family/annindexes.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/en/engines/table-engines/mergetree-family/annindexes.md b/docs/en/engines/table-engines/mergetree-family/annindexes.md index fe971571419..71e7e008bf2 100644 --- a/docs/en/engines/table-engines/mergetree-family/annindexes.md +++ b/docs/en/engines/table-engines/mergetree-family/annindexes.md @@ -129,8 +129,7 @@ skip data at the granularity of index blocks. The `GRANULARITY` parameter determines how many ANN sub-indexes are created. Bigger `GRANULARITY` values mean fewer but larger ANN sub-indexes, up to the point where a column (or a column's data part) has only a single sub-index. In that case, the sub-index has a -"global" view of all column rows and can directly return all granules of the column (part) with relevant rows (there are at most `LIMIT -[N]`-many such granules). In a second step, ClickHouse will load these granules and identify the actually best rows by performing a +"global" view of all column rows and can directly return all granules of the column (part) with relevant rows (there are at most `LIMIT [N]`-many such granules). In a second step, ClickHouse will load these granules and identify the actually best rows by performing a brute-force distance calculation over all rows of the granules. With a small `GRANULARITY` value, each of the sub-indexes returns up to `LIMIT N`-many granules. As a result, more granules need to be loaded and post-filtered. Note that the search accuracy is with both cases equally good, only the processing performance differs. It is generally recommended to use a large `GRANULARITY` for ANN indexes and fall From 76f69f2b44ba23dbb0afd50f26dd3fd62352a381 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Tue, 13 Jun 2023 15:52:06 +0000 Subject: [PATCH 0830/1072] Revert overengineering --- src/Functions/FunctionSnowflake.h | 34 ++++++------------------------- 1 file changed, 6 insertions(+), 28 deletions(-) diff --git a/src/Functions/FunctionSnowflake.h b/src/Functions/FunctionSnowflake.h index c7ec6dca27f..b57e21e9a50 100644 --- a/src/Functions/FunctionSnowflake.h +++ b/src/Functions/FunctionSnowflake.h @@ -60,20 +60,9 @@ public: auto res_column = ColumnInt64::create(input_rows_count); auto & res_data = res_column->getData(); - if (const auto * src_column_non_const = typeid_cast(&src_column)) - { - const auto & src_data = src_column_non_const->getData(); - for (size_t i = 0; i < input_rows_count; ++i) - res_data[i] = (UInt32(src_data[i]) * 1000 - snowflake_epoch) << time_shift; - } - else if (const auto * src_column_const = typeid_cast(&src_column)) - { - UInt32 src_val = src_column_const->getValue(); - for (size_t i = 0; i < input_rows_count; ++i) - res_data[i] = (src_val * 1000 - snowflake_epoch) << time_shift; - } - else - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal argument for function {}", name); + const auto & src_data = typeid_cast(src_column).getData(); + for (size_t i = 0; i < input_rows_count; ++i) + res_data[i] = (Int64(src_data[i]) * 1000 - snowflake_epoch) << time_shift; return res_column; } @@ -172,20 +161,9 @@ public: auto res_column = ColumnInt64::create(input_rows_count); auto & res_data = res_column->getData(); - if (const auto * src_column_non_const = typeid_cast(&src_column)) - { - const auto & src_data = src_column_non_const->getData(); - for (size_t i = 0; i < input_rows_count; ++i) - res_data[i] = (UInt32(src_data[i]) * 1000 - snowflake_epoch) << time_shift; - } - else if (const auto * src_column_const = typeid_cast(&src_column)) - { - UInt32 src_val = src_column_const->getValue(); - for (size_t i = 0; i < input_rows_count; ++i) - res_data[i] = (src_val * 1000 - snowflake_epoch) << time_shift; - } - else - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal argument for function {}", name); + const auto & src_data = typeid_cast &>(src_column).getData(); + for (size_t i = 0; i < input_rows_count; ++i) + res_data[i] = (src_data[i] - snowflake_epoch) << time_shift; return res_column; } From 0bc624dc0291896001d45408e5316d23e28b3cc1 Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Tue, 13 Jun 2023 17:53:19 +0200 Subject: [PATCH 0831/1072] Fix the statless tests image for old commits --- docker/test/stateless/run.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/test/stateless/run.sh b/docker/test/stateless/run.sh index c0acb0291a4..21cb3168083 100755 --- a/docker/test/stateless/run.sh +++ b/docker/test/stateless/run.sh @@ -16,7 +16,7 @@ dpkg -i package_folder/clickhouse-client_*.deb ln -s /usr/share/clickhouse-test/clickhouse-test /usr/bin/clickhouse-test # shellcheck disable=SC1091 -source /usr/share/clickhouse-test/ci/attach_gdb.lib +source /usr/share/clickhouse-test/ci/attach_gdb.lib || true # FIXME: to not break old builds, clean on 2023-09-01 # install test configs /usr/share/clickhouse-test/config/install.sh @@ -88,7 +88,7 @@ fi sleep 5 -attach_gdb_to_clickhouse +attach_gdb_to_clickhouse || true # FIXME: to not break old builds, clean on 2023-09-01 function run_tests() { From a01056f67c787e069ca173cb63fafbfc5c6e5c96 Mon Sep 17 00:00:00 2001 From: santrancisco Date: Wed, 14 Jun 2023 02:33:48 +1000 Subject: [PATCH 0832/1072] Update orc submodule --- contrib/orc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/orc b/contrib/orc index c5d7755ba0b..f2c191f9653 160000 --- a/contrib/orc +++ b/contrib/orc @@ -1 +1 @@ -Subproject commit c5d7755ba0b9a95631c8daea4d094101f26ec761 +Subproject commit f2c191f9653a5ddbca016e024ca0fb61508f5eeb From 945981e5f69145908aef64819da58725ca8e67e4 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Tue, 13 Jun 2023 16:12:33 +0200 Subject: [PATCH 0833/1072] Fix tests sanity checks In #43147 the "tests" had been added to EXCLUDE_DIRS, and the reason for this is that there was some C++ code to ignore [1], however it also ignores snaity check for query_log. [1]: https://s3.amazonaws.com/clickhouse-test-reports/43147/63de577172ee024a08e76db69f5000568673db48/style_check.html v2: check-style: ignore $EXCLUDE_DIRS for some other sanity checks of tests Signed-off-by: Azat Khuzhin --- utils/check-style/check-style | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/utils/check-style/check-style b/utils/check-style/check-style index afaf2ee6d48..bd3ee8e02d6 100755 --- a/utils/check-style/check-style +++ b/utils/check-style/check-style @@ -13,7 +13,7 @@ # and then to run formatter only for the specified files. ROOT_PATH=$(git rev-parse --show-toplevel) -EXCLUDE_DIRS='build/|integration/|widechar_width/|glibc-compatibility/|poco/|memcpy/|consistent-hashing|benchmark|tests/|utils/keeper-bench/example.yaml' +EXCLUDE_DIRS='build/|integration/|widechar_width/|glibc-compatibility/|poco/|memcpy/|consistent-hashing|benchmark|tests/.*.cpp|utils/keeper-bench/example.yaml' # From [1]: # But since array_to_string_internal() in array.c still loops over array @@ -163,14 +163,12 @@ find $ROOT_PATH -not -path $ROOT_PATH'/contrib*' \( -name '*.yaml' -or -name '*. # Tests should not be named with "fail" in their names. It makes looking at the results less convenient. find $ROOT_PATH/tests/queries -iname '*fail*' | - grep -vP $EXCLUDE_DIRS | grep . && echo 'Tests should not be named with "fail" in their names. It makes looking at the results less convenient when you search for "fail" substring in browser.' # Queries to system.query_log/system.query_thread_log should have current_database = currentDatabase() condition # NOTE: it is not that accurate, but at least something. tests_with_query_log=( $( find $ROOT_PATH/tests/queries -iname '*.sql' -or -iname '*.sh' -or -iname '*.py' -or -iname '*.j2' | - grep -vP $EXCLUDE_DIRS | xargs grep --with-filename -e system.query_log -e system.query_thread_log | cut -d: -f1 | sort -u ) ) for test_case in "${tests_with_query_log[@]}"; do @@ -205,7 +203,6 @@ tables_with_database_column=( # NOTE: it is not that accuate, but at least something. tests_with_database_column=( $( find $ROOT_PATH/tests/queries -iname '*.sql' -or -iname '*.sh' -or -iname '*.py' -or -iname '*.j2' | - grep -vP $EXCLUDE_DIRS | xargs grep --with-filename $(printf -- "-e %s " "${tables_with_database_column[@]}") | grep -v -e ':--' -e ':#' | cut -d: -f1 | sort -u @@ -225,7 +222,6 @@ done # NOTE: it is not that accuate, but at least something. tests_with_replicated_merge_tree=( $( find $ROOT_PATH/tests/queries -iname '*.sql' -or -iname '*.sh' -or -iname '*.py' -or -iname '*.j2' | - grep -vP $EXCLUDE_DIRS | xargs grep --with-filename -e "Replicated.*MergeTree[ ]*(.*" | cut -d: -f1 | sort -u ) ) for test_case in "${tests_with_replicated_merge_tree[@]}"; do From dc6810601a65cf15a87459cdc72e5258d69949d2 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Tue, 13 Jun 2023 16:15:30 +0200 Subject: [PATCH 0834/1072] Remove DROP TABEL system.query_log from tests This is a very ugly hack that breaks artifacts, since after this query_log is incomplete in artifacts. Signed-off-by: Azat Khuzhin --- .../02494_query_cache_case_agnostic_matching.sql | 4 ++-- .../0_stateless/02494_query_cache_events.reference | 1 + tests/queries/0_stateless/02494_query_cache_events.sql | 9 +++++---- .../0_stateless/02494_query_cache_normalize_ast.sql | 4 ++-- .../02494_query_cache_passive_usage.reference | 1 + .../0_stateless/02494_query_cache_passive_usage.sql | 10 +++++----- 6 files changed, 16 insertions(+), 13 deletions(-) diff --git a/tests/queries/0_stateless/02494_query_cache_case_agnostic_matching.sql b/tests/queries/0_stateless/02494_query_cache_case_agnostic_matching.sql index 9f26d7759de..7dbd79059af 100644 --- a/tests/queries/0_stateless/02494_query_cache_case_agnostic_matching.sql +++ b/tests/queries/0_stateless/02494_query_cache_case_agnostic_matching.sql @@ -1,9 +1,8 @@ -- Tags: no-parallel -- Tag no-parallel: Messes with internal cache --- Start with empty query cache (QC) and query log +-- Start with empty query cache (QC) SYSTEM DROP QUERY CACHE; -DROP TABLE system.query_log SYNC; -- Insert an entry into the query cache. SELECT 1 SETTINGS use_query_cache = true; @@ -22,6 +21,7 @@ SYSTEM FLUSH LOGS; SELECT ProfileEvents['QueryCacheHits'], ProfileEvents['QueryCacheMisses'] FROM system.query_log WHERE type = 'QueryFinish' + AND current_database = currentDatabase() AND query = 'select 1 SETTINGS use_query_cache = true;'; SYSTEM DROP QUERY CACHE; diff --git a/tests/queries/0_stateless/02494_query_cache_events.reference b/tests/queries/0_stateless/02494_query_cache_events.reference index db60d3699e0..9bcd2820f27 100644 --- a/tests/queries/0_stateless/02494_query_cache_events.reference +++ b/tests/queries/0_stateless/02494_query_cache_events.reference @@ -3,4 +3,5 @@ 0 1 --- 1 +0 1 1 0 diff --git a/tests/queries/0_stateless/02494_query_cache_events.sql b/tests/queries/0_stateless/02494_query_cache_events.sql index 900b68f5eb2..05c0acad4b8 100644 --- a/tests/queries/0_stateless/02494_query_cache_events.sql +++ b/tests/queries/0_stateless/02494_query_cache_events.sql @@ -1,9 +1,8 @@ -- Tags: no-parallel -- Tag no-parallel: Messes with internal cache --- Start with empty query cache QC and query log +-- Start with empty query cache QC SYSTEM DROP QUERY CACHE; -DROP TABLE system.query_log SYNC; -- Run a query with QC on. The first execution is a QC miss. SELECT '---'; @@ -13,6 +12,7 @@ SYSTEM FLUSH LOGS; SELECT ProfileEvents['QueryCacheHits'], ProfileEvents['QueryCacheMisses'] FROM system.query_log WHERE type = 'QueryFinish' + AND current_database = currentDatabase() AND query = 'SELECT 1 SETTINGS use_query_cache = true;'; @@ -20,11 +20,12 @@ WHERE type = 'QueryFinish' SELECT '---'; SELECT 1 SETTINGS use_query_cache = true; -DROP TABLE system.query_log SYNC; SYSTEM FLUSH LOGS; SELECT ProfileEvents['QueryCacheHits'], ProfileEvents['QueryCacheMisses'] FROM system.query_log WHERE type = 'QueryFinish' - AND query = 'SELECT 1 SETTINGS use_query_cache = true;'; + AND current_database = currentDatabase() + AND query = 'SELECT 1 SETTINGS use_query_cache = true;' +ORDER BY event_time_microseconds; SYSTEM DROP QUERY CACHE; diff --git a/tests/queries/0_stateless/02494_query_cache_normalize_ast.sql b/tests/queries/0_stateless/02494_query_cache_normalize_ast.sql index 5fd09eb935b..1dbb3ef8158 100644 --- a/tests/queries/0_stateless/02494_query_cache_normalize_ast.sql +++ b/tests/queries/0_stateless/02494_query_cache_normalize_ast.sql @@ -1,9 +1,8 @@ -- Tags: no-parallel -- Tag no-parallel: Messes with internal cache --- Start with empty query cache (QC) and query log. +-- Start with empty query cache (QC) SYSTEM DROP QUERY CACHE; -DROP TABLE system.query_log SYNC; -- Run query whose result gets cached in the query cache. -- Besides "use_query_cache", pass two more knobs (one QC-specific knob and one non-QC-specific knob). We just care @@ -24,6 +23,7 @@ SYSTEM FLUSH LOGS; SELECT ProfileEvents['QueryCacheHits'], ProfileEvents['QueryCacheMisses'] FROM system.query_log WHERE type = 'QueryFinish' + AND current_database = currentDatabase() AND query = 'SELECT 1 SETTINGS use_query_cache = true, enable_writes_to_query_cache = false, max_threads = 16;'; SYSTEM DROP QUERY CACHE; diff --git a/tests/queries/0_stateless/02494_query_cache_passive_usage.reference b/tests/queries/0_stateless/02494_query_cache_passive_usage.reference index edff09773d1..8b73647196e 100644 --- a/tests/queries/0_stateless/02494_query_cache_passive_usage.reference +++ b/tests/queries/0_stateless/02494_query_cache_passive_usage.reference @@ -9,4 +9,5 @@ ----- 1 1 +0 1 1 0 diff --git a/tests/queries/0_stateless/02494_query_cache_passive_usage.sql b/tests/queries/0_stateless/02494_query_cache_passive_usage.sql index 6143b5f7083..f0d2f6398a8 100644 --- a/tests/queries/0_stateless/02494_query_cache_passive_usage.sql +++ b/tests/queries/0_stateless/02494_query_cache_passive_usage.sql @@ -22,10 +22,7 @@ SELECT COUNT(*) FROM system.query_cache; SELECT '-----'; --- Run same query with passive mode again. There must still be one entry in the QC and we must have a QC hit. - --- Get rid of log of previous SELECT -DROP TABLE system.query_log SYNC; +/* Run same query with passive mode again. There must still be one entry in the QC and we must have a QC hit. */ SELECT 1 SETTINGS use_query_cache = true, enable_writes_to_query_cache = false; SELECT COUNT(*) FROM system.query_cache; @@ -34,6 +31,9 @@ SYSTEM FLUSH LOGS; SELECT ProfileEvents['QueryCacheHits'], ProfileEvents['QueryCacheMisses'] FROM system.query_log WHERE type = 'QueryFinish' - AND query = 'SELECT 1 SETTINGS use_query_cache = true, enable_writes_to_query_cache = false;'; + AND current_database = currentDatabase() + /* NOTE: client incorrectly join comments from the previous line into query, hence LIKE */ + AND query LIKE '%\nSELECT 1 SETTINGS use_query_cache = true, enable_writes_to_query_cache = false;' +ORDER BY event_time_microseconds; SYSTEM DROP QUERY CACHE; From 35825bc7d672974fc9219ca7e3608a32d0cd73bc Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Tue, 13 Jun 2023 16:48:28 +0200 Subject: [PATCH 0835/1072] Increase line-length limit for yamlllint CI reports [1]: /ClickHouse/tests/queries/0_stateless/data_ua_parser/browser.yaml 713:301 warning line too long (328 > 300 characters) (line-length) /ClickHouse/tests/queries/0_stateless/data_ua_parser/device.yaml 2606:301 warning line too long (529 > 300 characters) (line-length) 2616:301 warning line too long (348 > 300 characters) (line-length) 2630:301 warning line too long (377 > 300 characters) (line-length) 2637:301 warning line too long (447 > 300 characters) (line-length) [1]: https://s3.amazonaws.com/clickhouse-test-reports/50934/be4555c3226298d956ff650fab477d67bf73ba83/style_check/style_output.txt Signed-off-by: Azat Khuzhin --- .yamllint | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.yamllint b/.yamllint index fe161e71849..9d6550ac960 100644 --- a/.yamllint +++ b/.yamllint @@ -6,8 +6,10 @@ rules: level: warning indent-sequences: consistent line-length: - # there are some bash -c "", so this is OK - max: 300 + # there are: + # - bash -c "", so this is OK + # - yaml in tests + max: 1000 level: warning comments: min-spaces-from-content: 1 From 77775c6074601e49d44349f3d397229354571592 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Tue, 13 Jun 2023 16:49:45 +0200 Subject: [PATCH 0836/1072] Rename 02701_fail_on_invalid_having to 02701_invalid_having_NOT_AN_AGGREGATE To remove "fail" from the test name, which is prohibited Signed-off-by: Azat Khuzhin --- ....reference => 02701_invalid_having_NOT_AN_AGGREGATE.reference} | 0 ...valid_having.sql => 02701_invalid_having_NOT_AN_AGGREGATE.sql} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename tests/queries/0_stateless/{02701_fail_on_invalid_having.reference => 02701_invalid_having_NOT_AN_AGGREGATE.reference} (100%) rename tests/queries/0_stateless/{02701_fail_on_invalid_having.sql => 02701_invalid_having_NOT_AN_AGGREGATE.sql} (100%) diff --git a/tests/queries/0_stateless/02701_fail_on_invalid_having.reference b/tests/queries/0_stateless/02701_invalid_having_NOT_AN_AGGREGATE.reference similarity index 100% rename from tests/queries/0_stateless/02701_fail_on_invalid_having.reference rename to tests/queries/0_stateless/02701_invalid_having_NOT_AN_AGGREGATE.reference diff --git a/tests/queries/0_stateless/02701_fail_on_invalid_having.sql b/tests/queries/0_stateless/02701_invalid_having_NOT_AN_AGGREGATE.sql similarity index 100% rename from tests/queries/0_stateless/02701_fail_on_invalid_having.sql rename to tests/queries/0_stateless/02701_invalid_having_NOT_AN_AGGREGATE.sql From 0444aa2fda48f59be08bef6482b06754f1cb2c0b Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Tue, 13 Jun 2023 16:50:43 +0200 Subject: [PATCH 0837/1072] tests: add missing current_database condition for query_log Signed-off-by: Azat Khuzhin --- tests/queries/0_stateless/02483_elapsed_time.sh | 2 +- .../queries/0_stateless/02499_monotonicity_toUnixTimestamp64.sh | 2 +- .../0_stateless/02521_incorrect_dealy_for_insert_bug_44902.sh | 2 +- .../queries/0_stateless/02578_parameterized_rename_queries.sql | 1 + tests/queries/0_stateless/02585_query_status_deadlock.sh | 2 +- tests/queries/0_stateless/02681_undrop_query.sql | 2 +- tests/queries/0_stateless/02761_ddl_initial_query_id.sh | 2 ++ .../02783_parallel_replicas_trivial_count_optimization.sh | 2 +- tests/queries/1_stateful/00177_memory_bound_merging.sh | 2 ++ 9 files changed, 11 insertions(+), 6 deletions(-) diff --git a/tests/queries/0_stateless/02483_elapsed_time.sh b/tests/queries/0_stateless/02483_elapsed_time.sh index 608299eb01b..e3b983129fb 100755 --- a/tests/queries/0_stateless/02483_elapsed_time.sh +++ b/tests/queries/0_stateless/02483_elapsed_time.sh @@ -25,7 +25,7 @@ QUERY_ID="${CLICKHOUSE_DATABASE}_$(date +%s)_02883_q1" ${CLICKHOUSE_CLIENT} -m --query "$EXCEPTION_BEFORE_START_QUERY" --query_id="$QUERY_ID" >/dev/null 2>&1 ${CLICKHOUSE_CLIENT} --query "SYSTEM FLUSH LOGS" -${CLICKHOUSE_CLIENT} --query "SELECT type == 'ExceptionBeforeStart' as expected_type, query_duration_ms <= 1000 as elapsed_more_than_one_second FROM system.query_log WHERE query_id='$QUERY_ID'" +${CLICKHOUSE_CLIENT} --query "SELECT type == 'ExceptionBeforeStart' as expected_type, query_duration_ms <= 1000 as elapsed_more_than_one_second FROM system.query_log WHERE current_database = '$CLICKHOUSE_DATABASE' AND query_id='$QUERY_ID'" # Now we test with a query that will take 1+ seconds. The CLI should show that as part of the output format OK_QUERY_JSON=" diff --git a/tests/queries/0_stateless/02499_monotonicity_toUnixTimestamp64.sh b/tests/queries/0_stateless/02499_monotonicity_toUnixTimestamp64.sh index 1223d7957b5..5d787aa0d8e 100755 --- a/tests/queries/0_stateless/02499_monotonicity_toUnixTimestamp64.sh +++ b/tests/queries/0_stateless/02499_monotonicity_toUnixTimestamp64.sh @@ -18,5 +18,5 @@ query_id="${CLICKHOUSE_DATABASE}_02499_$RANDOM$RANDOM" $CLICKHOUSE_CLIENT --query_id="$query_id" -q "select ts from t order by toUnixTimestamp64Nano(ts) limit 10 format Null settings max_block_size = $max_block_size, optimize_read_in_order = 1;" $CLICKHOUSE_CLIENT -q "system flush logs;" -$CLICKHOUSE_CLIENT --param_query_id="$query_id" -q "select read_rows <= $max_block_size from system.query_log where event_date >= yesterday() and query_id = {query_id:String} and type = 'QueryFinish';" +$CLICKHOUSE_CLIENT --param_query_id="$query_id" -q "select read_rows <= $max_block_size from system.query_log where event_date >= yesterday() and current_database = '$CLICKHOUSE_DATABASE' and query_id = {query_id:String} and type = 'QueryFinish';" diff --git a/tests/queries/0_stateless/02521_incorrect_dealy_for_insert_bug_44902.sh b/tests/queries/0_stateless/02521_incorrect_dealy_for_insert_bug_44902.sh index 5f91ef19a5a..0ae44ec0c01 100755 --- a/tests/queries/0_stateless/02521_incorrect_dealy_for_insert_bug_44902.sh +++ b/tests/queries/0_stateless/02521_incorrect_dealy_for_insert_bug_44902.sh @@ -16,7 +16,7 @@ do query_id="${CLICKHOUSE_DATABASE}_02521_${i}_$RANDOM$RANDOM" $CLICKHOUSE_CLIENT --query_id="$query_id" -q "INSERT INTO test_02521_insert_delay SELECT number, toString(number) FROM numbers(${i}, 1)" $CLICKHOUSE_CLIENT -q "SYSTEM FLUSH LOGS" - $CLICKHOUSE_CLIENT --param_query_id="$query_id" -q "select ProfileEvents['DelayedInsertsMilliseconds'] as delay from system.query_log where event_date >= yesterday() and query_id = {query_id:String} order by delay desc limit 1" + $CLICKHOUSE_CLIENT --param_query_id="$query_id" -q "select ProfileEvents['DelayedInsertsMilliseconds'] as delay from system.query_log where event_date >= yesterday() and current_database = '$CLICKHOUSE_DATABASE' and query_id = {query_id:String} order by delay desc limit 1" done $CLICKHOUSE_CLIENT -q "INSERT INTO test_02521_insert_delay VALUES(0, 'This query throws error')" 2>&1 | grep -o 'TOO_MANY_PARTS' | head -n 1 diff --git a/tests/queries/0_stateless/02578_parameterized_rename_queries.sql b/tests/queries/0_stateless/02578_parameterized_rename_queries.sql index eecb282083f..de36f8ae3b5 100644 --- a/tests/queries/0_stateless/02578_parameterized_rename_queries.sql +++ b/tests/queries/0_stateless/02578_parameterized_rename_queries.sql @@ -24,6 +24,7 @@ SET param_new_tbl_name = 02661_t1; CREATE TABLE {new_db_name:Identifier}.{old_tbl_name:Identifier} (a UInt64) ENGINE = MergeTree ORDER BY tuple(); RENAME TABLE {new_db_name:Identifier}.{old_tbl_name:Identifier} TO {new_db_name:Identifier}.{new_tbl_name:Identifier}; +-- NOTE: no 'database = currentDatabase()' on purpose SELECT name FROM system.tables WHERE name = {new_tbl_name:String}; -- Case 3: RENAME DICTIONARY diff --git a/tests/queries/0_stateless/02585_query_status_deadlock.sh b/tests/queries/0_stateless/02585_query_status_deadlock.sh index 227ecb1c1b2..9eb6eff8cd0 100755 --- a/tests/queries/0_stateless/02585_query_status_deadlock.sh +++ b/tests/queries/0_stateless/02585_query_status_deadlock.sh @@ -14,7 +14,7 @@ $CLICKHOUSE_CLIENT -q "SYSTEM FLUSH LOGS" while true do - res=$($CLICKHOUSE_CLIENT -q "select query, event_time from system.query_log where query_id = '$QUERY_ID' and query like 'select%' limit 1") + res=$($CLICKHOUSE_CLIENT -q "select query, event_time from system.query_log where query_id = '$QUERY_ID' and current_database = '$CLICKHOUSE_DATABASE' and query like 'select%' limit 1") if [ -n "$res" ]; then break fi diff --git a/tests/queries/0_stateless/02681_undrop_query.sql b/tests/queries/0_stateless/02681_undrop_query.sql index ead1a8bb305..39ca1548d53 100644 --- a/tests/queries/0_stateless/02681_undrop_query.sql +++ b/tests/queries/0_stateless/02681_undrop_query.sql @@ -21,7 +21,7 @@ detach table 02681_undrop_detach; undrop table 02681_undrop_detach; -- { serverError 57 } attach table 02681_undrop_detach; alter table 02681_undrop_detach update num = 2 where id = 1; -select command from system.mutations where table='02681_undrop_detach' limit 1; +select command from system.mutations where table='02681_undrop_detach' and database=currentDatabase() limit 1; drop table 02681_undrop_detach sync; select 'test MergeTree with cluster'; diff --git a/tests/queries/0_stateless/02761_ddl_initial_query_id.sh b/tests/queries/0_stateless/02761_ddl_initial_query_id.sh index e9a315b812b..b8b35ef01f7 100755 --- a/tests/queries/0_stateless/02761_ddl_initial_query_id.sh +++ b/tests/queries/0_stateless/02761_ddl_initial_query_id.sh @@ -21,4 +21,6 @@ $CLICKHOUSE_CLIENT -q "SYSTEM FLUSH LOGS" # - replace() is required to avoid non deterministic behaviour of # normalizeQuery() that replaces the identifier with "?" only if it has more # then two numbers. +# +# NOTE: no current_database = '$CLICKHOUSE_DATABASE' filter on purpose (since ON CLUSTER queries does not have current_database passed) $CLICKHOUSE_CLIENT -q "SELECT normalizeQuery(replace(query, currentDatabase(), 'default')) FROM system.query_log WHERE initial_query_id = '$query_id' AND type != 'QueryStart' ORDER BY event_time_microseconds" diff --git a/tests/queries/0_stateless/02783_parallel_replicas_trivial_count_optimization.sh b/tests/queries/0_stateless/02783_parallel_replicas_trivial_count_optimization.sh index 4c29e513183..6210ef2e8b6 100755 --- a/tests/queries/0_stateless/02783_parallel_replicas_trivial_count_optimization.sh +++ b/tests/queries/0_stateless/02783_parallel_replicas_trivial_count_optimization.sh @@ -12,7 +12,7 @@ function has_used_parallel_replicas () { sumIf(read_rows, is_initial_query) as read_rows, sumIf(read_bytes, is_initial_query) as read_bytes FROM system.query_log - WHERE event_date >= yesterday() and initial_query_id LIKE '$1%' + WHERE event_date >= yesterday() and initial_query_id LIKE '$1%' AND current_database = '$CLICKHOUSE_DATABASE' GROUP BY initial_query_id ORDER BY min(event_time_microseconds) ASC FORMAT TSV" diff --git a/tests/queries/1_stateful/00177_memory_bound_merging.sh b/tests/queries/1_stateful/00177_memory_bound_merging.sh index 008422be108..774f005b8eb 100755 --- a/tests/queries/1_stateful/00177_memory_bound_merging.sh +++ b/tests/queries/1_stateful/00177_memory_bound_merging.sh @@ -12,6 +12,8 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) check_replicas_read_in_order() { # to check this we actually look for at least one log message from MergeTreeInOrderSelectProcessor. # hopefully logger's names are a bit more stable than log messages itself + # + # NOTE: lack of "current_database = '$CLICKHOUSE_DATABASE'" filter is made on purpose $CLICKHOUSE_CLIENT -nq " SYSTEM FLUSH LOGS; From bb971fd7b7fd5bc400bbe28d16867c9dc337fb17 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Tue, 13 Jun 2023 17:02:24 +0200 Subject: [PATCH 0838/1072] check-style: allow {database} for ReplicatedMergeTree as well CLICKHOUSE_TEST_ZOOKEEPER_PREFIX is a {test_name}_{database}, but actually {database} should be enough, since it is uniq for each test run. Signed-off-by: Azat Khuzhin --- utils/check-style/check-style | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/check-style/check-style b/utils/check-style/check-style index bd3ee8e02d6..e7c06fefee2 100755 --- a/utils/check-style/check-style +++ b/utils/check-style/check-style @@ -229,7 +229,7 @@ for test_case in "${tests_with_replicated_merge_tree[@]}"; do *.gen.*) ;; *.sh) - test_case_zk_prefix="\$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX" + test_case_zk_prefix="\(\$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX\|{database}\)" grep -q -e "Replicated.*MergeTree[ ]*(.*$test_case_zk_prefix" "$test_case" || echo "Replicated.*MergeTree should contain '$test_case_zk_prefix' in zookeeper path to avoid overlaps ($test_case)" ;; *.sql|*.sql.j2) From 1bc5598aa77bb5a7dcffc26b090bb0f45cb83abb Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Tue, 13 Jun 2023 18:02:25 +0200 Subject: [PATCH 0839/1072] impl --- src/Disks/IO/AsynchronousBoundedReadBuffer.cpp | 5 ++--- src/Disks/IO/ReadBufferFromRemoteFSGather.h | 2 ++ src/IO/SeekableReadBuffer.h | 4 ++++ 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/Disks/IO/AsynchronousBoundedReadBuffer.cpp b/src/Disks/IO/AsynchronousBoundedReadBuffer.cpp index d75ec9f09e0..f9bd68222ae 100644 --- a/src/Disks/IO/AsynchronousBoundedReadBuffer.cpp +++ b/src/Disks/IO/AsynchronousBoundedReadBuffer.cpp @@ -301,9 +301,8 @@ off_t AsynchronousBoundedReadBuffer::seek(off_t offset, int whence) * Lazy ignore. Save number of bytes to ignore and ignore it either for prefetch buffer or current buffer. * Note: we read in range [file_offset_of_buffer_end, read_until_position). */ - if (file_offset_of_buffer_end && read_until_position && new_pos < *read_until_position - && new_pos > file_offset_of_buffer_end - && new_pos < file_offset_of_buffer_end + read_settings.remote_read_min_bytes_for_seek) + if (!impl->seekIsCheap() && file_offset_of_buffer_end && read_until_position && new_pos < *read_until_position + && new_pos > file_offset_of_buffer_end && new_pos < file_offset_of_buffer_end + read_settings.remote_read_min_bytes_for_seek) { ProfileEvents::increment(ProfileEvents::RemoteFSLazySeeks); bytes_to_ignore = new_pos - file_offset_of_buffer_end; diff --git a/src/Disks/IO/ReadBufferFromRemoteFSGather.h b/src/Disks/IO/ReadBufferFromRemoteFSGather.h index cb98ac6d9f3..272ed2b3ac1 100644 --- a/src/Disks/IO/ReadBufferFromRemoteFSGather.h +++ b/src/Disks/IO/ReadBufferFromRemoteFSGather.h @@ -50,6 +50,8 @@ public: off_t getPosition() override { return file_offset_of_buffer_end - available() + bytes_to_ignore; } + bool seekIsCheap() override { return !current_buf; } + private: SeekableReadBufferPtr createImplementationBuffer(const StoredObject & object); diff --git a/src/IO/SeekableReadBuffer.h b/src/IO/SeekableReadBuffer.h index 8ced9d752de..5770948be20 100644 --- a/src/IO/SeekableReadBuffer.h +++ b/src/IO/SeekableReadBuffer.h @@ -83,6 +83,10 @@ public: /// Checks if readBigAt() is allowed. May be slow, may throw (e.g. it may do an HTTP request or an fstat). virtual bool supportsReadAt() { return false; } + + /// We do some tricks to avoid seek cost. E.g we read more data and than ignore it (see remote_read_min_bytes_for_seek). + /// Sometimes however seek is basically free because underlying read buffer wasn't yet initialised (or re-initialised after reset). + virtual bool seekIsCheap() { return false; } }; From 994228ab209d3dae408c43e135e2391e4a6b5112 Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Tue, 13 Jun 2023 20:54:02 +0000 Subject: [PATCH 0840/1072] Uncomment flaky test --- .../0_stateless/01746_convert_type_with_default.reference | 1 + tests/queries/0_stateless/01746_convert_type_with_default.sql | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/01746_convert_type_with_default.reference b/tests/queries/0_stateless/01746_convert_type_with_default.reference index 0edea4de31e..e00156cd3c5 100644 --- a/tests/queries/0_stateless/01746_convert_type_with_default.reference +++ b/tests/queries/0_stateless/01746_convert_type_with_default.reference @@ -40,6 +40,7 @@ 1970-01-20 1970-01-20 2149-06-06 +1 2023-05-30 2023-05-30 2023-05-30 14:38:20 diff --git a/tests/queries/0_stateless/01746_convert_type_with_default.sql b/tests/queries/0_stateless/01746_convert_type_with_default.sql index c74b185f7fd..5ef7718784d 100644 --- a/tests/queries/0_stateless/01746_convert_type_with_default.sql +++ b/tests/queries/0_stateless/01746_convert_type_with_default.sql @@ -54,7 +54,7 @@ select toDateOrDefault(cast(19 as Int256)); select toDateOrDefault(cast(19 as UInt256)); select toDateOrDefault(65535); --- select toDateOrDefault(122400); +select toDateOrDefault(65536) in ('1970-01-01', '1970-01-02'); select toDateOrDefault(19507, '2000-01-01'::Date); select toDateOrDefault(-1, '2023-05-30'::Date); From a570b00bdf76118e53de34a3337a66683b55fcb7 Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Tue, 13 Jun 2023 19:19:34 -0300 Subject: [PATCH 0841/1072] Update README.md --- docker/server/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/server/README.md b/docker/server/README.md index 67646a262f5..6aec001064e 100644 --- a/docker/server/README.md +++ b/docker/server/README.md @@ -20,6 +20,7 @@ For more information and documentation see https://clickhouse.com/. - The amd64 image requires support for [SSE3 instructions](https://en.wikipedia.org/wiki/SSE3). Virtually all x86 CPUs after 2005 support SSE3. - The arm64 image requires support for the [ARMv8.2-A architecture](https://en.wikipedia.org/wiki/AArch64#ARMv8.2-A). Most ARM CPUs after 2017 support ARMv8.2-A. A notable exception is Raspberry Pi 4 from 2019 whose CPU only supports ARMv8.0-A. +- Since Clickhouse 23.3 Ubuntu image started to use `ubuntu:22.04` as a base image it requiers docker version >= `20.10.10`, overwise use `docker run --privileged`. ## How to use this image From d8d570081031f4851c33737618d907d69a3b14e8 Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Tue, 13 Jun 2023 19:24:36 -0300 Subject: [PATCH 0842/1072] Update README.md --- docker/server/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/server/README.md b/docker/server/README.md index 6aec001064e..0c89f834fcd 100644 --- a/docker/server/README.md +++ b/docker/server/README.md @@ -20,7 +20,7 @@ For more information and documentation see https://clickhouse.com/. - The amd64 image requires support for [SSE3 instructions](https://en.wikipedia.org/wiki/SSE3). Virtually all x86 CPUs after 2005 support SSE3. - The arm64 image requires support for the [ARMv8.2-A architecture](https://en.wikipedia.org/wiki/AArch64#ARMv8.2-A). Most ARM CPUs after 2017 support ARMv8.2-A. A notable exception is Raspberry Pi 4 from 2019 whose CPU only supports ARMv8.0-A. -- Since Clickhouse 23.3 Ubuntu image started to use `ubuntu:22.04` as a base image it requiers docker version >= `20.10.10`, overwise use `docker run --privileged`. +- Since Clickhouse 23.3 Ubuntu image started to use `ubuntu:22.04` as a base image it requiers docker version >= `20.10.10`, overwise use `docker run --privileged`. Alternativly try Clickhouse Alpine image. ## How to use this image From 404a52432c59a4b833d093e2e344916f4cb62de5 Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Tue, 13 Jun 2023 19:39:23 -0300 Subject: [PATCH 0843/1072] Update README.md --- docker/server/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/server/README.md b/docker/server/README.md index 0c89f834fcd..18dce492123 100644 --- a/docker/server/README.md +++ b/docker/server/README.md @@ -20,7 +20,7 @@ For more information and documentation see https://clickhouse.com/. - The amd64 image requires support for [SSE3 instructions](https://en.wikipedia.org/wiki/SSE3). Virtually all x86 CPUs after 2005 support SSE3. - The arm64 image requires support for the [ARMv8.2-A architecture](https://en.wikipedia.org/wiki/AArch64#ARMv8.2-A). Most ARM CPUs after 2017 support ARMv8.2-A. A notable exception is Raspberry Pi 4 from 2019 whose CPU only supports ARMv8.0-A. -- Since Clickhouse 23.3 Ubuntu image started to use `ubuntu:22.04` as a base image it requiers docker version >= `20.10.10`, overwise use `docker run --privileged`. Alternativly try Clickhouse Alpine image. +- Since the Clickhouse 23.3 Ubuntu image started using `ubuntu:22.04` as its base image, it requires docker version >= `20.10.10`, or use `docker run -- privileged` instead. Alternatively, try the Clickhouse Alpine image. ## How to use this image From 86075dbae49f8acda2819fc6b8640631ccf1f3fe Mon Sep 17 00:00:00 2001 From: Rich Raposa Date: Tue, 13 Jun 2023 16:40:14 -0600 Subject: [PATCH 0844/1072] Update azureBlobStorage.md --- docs/en/engines/table-engines/integrations/azureBlobStorage.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/engines/table-engines/integrations/azureBlobStorage.md b/docs/en/engines/table-engines/integrations/azureBlobStorage.md index b8e621fd513..14fbf0c068e 100644 --- a/docs/en/engines/table-engines/integrations/azureBlobStorage.md +++ b/docs/en/engines/table-engines/integrations/azureBlobStorage.md @@ -48,4 +48,4 @@ SELECT * FROM test_table; ## See also -[Azure Blob Storage Table Function](/docs/en/sql-reference/table-functions/azureBlobStorage.md) +[Azure Blob Storage Table Function](/docs/en/sql-reference/table-functions/azureBlobStorage) From 49082bfe8919a4f7113e96eccde94c4fa2d74017 Mon Sep 17 00:00:00 2001 From: JackyWoo Date: Wed, 14 Jun 2023 09:00:50 +0800 Subject: [PATCH 0845/1072] fix typos in redis.md --- docs/en/engines/table-engines/integrations/redis.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/en/engines/table-engines/integrations/redis.md b/docs/en/engines/table-engines/integrations/redis.md index 68235a89d33..a78942ab7bb 100644 --- a/docs/en/engines/table-engines/integrations/redis.md +++ b/docs/en/engines/table-engines/integrations/redis.md @@ -114,6 +114,6 @@ TRUNCATE TABLE redis_table SYNC; ## Limitations {#limitations} -Redis engine also support scanning query, such as `where k > xx`, but it has some limitations: -1. Scanning query may produce some duplicated keys in a very rare case when it is rehashing, details see [Redis Scan](https://github.com/redis/redis/blob/e4d183afd33e0b2e6e8d1c79a832f678a04a7886/src/dict.c#L1186-L1269) -2. During the scanning keys could be created and deleted, so the resulting dataset can not represent a valid point in time. +Redis engine also supports scanning queries, such as `where k > xx`, but it has some limitations: +1. Scanning query may produce some duplicated keys in a very rare case when it is rehashing. See details in [Redis Scan](https://github.com/redis/redis/blob/e4d183afd33e0b2e6e8d1c79a832f678a04a7886/src/dict.c#L1186-L1269) +2. During the scanning, keys could be created and deleted, so the resulting dataset can not represent a valid point in time. From c05bcf56058597e180bfb65532e76dbe6c1639da Mon Sep 17 00:00:00 2001 From: pufit Date: Tue, 13 Jun 2023 21:09:30 -0400 Subject: [PATCH 0846/1072] Fix keeper-client help message --- programs/keeper-client/KeeperClient.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/programs/keeper-client/KeeperClient.cpp b/programs/keeper-client/KeeperClient.cpp index f38da1b72aa..f41dca1e27a 100644 --- a/programs/keeper-client/KeeperClient.cpp +++ b/programs/keeper-client/KeeperClient.cpp @@ -127,42 +127,42 @@ void KeeperClient::defineOptions(Poco::Util::OptionSet & options) options.addOption( Poco::Util::Option("host", "h", "server hostname. default `localhost`") - .argument("host") + .argument("") .binding("host")); options.addOption( Poco::Util::Option("port", "p", "server port. default `2181`") - .argument("port") + .argument("") .binding("port")); options.addOption( Poco::Util::Option("query", "q", "will execute given query, then exit.") - .argument("query") + .argument("") .binding("query")); options.addOption( Poco::Util::Option("connection-timeout", "", "set connection timeout in seconds. default 10s.") - .argument("connection-timeout") + .argument("") .binding("connection-timeout")); options.addOption( Poco::Util::Option("session-timeout", "", "set session timeout in seconds. default 10s.") - .argument("session-timeout") + .argument("") .binding("session-timeout")); options.addOption( Poco::Util::Option("operation-timeout", "", "set operation timeout in seconds. default 10s.") - .argument("operation-timeout") + .argument("") .binding("operation-timeout")); options.addOption( Poco::Util::Option("history-file", "", "set path of history file. default `~/.keeper-client-history`") - .argument("history-file") + .argument("") .binding("history-file")); options.addOption( Poco::Util::Option("log-level", "", "set log level") - .argument("log-level") + .argument("") .binding("log-level")); } From f1b5d47ce27c053e76722d7827c49fb4aa78ac0e Mon Sep 17 00:00:00 2001 From: Alexey Gerasimchuck Date: Wed, 14 Jun 2023 01:15:45 +0000 Subject: [PATCH 0847/1072] corrections after second review iteration --- docs/en/interfaces/cli.md | 37 ++++++++----------- docs/ru/interfaces/cli.md | 32 +++++++--------- programs/client/Client.cpp | 2 +- src/Client/ConnectionString.cpp | 31 ++++++++-------- src/Client/ConnectionString.h | 12 +++--- .../0_stateless/02784_connection_string.sh | 6 +-- 6 files changed, 54 insertions(+), 66 deletions(-) diff --git a/docs/en/interfaces/cli.md b/docs/en/interfaces/cli.md index b5134ea30c0..e2c7dc1e608 100644 --- a/docs/en/interfaces/cli.md +++ b/docs/en/interfaces/cli.md @@ -194,7 +194,7 @@ You can pass parameters to `clickhouse-client` (all parameters have a default va - `--print-profile-events` – Print `ProfileEvents` packets. - `--profile-events-delay-ms` – Delay between printing `ProfileEvents` packets (-1 - print only totals, 0 - print every single packet). -Instead of --host, --port, --user and --password options, ClickHouse client also supports connection strings. +Instead of `--host`, `--port`, `--user` and `--password` options, ClickHouse client also supports connection strings (see next section). ## Connection string {#connection_string} @@ -213,28 +213,27 @@ Where - `database` - (optional) is the database name, - `query_parameters` - (optional) is a list of key-value pairs `param1=value1[,¶m2=value2], ...`. For some parameters, no value is required. Parameter names and values are case-sensitive. +If no user is specified, `default` user without password will be used. +If no host is specified, the `localhost` will be used (localhost). +If no port is specified is not specified, `9000` will be used as port. +If no database is specified, the `default` database will be used. +If the user name, password or database was specified in the connection string, it cannot be specified using `--user`, `--password` or `--database` (and vice versa). -The host component can either be an IP address or a host name. Put an IPv6 address in square brackets to specify it: +The host component can either be an a host name and IP address. Put an IPv6 address in square brackets to specify it: ```text clickhouse://[2001:db8::1234] ``` -If user is not specified, `default` user without password will be used. -If host is not specified, the `localhost` will be used (localhost). -If port is not specified, `9000` will be used as port. -If database is not specified, the `default` database will be used. - -If the user name, password or database was specified in the connection string, it cannot be specified using `--user`, `--password` or `--database` (and vice versa). - -The connection string must be specified in the first argument of clickhouse-client. The connection string can be combined with other [command-line-options](#command-line-options) except `--host(h)` and `--port`. - -### Multiple hosts {#connection_string_multiple_hosts} - URI allows multiple hosts to be connected to. Connection strings can contain multiple hosts. ClickHouse-client will try to connect to these hosts in order (i.e. from left to right). After the connection is established, no attempt to connect to the remaining hosts is made. -### Allowed query_parameters keys {#connection_string_query_parameters} + + + +The connection string must be specified as the first argument of clickhouse-client. The connection string can be combined with arbitrary other [command-line-options](#command-line-options) except `--host/-h` and `--port`. + +The following keys are allowed for component `query_parameter`: - `secure` or shorthanded `s` - no value. If specified, client will connect to the server over a secure connection (TLS). See `secure` in [command-line-options](#command-line-options) @@ -244,7 +243,7 @@ Non-US ASCII, spaces and special characters in the `user`, `password`, `hosts`, ### Examples {#connection_string_examples} -Connect to localhost using port 9000 and execute the query "SELECT 1". +Connect to localhost using port 9000 and execute the query `SELECT 1`. ``` bash clickhouse-client clickhouse://localhost:9000 --query "SELECT 1" @@ -262,12 +261,6 @@ Connect to localhost using default user, host with IPV6 address `[::1]` and port clickhouse-client clickhouse://[::1]:9000 ``` -Connect to localhost using default user, host with IPV6 address `[2001:db8:3333:4444:5555:6666:7777:8888]` and port `9000`. - -``` bash -clickhouse-client clickhouse://[2001:db8:3333:4444:5555:6666:7777:8888]:9000 -``` - Connect to localhost using port 9000 in multiline mode. ``` bash @@ -277,7 +270,7 @@ clickhouse-client clickhouse://localhost:9000 '-m' Connect to localhost using port 9000 with the user `default`. ``` bash -clickhouse-client clickhouse://default@localhost:9000 --user default +clickhouse-client clickhouse://default@localhost:9000 # equivalent to: clickhouse-client clickhouse://localhost:9000 --user default diff --git a/docs/ru/interfaces/cli.md b/docs/ru/interfaces/cli.md index 794ac60ec83..aa6ae3629e8 100644 --- a/docs/ru/interfaces/cli.md +++ b/docs/ru/interfaces/cli.md @@ -142,6 +142,8 @@ $ clickhouse-client --param_tbl="numbers" --param_db="system" --param_col="numbe - `--history_file` - путь к файлу с историей команд. - `--param_` — значение параметра для [запроса с параметрами](#cli-queries-with-parameters). +Вместо параметров `--host`, `--port`, `--user` и `--password` клиент ClickHouse также поддерживает строки подключения (смотри следующий раздел). + ## Строка подключения {#connection_string} clickhouse-client также поддерживает подключение к серверу clickhouse с помощью строки подключения, аналогичной [MongoDB](https://www.mongodb.com/docs/manual/reference/connection-string/), [PostgreSQL](https://www.postgresql.org/docs/current/libpq-connect.html#LIBPQ-CONNSTRING), [MySQL](https://dev.mysql.com/doc/refman/8.0/en/connecting-using-uri-or-key-value-pairs.html#connecting-using-uri). Она имеет следующий синтаксис: @@ -154,15 +156,9 @@ clickhouse:[//[user[:password]@][hosts_and_ports]][/database][?query_parameters] - `user` - (необязательно) - это имя пользователя, - `password` - (необязательно) - Пароль пользователя. Если символ `:` укаказан, и пароль пуст, то клиент запросит ввести пользователя пароль. -- `hostspec` - (необязательно) - список хостов и необязательных портов. `host[:port] [, host:[port]], ...`, +- `hosts_and_ports` - (необязательно) - список хостов и необязательных портов. `host[:port] [, host:[port]], ...`, - `database` - (необязательно) - это имя базы данных, -- `paramspec` - (опционально) список пар ключ-значение `param1=value1[,¶m2=value2], ...`. Для некоторых параметров значение не требуется. Имена и значения параметров чувствительны к регистру. - -Параметр host может быть либо IP-адресом, либо именем хоста. Для указания IPv6-адреса поместите его в квадратные скобки: - -```text -clickhouse://[2001:db8::1234] -``` +- `query_parameters` - (опционально) список пар ключ-значение `param1=value1[,¶m2=value2], ...`. Для некоторых параметров значение не требуется. Имена и значения параметров чувствительны к регистру. Если user не указан, будут использоваться имя пользователя `default`. Если host не указан, будет использован хост `localhost`. @@ -171,13 +167,19 @@ clickhouse://[2001:db8::1234] Если имя пользователя, пароль или база данных были указаны в строке подключения, их нельзя указать с помощью `--user`, `--password` или `--database` (и наоборот). -Строка подключения должна быть указана в первом аргументе clickhouse-client. Строка подключения может комбинироваться с другими [параметрами командной строки] (#command-line-options) кроме `--host (h)` и `--port`. +Параметр host может быть либо именем хоста, либо IP-адресом. Для указания IPv6-адреса поместите его в квадратные скобки: -### Несколько хостов {#connection_string_multiple_hosts} +```text +clickhouse://[2001:db8::1234] +``` URI позволяет подключаться к нескольким хостам. Строки подключения могут содержать несколько хостов. ClickHouse-client будет пытаться подключиться к этим хостам по порядку (т.е. слева направо). После установления соединения попытки подключения к оставшимся хостам не предпринимаются. -### Допустимые ключи query_parameters {#connection_string_query_parameters} + + +Строка подключения должна быть указана в первом аргументе clickhouse-client. Строка подключения может комбинироваться с другими [параметрами командной строки] (#command-line-options) кроме `--host/-h` и `--port`. + +Для компонента `query_parameter` разрешены следующие ключи: - `secure` или сокращенно `s` - без значение. Если параметр указан, то соединение с сервером будет осуществляться по защищенному каналу (TLS). См. `secure` в [command-line-options](#command-line-options). @@ -187,7 +189,7 @@ URI позволяет подключаться к нескольким хост ### Примеры {#connection_string_examples} -Подключиться к localhost через порт 9000 и выполнить запрос "SELECT 1" +Подключиться к localhost через порт 9000 и выполнить запрос `SELECT 1` ``` bash clickhouse-client clickhouse://localhost:9000 --query "SELECT 1" @@ -204,12 +206,6 @@ clickhouse-client clickhouse://john:secret@127.0.0.1:9000 clickhouse-client clickhouse://[::1]:9000 ``` -Подключиться к localhost, используя пользователя по умолчанию, хост с IPV6 адресом `[2001:db8:3333:4444:5555:6666:7777:8888]` и портом `9000`. - -`` bash -clickhouse-client clickhouse://[2001:db8:3333:4444:5555:6666:7777:8888]:9000 -``` - Подключиться к localhost через порт 9000 многострочном режиме. ``` bash diff --git a/programs/client/Client.cpp b/programs/client/Client.cpp index a49447dff69..6c3df3520e9 100644 --- a/programs/client/Client.cpp +++ b/programs/client/Client.cpp @@ -1268,7 +1268,7 @@ void Client::readArguments( std::string_view arg = argv[arg_num]; if (has_connection_string) - validateConnectionStringClientOption(arg); + checkIfCmdLineOptionCanBeUsedWithConnectionString(arg); if (arg == "--external") { diff --git a/src/Client/ConnectionString.cpp b/src/Client/ConnectionString.cpp index 8f0a0980f51..62090487490 100644 --- a/src/Client/ConnectionString.cpp +++ b/src/Client/ConnectionString.cpp @@ -45,26 +45,26 @@ std::string uriDecode(const std::string & uri_encoded_string, bool plus_as_space void getHostAndPort(const Poco::URI & uri, std::vector> & hosts_and_ports_arguments) { std::vector host_and_port; - auto host = uri.getHost(); + const auto& host = uri.getHost(); if (!host.empty()) { - host_and_port.push_back("--host="s + uriDecode(host, false)); + host_and_port.push_back("--host=" + uriDecode(host, false)); } // Port can be written without host (":9000"). Empty host name equals to default host. auto port = uri.getPort(); if (port != 0) - host_and_port.push_back("--port="s + std::to_string(port)); + host_and_port.push_back("--port=" + std::to_string(port)); if (!host_and_port.empty()) hosts_and_ports_arguments.push_back(std::move(host_and_port)); } void buildConnectionString( - Poco::URI & uri, - std::vector> & hosts_and_ports_arguments, std::string_view host_and_port, - std::string_view right_part) + std::string_view right_part, + Poco::URI & uri, + std::vector> & hosts_and_ports_arguments) { // User info does not matter in sub URI auto uri_string = std::string(CONNECTION_URI_SCHEME); @@ -154,7 +154,7 @@ bool tryParseConnectionString( { if (*it == ',') { - buildConnectionString(uri, hosts_and_ports_arguments, {last_host_begin, it}, {hosts_end, connection_string.end()}); + buildConnectionString({last_host_begin, it}, {hosts_end, connection_string.end()}, uri, hosts_and_ports_arguments); last_host_begin = it + 1; } } @@ -166,7 +166,7 @@ bool tryParseConnectionString( getHostAndPort(uri, hosts_and_ports_arguments); } else - buildConnectionString(uri, hosts_and_ports_arguments, {last_host_begin, hosts_end}, {hosts_end, connection_string.end()}); + buildConnectionString({last_host_begin, hosts_end}, {hosts_end, connection_string.end()}, uri, hosts_and_ports_arguments); Poco::URI::QueryParameters params = uri.getQueryParameters(); for (const auto & param : params) @@ -174,12 +174,12 @@ bool tryParseConnectionString( if (param.first == "secure" || param.first == "s") { if (!param.second.empty()) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "secure URI query parameter does not require value"); + throw Exception(ErrorCodes::BAD_ARGUMENTS, "secure URI query parameter does not allow value"); common_arguments.push_back(makeArgument(param.first)); } else - throw Exception(ErrorCodes::BAD_ARGUMENTS, "URI query parameter {} is unknown", param.first); + throw Exception(ErrorCodes::BAD_ARGUMENTS, "URI query parameter {} is not supported", param.first); } auto user_info = uri.getUserInfo(); @@ -188,7 +188,7 @@ bool tryParseConnectionString( // Poco::URI doesn't decode user name/password by default. // But ClickHouse allows to have users with email user name like: 'john@some_mail.com' // john@some_mail.com should be percent-encoded: 'john%40some_mail.com' - std::string::size_type pos = user_info.find(':'); + size_t pos = user_info.find(':'); if (pos != std::string::npos) { common_arguments.push_back("--user"); @@ -229,12 +229,11 @@ bool tryParseConnectionString( return true; } -void validateConnectionStringClientOption(std::string_view command_line_option) +void checkIfCmdLineOptionCanBeUsedWithConnectionString(std::string_view command_line_option) { - const auto prohibited_option_iter = PROHIBITED_CLIENT_OPTIONS.find(command_line_option); - if (prohibited_option_iter != PROHIBITED_CLIENT_OPTIONS.end()) - throw Exception( - ErrorCodes::BAD_ARGUMENTS, "Mixing a connection string and {} option is prohibited", prohibited_option_iter->second); + if (PROHIBITED_CLIENT_OPTIONS.contains(command_line_option)) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Mixing a connection string and {} option is prohibited", PROHIBITED_CLIENT_OPTIONS.at(command_line_option)); } } diff --git a/src/Client/ConnectionString.h b/src/Client/ConnectionString.h index ce72de9edf6..ad63e9cda3d 100644 --- a/src/Client/ConnectionString.h +++ b/src/Client/ConnectionString.h @@ -9,19 +9,19 @@ namespace DB /** Tries to parse ClickHouse connection string. * if @connection_string starts with 'clickhouse:' then connection string will be parsed * and converted into a set of arguments for the client. - * Connection string format is similar to URI "clickhouse:[//[user_info@][hosts_and_ports]][/dbname][?query_parameters]" + * Connection string format is similar to URI "clickhouse:[//[user[:password]@][hosts_and_ports]][/dbname][?query_parameters]" * with the difference that hosts_and_ports can contain multiple hosts separated by ','. * example: clickhouse://user@host1:port1,host2:port2 - * @return returns true if there is a URI, false otherwise. - * @exception throws DB::Exception if URI has valid scheme (clickhouse:), but invalid internals. + * @return Returns false if no connection string was specified. If a connection string was specified, returns true if it is valid, and throws an exception if it is invalid. + * @exception Throws DB::Exception if URI has valid scheme (clickhouse:), but invalid internals. */ bool tryParseConnectionString( std::string_view connection_string, std::vector & common_arguments, std::vector> & hosts_and_ports_arguments); -// throws DB::Exception with BAD_ARGUMENTS if the given command line argument is allowed -// to be used with the connection string -void validateConnectionStringClientOption(std::string_view command_line_option); +// Throws DB::Exception with BAD_ARGUMENTS if the given command line argument +// is not allowed to be used with a connection string. +void checkIfCmdLineOptionCanBeUsedWithConnectionString(std::string_view command_line_option); } diff --git a/tests/queries/0_stateless/02784_connection_string.sh b/tests/queries/0_stateless/02784_connection_string.sh index 042f5b2108d..8353ac5b1e4 100755 --- a/tests/queries/0_stateless/02784_connection_string.sh +++ b/tests/queries/0_stateless/02784_connection_string.sh @@ -99,7 +99,6 @@ runClient "click_house:" 2>&1 | grep -o 'BAD_ARGUMENTS' TEST_INDEX=1000087 # Using connection string prohibits to use --host and --port options runClient "clickhouse://default:@$CLICKHOUSE_HOST/" --host "$CLICKHOUSE_HOST" 2>&1 | grep -o 'BAD_ARGUMENTS' -runClient "clickhouse://default:@$CLICKHOUSE_HOST/" --host "$CLICKHOUSE_HOST" 2>&1 | grep -o 'BAD_ARGUMENTS' runClient "clickhouse://default:@$CLICKHOUSE_HOST:$CLICKHOUSE_PORT_TCP/" --host "$CLICKHOUSE_HOST" 2>&1 | grep -o 'BAD_ARGUMENTS' runClient "clickhouse://default:@$CLICKHOUSE_HOST:$CLICKHOUSE_PORT_TCP/" --port "$CLICKHOUSE_PORT_TCP" 2>&1 | grep -o 'BAD_ARGUMENTS' runClient "clickhouse://default:@$CLICKHOUSE_HOST:$CLICKHOUSE_PORT_TCP/" --host "$CLICKHOUSE_HOST" 2>&1 | grep -o 'BAD_ARGUMENTS' @@ -109,6 +108,7 @@ runClient "clickhouse://default:@$CLICKHOUSE_HOST/" --port "$CLICKHOUSE_PORT_TCP runClient "clickhouse://$CLICKHOUSE_HOST/" --port "$CLICKHOUSE_PORT_TCP" 2>&1 | grep -o 'BAD_ARGUMENTS' runClient "clickhouse://:@$CLICKHOUSE_HOST/" --port "$CLICKHOUSE_PORT_TCP" 2>&1 | grep -o 'BAD_ARGUMENTS' runClient "clickhouse://$CLICKHOUSE_HOST/" --port "$CLICKHOUSE_PORT_TCP" 2>&1 | grep -o 'BAD_ARGUMENTS' +runClient "clickhouse://" --host "$CLICKHOUSE_HOST" 2>&1 | grep -o 'BAD_ARGUMENTS' runClient "clickhouse:" --port "$CLICKHOUSE_PORT_TCP" --host "$CLICKHOUSE_HOST" 2>&1 | grep -o 'BAD_ARGUMENTS' runClient "clickhouse://" --port "$CLICKHOUSE_PORT_TCP" --host "$CLICKHOUSE_HOST" 2>&1 | grep -o 'BAD_ARGUMENTS' runClient "clickhouse:///" --port "$CLICKHOUSE_PORT_TCP" --host "$CLICKHOUSE_HOST" 2>&1 | grep -o 'BAD_ARGUMENTS' @@ -130,9 +130,9 @@ runClient "clickhouse://host1/ database:" 2>&1 | grep -o 'BAD_ARGUMENTS' runClient "clickhouse://user :password@host1" 2>&1 | grep -o 'BAD_ARGUMENTS' runClient "clickhouse://user: password@host1" 2>&1 | grep -o 'BAD_ARGUMENTS' -# Query is not first argument +# Connection string is not first argument runClient --multiline "clickhouse://default:@$CLICKHOUSE_HOST/" 2>&1 | grep -o 'BAD_ARGUMENTS' -# Query used as the first and the second argument of client +# Connection string used as the first and the second argument of client runClient "clickhouse://default:@$CLICKHOUSE_HOST/" "clickhouse://default:@$CLICKHOUSE_HOST/" 2>&1 | grep -o 'BAD_ARGUMENTS' # Invalid hosts From 29b9cba75c18e23f9ee2eb589e5a69e7f46a5054 Mon Sep 17 00:00:00 2001 From: santrancisco Date: Wed, 14 Jun 2023 11:31:09 +1000 Subject: [PATCH 0848/1072] Update CMakeLists.txt with help from Nikita --- contrib/arrow-cmake/CMakeLists.txt | 68 +++++++++++++++++++++++------- 1 file changed, 52 insertions(+), 16 deletions(-) diff --git a/contrib/arrow-cmake/CMakeLists.txt b/contrib/arrow-cmake/CMakeLists.txt index 16198887075..5fe942d1cd0 100644 --- a/contrib/arrow-cmake/CMakeLists.txt +++ b/contrib/arrow-cmake/CMakeLists.txt @@ -116,43 +116,79 @@ configure_file("${ORC_SOURCE_SRC_DIR}/Adaptor.hh.in" "${ORC_BUILD_INCLUDE_DIR}/A # ARROW_ORC + adapters/orc/CMakefiles set(ORC_SRCS "${CMAKE_CURRENT_BINARY_DIR}/orc_proto.pb.h" - "${ORC_SOURCE_SRC_DIR}/sargs/ExpressionTree.cc" - "${ORC_SOURCE_SRC_DIR}/sargs/Literal.cc" - "${ORC_SOURCE_SRC_DIR}/sargs/PredicateLeaf.cc" - "${ORC_SOURCE_SRC_DIR}/sargs/SargsApplier.cc" - "${ORC_SOURCE_SRC_DIR}/sargs/SearchArgument.cc" - "${ORC_SOURCE_SRC_DIR}/sargs/TruthValue.cc" - "${ORC_SOURCE_SRC_DIR}/Exceptions.cc" - "${ORC_SOURCE_SRC_DIR}/OrcFile.cc" - "${ORC_SOURCE_SRC_DIR}/Reader.cc" + "${ORC_ADDITION_SOURCE_DIR}/orc_proto.pb.cc" + "${ORC_SOURCE_SRC_DIR}/Adaptor.cc" + "${ORC_SOURCE_SRC_DIR}/Adaptor.hh.in" + "${ORC_SOURCE_SRC_DIR}/BlockBuffer.cc" + "${ORC_SOURCE_SRC_DIR}/BlockBuffer.hh" + "${ORC_SOURCE_SRC_DIR}/BloomFilter.cc" + "${ORC_SOURCE_SRC_DIR}/BloomFilter.hh" + "${ORC_SOURCE_SRC_DIR}/Bpacking.hh" + "${ORC_SOURCE_SRC_DIR}/BpackingDefault.cc" + "${ORC_SOURCE_SRC_DIR}/BpackingDefault.hh" "${ORC_SOURCE_SRC_DIR}/ByteRLE.cc" + "${ORC_SOURCE_SRC_DIR}/ByteRLE.hh" + "${ORC_SOURCE_SRC_DIR}/CMakeLists.txt" "${ORC_SOURCE_SRC_DIR}/ColumnPrinter.cc" "${ORC_SOURCE_SRC_DIR}/ColumnReader.cc" + "${ORC_SOURCE_SRC_DIR}/ColumnReader.hh" "${ORC_SOURCE_SRC_DIR}/ColumnWriter.cc" + "${ORC_SOURCE_SRC_DIR}/ColumnWriter.hh" "${ORC_SOURCE_SRC_DIR}/Common.cc" "${ORC_SOURCE_SRC_DIR}/Compression.cc" + "${ORC_SOURCE_SRC_DIR}/Compression.hh" + "${ORC_SOURCE_SRC_DIR}/ConvertColumnReader.cc" + "${ORC_SOURCE_SRC_DIR}/ConvertColumnReader.hh" + "${ORC_SOURCE_SRC_DIR}/CpuInfoUtil.cc" + "${ORC_SOURCE_SRC_DIR}/CpuInfoUtil.hh" + "${ORC_SOURCE_SRC_DIR}/Dispatch.hh" + "${ORC_SOURCE_SRC_DIR}/Exceptions.cc" "${ORC_SOURCE_SRC_DIR}/Int128.cc" "${ORC_SOURCE_SRC_DIR}/LzoDecompressor.cc" + "${ORC_SOURCE_SRC_DIR}/LzoDecompressor.hh" "${ORC_SOURCE_SRC_DIR}/MemoryPool.cc" + "${ORC_SOURCE_SRC_DIR}/Murmur3.cc" + "${ORC_SOURCE_SRC_DIR}/Murmur3.hh" + "${ORC_SOURCE_SRC_DIR}/Options.hh" + "${ORC_SOURCE_SRC_DIR}/OrcFile.cc" "${ORC_SOURCE_SRC_DIR}/RLE.cc" + "${ORC_SOURCE_SRC_DIR}/RLE.hh" + "${ORC_SOURCE_SRC_DIR}/RLEV2Util.cc" + "${ORC_SOURCE_SRC_DIR}/RLEV2Util.hh" "${ORC_SOURCE_SRC_DIR}/RLEv1.cc" + "${ORC_SOURCE_SRC_DIR}/RLEv1.hh" + "${ORC_SOURCE_SRC_DIR}/RLEv2.hh" + "${ORC_SOURCE_SRC_DIR}/Reader.cc" + "${ORC_SOURCE_SRC_DIR}/Reader.hh" "${ORC_SOURCE_SRC_DIR}/RleDecoderV2.cc" "${ORC_SOURCE_SRC_DIR}/RleEncoderV2.cc" - "${ORC_SOURCE_SRC_DIR}/RLEV2Util.cc" + "${ORC_SOURCE_SRC_DIR}/SchemaEvolution.cc" + "${ORC_SOURCE_SRC_DIR}/SchemaEvolution.hh" "${ORC_SOURCE_SRC_DIR}/Statistics.cc" + "${ORC_SOURCE_SRC_DIR}/Statistics.hh" "${ORC_SOURCE_SRC_DIR}/StripeStream.cc" + "${ORC_SOURCE_SRC_DIR}/StripeStream.hh" "${ORC_SOURCE_SRC_DIR}/Timezone.cc" + "${ORC_SOURCE_SRC_DIR}/Timezone.hh" "${ORC_SOURCE_SRC_DIR}/TypeImpl.cc" + "${ORC_SOURCE_SRC_DIR}/TypeImpl.hh" + "${ORC_SOURCE_SRC_DIR}/Utils.hh" "${ORC_SOURCE_SRC_DIR}/Vector.cc" "${ORC_SOURCE_SRC_DIR}/Writer.cc" - "${ORC_SOURCE_SRC_DIR}/Adaptor.cc" - "${ORC_SOURCE_SRC_DIR}/BloomFilter.cc" - "${ORC_SOURCE_SRC_DIR}/Murmur3.cc" - "${ORC_SOURCE_SRC_DIR}/BlockBuffer.cc" - "${ORC_SOURCE_SRC_DIR}/wrap/orc-proto-wrapper.cc" "${ORC_SOURCE_SRC_DIR}/io/InputStream.cc" + "${ORC_SOURCE_SRC_DIR}/io/InputStream.hh" "${ORC_SOURCE_SRC_DIR}/io/OutputStream.cc" - "${ORC_ADDITION_SOURCE_DIR}/orc_proto.pb.cc" + "${ORC_SOURCE_SRC_DIR}/io/OutputStream.hh" + "${ORC_SOURCE_SRC_DIR}/sargs/ExpressionTree.cc" + "${ORC_SOURCE_SRC_DIR}/sargs/ExpressionTree.hh" + "${ORC_SOURCE_SRC_DIR}/sargs/Literal.cc" + "${ORC_SOURCE_SRC_DIR}/sargs/PredicateLeaf.cc" + "${ORC_SOURCE_SRC_DIR}/sargs/PredicateLeaf.hh" + "${ORC_SOURCE_SRC_DIR}/sargs/SargsApplier.cc" + "${ORC_SOURCE_SRC_DIR}/sargs/SargsApplier.hh" + "${ORC_SOURCE_SRC_DIR}/sargs/SearchArgument.cc" + "${ORC_SOURCE_SRC_DIR}/sargs/SearchArgument.hh" + "${ORC_SOURCE_SRC_DIR}/sargs/TruthValue.cc" ) add_library(_orc ${ORC_SRCS}) From 1f76d0874398f4985bb00dd16caf68a772c95502 Mon Sep 17 00:00:00 2001 From: Alexey Gerasimchuck Date: Wed, 14 Jun 2023 01:37:58 +0000 Subject: [PATCH 0849/1072] fixed --password "" --password issue --- programs/client/Client.cpp | 14 +++++++------- .../01317_no_password_in_command_line.reference | 1 + .../01317_no_password_in_command_line.sh | 2 ++ 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/programs/client/Client.cpp b/programs/client/Client.cpp index 32a07284d26..1e2696b4910 100644 --- a/programs/client/Client.cpp +++ b/programs/client/Client.cpp @@ -977,13 +977,7 @@ void Client::addOptions(OptionsDescription & options_description) ("connection", po::value(), "connection to use (from the client config), by default connection name is hostname") ("secure,s", "Use TLS connection") ("user,u", po::value()->default_value("default"), "user") - /** If "--password [value]" is used but the value is omitted, the bad argument exception will be thrown. - * implicit_value is used to avoid this exception (to allow user to type just "--password") - * Since currently boost provides no way to check if a value has been set implicitly for an option, - * the "\n" is used to distinguish this case because there is hardly a chance a user would use "\n" - * as the password. - */ - ("password", po::value()->implicit_value("\n", ""), "password") + ("password", po::value(), "password") ("ask-password", "ask-password") ("quota_key", po::value(), "A string to differentiate quotas when the user have keyed quotas configured on server") @@ -1391,6 +1385,12 @@ void Client::readArguments( arg = argv[arg_num]; addMultiquery(arg, common_arguments); } + else if (arg == "--password" && ((arg_num + 1) >= argc || std::string_view(argv[arg_num + 1]).starts_with('-'))) + { + common_arguments.emplace_back(arg); + // Add implicit value to the password. '\n' means client should ask user for password. + common_arguments.emplace_back("\n"); + } else common_arguments.emplace_back(arg); } diff --git a/tests/queries/0_stateless/01317_no_password_in_command_line.reference b/tests/queries/0_stateless/01317_no_password_in_command_line.reference index e69de29bb2d..8f2f637d5e3 100644 --- a/tests/queries/0_stateless/01317_no_password_in_command_line.reference +++ b/tests/queries/0_stateless/01317_no_password_in_command_line.reference @@ -0,0 +1 @@ +Bad arguments diff --git a/tests/queries/0_stateless/01317_no_password_in_command_line.sh b/tests/queries/0_stateless/01317_no_password_in_command_line.sh index 7f2e91201a3..fc5b8997636 100755 --- a/tests/queries/0_stateless/01317_no_password_in_command_line.sh +++ b/tests/queries/0_stateless/01317_no_password_in_command_line.sh @@ -45,3 +45,5 @@ ps u --no-header $bg_query | grep -F -- '--password' | grep -F hello ||: grep -F -- '--password' < "/proc/$bg_query/comm" | grep -F hello ||: $CLICKHOUSE_CLIENT --format Null --param_query_id "$query_id" -q "KILL QUERY WHERE query_id = {query_id:String} SYNC" wait + +$CLICKHOUSE_CLIENT --user "$user" --password=hello --password -q 'select currentUser()' 2>&1 | grep -o 'Bad arguments' From f3b99156aca66e5df89181d224ae05a28f80f257 Mon Sep 17 00:00:00 2001 From: kevinyhzou Date: Wed, 14 Jun 2023 10:48:21 +0800 Subject: [PATCH 0850/1072] review fix --- docs/en/interfaces/formats.md | 2 +- .../operations/settings/settings-formats.md | 8 ++-- src/Core/Settings.h | 2 +- src/Formats/FormatFactory.cpp | 2 +- src/Formats/FormatSettings.h | 2 +- .../Formats/Impl/CSVRowInputFormat.cpp | 38 ++++++++++--------- ...ext_with_whitespace_tab_field_delimiter.sh | 4 +- 7 files changed, 31 insertions(+), 27 deletions(-) diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index 57962c1d730..da1ba17cbb7 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -468,7 +468,7 @@ The CSV format supports the output of totals and extremes the same way as `TabSe - [input_format_csv_skip_first_lines](/docs/en/operations/settings/settings-formats.md/#input_format_csv_skip_first_lines) - skip the specified number of lines at the beginning of data. Default value - `0`. - [input_format_csv_detect_header](/docs/en/operations/settings/settings-formats.md/#input_format_csv_detect_header) - automatically detect header with names and types in CSV format. Default value - `true`. - [input_format_csv_trim_whitespaces](/docs/en/operations/settings/settings-formats.md/#input_format_csv_trim_whitespaces) - trim spaces and tabs in non-quoted CSV strings. Default value - `true`. -- [input_format_csv_use_whitespace_tab_as_delimiter](/docs/en/operations/settings/settings-formats.md/# input_format_csv_use_whitespace_tab_as_delimiter) - use whitespace or tab as field delimiter in CSV strings. Default value - `false`. +- [input_format_csv_allow_whitespace_or_tab_as_delimiter](/docs/en/operations/settings/settings-formats.md/# input_format_csv_allow_whitespace_or_tab_as_delimiter) - Allow to use whitespace or tab as field delimiter in CSV strings. Default value - `false`. ## CSVWithNames {#csvwithnames} diff --git a/docs/en/operations/settings/settings-formats.md b/docs/en/operations/settings/settings-formats.md index 0e30c8f319e..daf27622d3a 100644 --- a/docs/en/operations/settings/settings-formats.md +++ b/docs/en/operations/settings/settings-formats.md @@ -914,9 +914,9 @@ Result " string " ``` -### input_format_csv_use_whitespace_tab_as_delimiter {#input_format_csv_use_whitespace_tab_as_delimiter} +### input_format_csv_allow_whitespace_or_tab_as_delimiter {#input_format_csv_allow_whitespace_or_tab_as_delimiter} -Use whitespace or tab as field delimiter in CSV strings. +Allow to use whitespace or tab as field delimiter in CSV strings. Default value: `false`. @@ -925,7 +925,7 @@ Default value: `false`. Query ```bash -echo 'a b' | ./clickhouse local -q "select * from table FORMAT CSV" --input-format="CSV" --input_format_csv_use_whitespace_tab_as_delimiter=true --format_csv_delimiter=' ' +echo 'a b' | ./clickhouse local -q "select * from table FORMAT CSV" --input-format="CSV" --input_format_csv_allow_whitespace_or_tab_as_delimiter=true --format_csv_delimiter=' ' ``` Result @@ -937,7 +937,7 @@ a b Query ```bash -echo 'a b' | ./clickhouse local -q "select * from table FORMAT CSV" --input-format="CSV" --input_format_csv_use_whitespace_tab_as_delimiter=true --format_csv_delimiter='\t' +echo 'a b' | ./clickhouse local -q "select * from table FORMAT CSV" --input-format="CSV" --input_format_csv_allow_whitespace_or_tab_as_delimiter=true --format_csv_delimiter='\t' ``` Result diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 4306ac855a3..1d889d8b0c3 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -850,7 +850,7 @@ class IColumn; M(Bool, input_format_csv_use_best_effort_in_schema_inference, true, "Use some tweaks and heuristics to infer schema in CSV format", 0) \ M(Bool, input_format_tsv_use_best_effort_in_schema_inference, true, "Use some tweaks and heuristics to infer schema in TSV format", 0) \ M(Bool, input_format_csv_detect_header, true, "Automatically detect header with names and types in CSV format", 0) \ - M(Bool, input_format_csv_use_whitespace_tab_as_delimiter, false, "Use spaces and tabs(\\t) as field delimiter in the CSV strings", 0) \ + M(Bool, input_format_csv_allow_whitespace_or_tab_as_delimiter, false, "Allow to use spaces and tabs(\\t) as field delimiter in the CSV strings", 0) \ M(Bool, input_format_csv_trim_whitespaces, true, "Trims spaces and tabs (\\t) characters at the beginning and end in CSV strings", 0) \ M(Bool, input_format_tsv_detect_header, true, "Automatically detect header with names and types in TSV format", 0) \ M(Bool, input_format_custom_detect_header, true, "Automatically detect header with names and types in CustomSeparated format", 0) \ diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index 33ecddfc223..81528937b13 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -70,7 +70,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.csv.skip_first_lines = settings.input_format_csv_skip_first_lines; format_settings.csv.try_detect_header = settings.input_format_csv_detect_header; format_settings.csv.trim_whitespaces = settings.input_format_csv_trim_whitespaces; - format_settings.csv.use_whitespace_tab_as_delimiter = settings.input_format_csv_use_whitespace_tab_as_delimiter; + format_settings.csv.allow_whitespace_or_tab_as_delimiter = settings.input_format_csv_allow_whitespace_or_tab_as_delimiter; format_settings.hive_text.fields_delimiter = settings.input_format_hive_text_fields_delimiter; format_settings.hive_text.collection_items_delimiter = settings.input_format_hive_text_collection_items_delimiter; format_settings.hive_text.map_keys_delimiter = settings.input_format_hive_text_map_keys_delimiter; diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index 72d60e8423e..5dc1a14a12c 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -137,7 +137,7 @@ struct FormatSettings String custom_delimiter; bool try_detect_header = true; bool trim_whitespaces = true; - bool use_whitespace_tab_as_delimiter = false; + bool allow_whitespace_or_tab_as_delimiter = false; } csv; struct HiveText diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp index b8d3413f863..181949b3bb7 100644 --- a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp @@ -25,10 +25,14 @@ namespace ErrorCodes namespace { - void checkBadDelimiter(char delimiter, bool use_whitespace_tab_as_delimiter) + void checkBadDelimiter(char delimiter, bool allow_whitespace_or_tab_as_delimiter) { + if ((delimiter == ' ' || delimiter == '\t') && allow_whitespace_or_tab_as_delimiter) + { + return; + } constexpr std::string_view bad_delimiters = " \t\"'.UL"; - if (bad_delimiters.find(delimiter) != std::string_view::npos && !use_whitespace_tab_as_delimiter) + if (bad_delimiters.find(delimiter) != std::string_view::npos) throw Exception( ErrorCodes::BAD_ARGUMENTS, "CSV format may not work correctly with delimiter '{}'. Try use CustomSeparated format instead", @@ -68,7 +72,7 @@ CSVRowInputFormat::CSVRowInputFormat( format_settings_.csv.try_detect_header), buf(std::move(in_)) { - checkBadDelimiter(format_settings_.csv.delimiter, format_settings_.csv.use_whitespace_tab_as_delimiter); + checkBadDelimiter(format_settings_.csv.delimiter, format_settings_.csv.allow_whitespace_or_tab_as_delimiter); } CSVRowInputFormat::CSVRowInputFormat( @@ -90,7 +94,7 @@ CSVRowInputFormat::CSVRowInputFormat( format_settings_.csv.try_detect_header), buf(std::move(in_)) { - checkBadDelimiter(format_settings_.csv.delimiter, format_settings_.csv.use_whitespace_tab_as_delimiter); + checkBadDelimiter(format_settings_.csv.delimiter, format_settings_.csv.allow_whitespace_or_tab_as_delimiter); } void CSVRowInputFormat::syncAfterError() @@ -134,9 +138,9 @@ static void skipEndOfLine(ReadBuffer & in) } /// Skip `whitespace` symbols allowed in CSV. -static inline void skipWhitespacesAndTabs(ReadBuffer & in, const bool & use_whitespace_tab_as_delimiter) +static inline void skipWhitespacesAndTabs(ReadBuffer & in, const bool & allow_whitespace_or_tab_as_delimiter) { - if (use_whitespace_tab_as_delimiter) + if (allow_whitespace_or_tab_as_delimiter) { return; } @@ -150,7 +154,7 @@ CSVFormatReader::CSVFormatReader(PeekableReadBuffer & buf_, const FormatSettings void CSVFormatReader::skipFieldDelimiter() { - skipWhitespacesAndTabs(*buf, format_settings.csv.use_whitespace_tab_as_delimiter); + skipWhitespacesAndTabs(*buf, format_settings.csv.allow_whitespace_or_tab_as_delimiter); assertChar(format_settings.csv.delimiter, *buf); } @@ -158,7 +162,7 @@ template String CSVFormatReader::readCSVFieldIntoString() { if (format_settings.csv.trim_whitespaces) [[likely]] - skipWhitespacesAndTabs(*buf, format_settings.csv.use_whitespace_tab_as_delimiter); + skipWhitespacesAndTabs(*buf, format_settings.csv.allow_whitespace_or_tab_as_delimiter); String field; if constexpr (read_string) @@ -170,14 +174,14 @@ String CSVFormatReader::readCSVFieldIntoString() void CSVFormatReader::skipField() { - skipWhitespacesAndTabs(*buf, format_settings.csv.use_whitespace_tab_as_delimiter); + skipWhitespacesAndTabs(*buf, format_settings.csv.allow_whitespace_or_tab_as_delimiter); NullOutput out; readCSVStringInto(out, *buf, format_settings.csv); } void CSVFormatReader::skipRowEndDelimiter() { - skipWhitespacesAndTabs(*buf, format_settings.csv.use_whitespace_tab_as_delimiter); + skipWhitespacesAndTabs(*buf, format_settings.csv.allow_whitespace_or_tab_as_delimiter); if (buf->eof()) return; @@ -186,7 +190,7 @@ void CSVFormatReader::skipRowEndDelimiter() if (*buf->position() == format_settings.csv.delimiter) ++buf->position(); - skipWhitespacesAndTabs(*buf, format_settings.csv.use_whitespace_tab_as_delimiter); + skipWhitespacesAndTabs(*buf, format_settings.csv.allow_whitespace_or_tab_as_delimiter); if (buf->eof()) return; @@ -198,7 +202,7 @@ void CSVFormatReader::skipHeaderRow() do { skipField(); - skipWhitespacesAndTabs(*buf, format_settings.csv.use_whitespace_tab_as_delimiter); + skipWhitespacesAndTabs(*buf, format_settings.csv.allow_whitespace_or_tab_as_delimiter); } while (checkChar(format_settings.csv.delimiter, *buf)); skipRowEndDelimiter(); @@ -211,7 +215,7 @@ std::vector CSVFormatReader::readRowImpl() do { fields.push_back(readCSVFieldIntoString()); - skipWhitespacesAndTabs(*buf, format_settings.csv.use_whitespace_tab_as_delimiter); + skipWhitespacesAndTabs(*buf, format_settings.csv.allow_whitespace_or_tab_as_delimiter); } while (checkChar(format_settings.csv.delimiter, *buf)); skipRowEndDelimiter(); @@ -224,7 +228,7 @@ bool CSVFormatReader::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) try { - skipWhitespacesAndTabs(*buf, format_settings.csv.use_whitespace_tab_as_delimiter); + skipWhitespacesAndTabs(*buf, format_settings.csv.allow_whitespace_or_tab_as_delimiter); assertChar(delimiter, *buf); } catch (const DB::Exception &) @@ -250,7 +254,7 @@ bool CSVFormatReader::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) bool CSVFormatReader::parseRowEndWithDiagnosticInfo(WriteBuffer & out) { - skipWhitespacesAndTabs(*buf, format_settings.csv.use_whitespace_tab_as_delimiter); + skipWhitespacesAndTabs(*buf, format_settings.csv.allow_whitespace_or_tab_as_delimiter); if (buf->eof()) return true; @@ -259,7 +263,7 @@ bool CSVFormatReader::parseRowEndWithDiagnosticInfo(WriteBuffer & out) if (*buf->position() == format_settings.csv.delimiter) { ++buf->position(); - skipWhitespacesAndTabs(*buf, format_settings.csv.use_whitespace_tab_as_delimiter); + skipWhitespacesAndTabs(*buf, format_settings.csv.allow_whitespace_or_tab_as_delimiter); if (buf->eof()) return true; } @@ -287,7 +291,7 @@ bool CSVFormatReader::readField( const String & /*column_name*/) { if (format_settings.csv.trim_whitespaces || !isStringOrFixedString(removeNullable(type))) [[likely]] - skipWhitespacesAndTabs(*buf, format_settings.csv.use_whitespace_tab_as_delimiter); + skipWhitespacesAndTabs(*buf, format_settings.csv.allow_whitespace_or_tab_as_delimiter); const bool at_delimiter = !buf->eof() && *buf->position() == format_settings.csv.delimiter; const bool at_last_column_line_end = is_last_file_column && (buf->eof() || *buf->position() == '\n' || *buf->position() == '\r'); diff --git a/tests/queries/0_stateless/02785_text_with_whitespace_tab_field_delimiter.sh b/tests/queries/0_stateless/02785_text_with_whitespace_tab_field_delimiter.sh index deb6e317aac..6fca95cb839 100755 --- a/tests/queries/0_stateless/02785_text_with_whitespace_tab_field_delimiter.sh +++ b/tests/queries/0_stateless/02785_text_with_whitespace_tab_field_delimiter.sh @@ -10,8 +10,8 @@ $CLICKHOUSE_CLIENT -q "drop table if exists test_whitespace" $CLICKHOUSE_CLIENT -q "drop table if exists test_tab" $CLICKHOUSE_CLIENT -q "create table test_whitespace (x UInt32, y String, z String) engine=MergeTree order by x" $CLICKHOUSE_CLIENT -q "create table test_tab (x UInt32, y String, z String) engine=MergeTree order by x" -cat $CURDIR/data_csv/csv_with_space_delimiter.csv | ${CLICKHOUSE_CLIENT} -q "INSERT INTO test_whitespace SETTINGS format_csv_delimiter=' ', input_format_csv_use_whitespace_tab_as_delimiter=true FORMAT CSV" -cat $CURDIR/data_csv/csv_with_tab_delimiter.csv | ${CLICKHOUSE_CLIENT} -q "INSERT INTO test_tab SETTINGS format_csv_delimiter='\t', input_format_csv_use_whitespace_tab_as_delimiter=true FORMAT CSV" +cat $CURDIR/data_csv/csv_with_space_delimiter.csv | ${CLICKHOUSE_CLIENT} -q "INSERT INTO test_whitespace SETTINGS format_csv_delimiter=' ', input_format_csv_allow_whitespace_or_tab_as_delimiter=true FORMAT CSV" +cat $CURDIR/data_csv/csv_with_tab_delimiter.csv | ${CLICKHOUSE_CLIENT} -q "INSERT INTO test_tab SETTINGS format_csv_delimiter='\t', input_format_csv_allow_whitespace_or_tab_as_delimiter=true FORMAT CSV" $CLICKHOUSE_CLIENT -q "select * from test_whitespace" $CLICKHOUSE_CLIENT -q "select * from test_tab" $CLICKHOUSE_CLIENT -q "drop table test_whitespace" From 6ffdfb8b6b8656dfb2ef004349a3cad82dd03e1f Mon Sep 17 00:00:00 2001 From: santrancisco Date: Wed, 14 Jun 2023 13:29:05 +1000 Subject: [PATCH 0851/1072] test removing CpuInfoUtil.cc and see if build breaks :p --- contrib/arrow-cmake/CMakeLists.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/contrib/arrow-cmake/CMakeLists.txt b/contrib/arrow-cmake/CMakeLists.txt index 5fe942d1cd0..01e9fc5fca9 100644 --- a/contrib/arrow-cmake/CMakeLists.txt +++ b/contrib/arrow-cmake/CMakeLists.txt @@ -139,8 +139,6 @@ set(ORC_SRCS "${ORC_SOURCE_SRC_DIR}/Compression.hh" "${ORC_SOURCE_SRC_DIR}/ConvertColumnReader.cc" "${ORC_SOURCE_SRC_DIR}/ConvertColumnReader.hh" - "${ORC_SOURCE_SRC_DIR}/CpuInfoUtil.cc" - "${ORC_SOURCE_SRC_DIR}/CpuInfoUtil.hh" "${ORC_SOURCE_SRC_DIR}/Dispatch.hh" "${ORC_SOURCE_SRC_DIR}/Exceptions.cc" "${ORC_SOURCE_SRC_DIR}/Int128.cc" From 868c3bd45d8585b03a9afce76e0e7466e675c420 Mon Sep 17 00:00:00 2001 From: Alexey Gerasimchuck Date: Wed, 14 Jun 2023 04:29:08 +0000 Subject: [PATCH 0852/1072] minor change --- src/Client/ConnectionString.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Client/ConnectionString.cpp b/src/Client/ConnectionString.cpp index 62090487490..f4a4e73c198 100644 --- a/src/Client/ConnectionString.cpp +++ b/src/Client/ConnectionString.cpp @@ -45,7 +45,7 @@ std::string uriDecode(const std::string & uri_encoded_string, bool plus_as_space void getHostAndPort(const Poco::URI & uri, std::vector> & hosts_and_ports_arguments) { std::vector host_and_port; - const auto& host = uri.getHost(); + const std::string & host = uri.getHost(); if (!host.empty()) { host_and_port.push_back("--host=" + uriDecode(host, false)); From e281026e0085048e8d35d8cca78cc346501ee974 Mon Sep 17 00:00:00 2001 From: Chang Chen Date: Wed, 14 Jun 2023 12:29:55 +0800 Subject: [PATCH 0853/1072] fix build issue on clang 15 --- src/Formats/CapnProtoSerializer.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Formats/CapnProtoSerializer.cpp b/src/Formats/CapnProtoSerializer.cpp index b306cca4f94..8373c95599f 100644 --- a/src/Formats/CapnProtoSerializer.cpp +++ b/src/Formats/CapnProtoSerializer.cpp @@ -751,7 +751,7 @@ namespace private: using Reader = typename CapnpType::Reader; - CapnpType::Reader getData(const ColumnPtr & column, size_t row_num) + typename CapnpType::Reader getData(const ColumnPtr & column, size_t row_num) { auto data = column->getDataAt(row_num); if constexpr (std::is_same_v) @@ -801,7 +801,7 @@ namespace private: using Reader = typename CapnpType::Reader; - CapnpType::Reader getData(const ColumnPtr & column, size_t row_num) + typename CapnpType::Reader getData(const ColumnPtr & column, size_t row_num) { auto data = column->getDataAt(row_num); if constexpr (std::is_same_v) From 4db8fa39c7904661a7aac6aa62ee4f0e44092369 Mon Sep 17 00:00:00 2001 From: Alexey Gerasimchuck Date: Wed, 14 Jun 2023 04:38:46 +0000 Subject: [PATCH 0854/1072] Removed extra lines --- docs/en/interfaces/cli.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/docs/en/interfaces/cli.md b/docs/en/interfaces/cli.md index e2c7dc1e608..8779dd1a544 100644 --- a/docs/en/interfaces/cli.md +++ b/docs/en/interfaces/cli.md @@ -228,9 +228,6 @@ clickhouse://[2001:db8::1234] URI allows multiple hosts to be connected to. Connection strings can contain multiple hosts. ClickHouse-client will try to connect to these hosts in order (i.e. from left to right). After the connection is established, no attempt to connect to the remaining hosts is made. - - - The connection string must be specified as the first argument of clickhouse-client. The connection string can be combined with arbitrary other [command-line-options](#command-line-options) except `--host/-h` and `--port`. The following keys are allowed for component `query_parameter`: From 08cd94e826ce6af55135517d8abcbc72f4b270fb Mon Sep 17 00:00:00 2001 From: Derek Chia Date: Wed, 14 Jun 2023 12:57:50 +0800 Subject: [PATCH 0855/1072] Update delete.md LWD is not supported in table with projection --- docs/en/sql-reference/statements/delete.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/en/sql-reference/statements/delete.md b/docs/en/sql-reference/statements/delete.md index fa9f08e225f..5522c50d624 100644 --- a/docs/en/sql-reference/statements/delete.md +++ b/docs/en/sql-reference/statements/delete.md @@ -55,6 +55,9 @@ With the described implementation now we can see what can negatively affect 'DEL - Table having a very large number of data parts - Having a lot of data in Compact parts—in a Compact part, all columns are stored in one file. +:::note +Lightweight delete does not work for tables with projection as rows in projection may be affected and require the projection to be rebuilt. Rebuilding projection makes the deletion not lightweight, so this is not supported. +::: ## Related content From f74c585426f0f6dd7f2ce440193cda2c43a9f9d9 Mon Sep 17 00:00:00 2001 From: Alexander Gololobov <440544+davenger@users.noreply.github.com> Date: Wed, 14 Jun 2023 07:46:11 +0200 Subject: [PATCH 0856/1072] Typos --- .../0_stateless/02784_projections_read_in_order_bug.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/02784_projections_read_in_order_bug.sql b/tests/queries/0_stateless/02784_projections_read_in_order_bug.sql index 9595fc9ae08..6bf287a3d77 100644 --- a/tests/queries/0_stateless/02784_projections_read_in_order_bug.sql +++ b/tests/queries/0_stateless/02784_projections_read_in_order_bug.sql @@ -37,8 +37,8 @@ create table events ( timestamp )) engine = MergeTree order by (organisation_id, session_id, timestamp) settings index_granularity = 3; -insert into events values (reinterpretAsUUID(0), reinterpretAsUUID(1), reinterpretAsUUID(0), toDateTime('2022-02-02', 'UTC')(), toString(0), reinterpretAsUUID(0), toString(0)), (reinterpretAsUUID(0), reinterpretAsUUID(1), reinterpretAsUUID(0), toDateTime('2022-02-02', 'UTC')(), toString(0), reinterpretAsUUID(0), toString(0)), (reinterpretAsUUID(1), reinterpretAsUUID(0), reinterpretAsUUID(0), toDateTime('2022-02-02', 'UTC')(), toString(0), reinterpretAsUUID(0), toString(0)), (reinterpretAsUUID(1), reinterpretAsUUID(0), reinterpretAsUUID(0), toDateTime('2022-02-02', 'UTC')(), toString(0), reinterpretAsUUID(0), toString(0)), (reinterpretAsUUID(3), reinterpretAsUUID(2), reinterpretAsUUID(0), toDateTime('2022-02-02', 'UTC')(), toString(0), reinterpretAsUUID(0), toString(0)), (reinterpretAsUUID(3), reinterpretAsUUID(2), reinterpretAsUUID(0), toDateTime('2022-02-02', 'UTC')(), toString(0), reinterpretAsUUID(0), toString(0)); -insert into events values (reinterpretAsUUID(0), reinterpretAsUUID(1), reinterpretAsUUID(0), toDateTime('2022-02-02', 'UTC')(), toString(0), reinterpretAsUUID(0), toString(0)), (reinterpretAsUUID(0), reinterpretAsUUID(1), reinterpretAsUUID(0), toDateTime('2022-02-02', 'UTC')(), toString(0), reinterpretAsUUID(0), toString(0)), (reinterpretAsUUID(1), reinterpretAsUUID(0), reinterpretAsUUID(0), toDateTime('2022-02-02', 'UTC')(), toString(0), reinterpretAsUUID(0), toString(0)), (reinterpretAsUUID(1), reinterpretAsUUID(0), reinterpretAsUUID(0), toDateTime('2022-02-02', 'UTC')(), toString(0), reinterpretAsUUID(0), toString(0)), (reinterpretAsUUID(3), reinterpretAsUUID(2), reinterpretAsUUID(0), toDateTime('2022-02-02', 'UTC')(), toString(0), reinterpretAsUUID(0), toString(0)), (reinterpretAsUUID(3), reinterpretAsUUID(2), reinterpretAsUUID(0), toDateTime('2022-02-02', 'UTC')(), toString(0), reinterpretAsUUID(0), toString(0)); +insert into events values (reinterpretAsUUID(0), reinterpretAsUUID(1), reinterpretAsUUID(0), toDateTime('2022-02-02', 'UTC'), toString(0), reinterpretAsUUID(0), toString(0)), (reinterpretAsUUID(0), reinterpretAsUUID(1), reinterpretAsUUID(0), toDateTime('2022-02-02', 'UTC'), toString(0), reinterpretAsUUID(0), toString(0)), (reinterpretAsUUID(1), reinterpretAsUUID(0), reinterpretAsUUID(0), toDateTime('2022-02-02', 'UTC'), toString(0), reinterpretAsUUID(0), toString(0)), (reinterpretAsUUID(1), reinterpretAsUUID(0), reinterpretAsUUID(0), toDateTime('2022-02-02', 'UTC'), toString(0), reinterpretAsUUID(0), toString(0)), (reinterpretAsUUID(3), reinterpretAsUUID(2), reinterpretAsUUID(0), toDateTime('2022-02-02', 'UTC'), toString(0), reinterpretAsUUID(0), toString(0)), (reinterpretAsUUID(3), reinterpretAsUUID(2), reinterpretAsUUID(0), toDateTime('2022-02-02', 'UTC'), toString(0), reinterpretAsUUID(0), toString(0)); +insert into events values (reinterpretAsUUID(0), reinterpretAsUUID(1), reinterpretAsUUID(0), toDateTime('2022-02-02', 'UTC'), toString(0), reinterpretAsUUID(0), toString(0)), (reinterpretAsUUID(0), reinterpretAsUUID(1), reinterpretAsUUID(0), toDateTime('2022-02-02', 'UTC'), toString(0), reinterpretAsUUID(0), toString(0)), (reinterpretAsUUID(1), reinterpretAsUUID(0), reinterpretAsUUID(0), toDateTime('2022-02-02', 'UTC'), toString(0), reinterpretAsUUID(0), toString(0)), (reinterpretAsUUID(1), reinterpretAsUUID(0), reinterpretAsUUID(0), toDateTime('2022-02-02', 'UTC'), toString(0), reinterpretAsUUID(0), toString(0)), (reinterpretAsUUID(3), reinterpretAsUUID(2), reinterpretAsUUID(0), toDateTime('2022-02-02', 'UTC'), toString(0), reinterpretAsUUID(0), toString(0)), (reinterpretAsUUID(3), reinterpretAsUUID(2), reinterpretAsUUID(0), toDateTime('2022-02-02', 'UTC'), toString(0), reinterpretAsUUID(0), toString(0)); set read_in_order_two_level_merge_threshold=1; SELECT id, timestamp, payload FROM events WHERE (organisation_id = reinterpretAsUUID(1)) AND (session_id = reinterpretAsUUID(0)) ORDER BY timestamp, payload, id ASC; From 86694847c66a170781e5a561940c05e0221966d0 Mon Sep 17 00:00:00 2001 From: Chang Chen Date: Wed, 14 Jun 2023 15:22:32 +0800 Subject: [PATCH 0857/1072] using Reader instead of typename CapnpType::Reader --- src/Formats/CapnProtoSerializer.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Formats/CapnProtoSerializer.cpp b/src/Formats/CapnProtoSerializer.cpp index 8373c95599f..6f7254ab2aa 100644 --- a/src/Formats/CapnProtoSerializer.cpp +++ b/src/Formats/CapnProtoSerializer.cpp @@ -751,7 +751,7 @@ namespace private: using Reader = typename CapnpType::Reader; - typename CapnpType::Reader getData(const ColumnPtr & column, size_t row_num) + Reader getData(const ColumnPtr & column, size_t row_num) { auto data = column->getDataAt(row_num); if constexpr (std::is_same_v) @@ -801,7 +801,7 @@ namespace private: using Reader = typename CapnpType::Reader; - typename CapnpType::Reader getData(const ColumnPtr & column, size_t row_num) + Reader getData(const ColumnPtr & column, size_t row_num) { auto data = column->getDataAt(row_num); if constexpr (std::is_same_v) From b1f0a91b788e2ef89cd5848168e8357d6da13f0a Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Wed, 14 Jun 2023 07:48:08 +0000 Subject: [PATCH 0858/1072] Docs: Fix embedded video link --- docs/en/engines/table-engines/mergetree-family/annindexes.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/engines/table-engines/mergetree-family/annindexes.md b/docs/en/engines/table-engines/mergetree-family/annindexes.md index d8a0193ff66..80e47e76ce0 100644 --- a/docs/en/engines/table-engines/mergetree-family/annindexes.md +++ b/docs/en/engines/table-engines/mergetree-family/annindexes.md @@ -152,7 +152,7 @@ This type of ANN index implements [the Annoy algorithm](https://github.com/spoti space in random linear surfaces (lines in 2D, planes in 3D etc.).
-